def train_model(data, topic, PROCESSED_DIR, SEED_FOLDER, **kwargs):
    dropout = kwargs['model_settings']["dropout"]
    lstm_size = kwargs['model_settings']["lstm_size"]
    monitor = kwargs['model_settings']["monitor"]
    batch_size = kwargs['model_settings']["batch_size"]
    epochs = kwargs['model_settings']["epochs"]
    learning_rate = kwargs['model_settings']["learning_rate"]
    train_embeddings = kwargs['model_settings']["train_embeddings"]
    return_probs = False
    return_model = False
    model_file = SEED_FOLDER+topic+"_"+kwargs['model_settings']["model_file_suffix"]
    seed = kwargs['model_settings']['current_seed']

    # set configs for memory usage and reproducibility: https://stackoverflow.com/questions/38469632/tensorflow-non-repeatable-results
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    rn.seed(seed)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = False
    config.gpu_options.per_process_gpu_memory_fraction = 0.3
    graph_level_seed = 1
    operation_level_seed = 1
    tf.set_random_seed(graph_level_seed)
    sess = tf.Session(config=config)
    K.set_session(sess)

    # load vocab we and get indices for topic
    vocab_we = load_from_pickle(PROCESSED_DIR+"vocab_we.pkl")

    # load word embeddings
    embeddings_lookup = np.load(PROCESSED_DIR + "index_to_vec_we"+kwargs['model_settings']['word_embeddings'][1]+".npy")

    # load data
    X_train, X_dev, X_test = data["X_train"], data["X_dev"], data["X_test"]
    y_train, y_dev, y_test = data["y_train"], data["y_dev"], data["y_test"]
    
    # generate topic data
    topic_avg_emb = get_avg_embedding(topic.split('_'), embeddings_lookup, vocab_we)
    data['X_topic_train'] = [topic_avg_emb] * len(data['X_train'])
    data['X_topic_dev'] = [topic_avg_emb] * len(data['X_dev'])
    data['X_topic_test'] = [topic_avg_emb] * len(data['X_test'])
    
    X_topic_train, X_topic_dev, X_topic_test = data["X_topic_train"], data["X_topic_dev"], data["X_topic_test"]
    
    # some constants
    sent_len = X_train.shape[1]
    num_labels = y_train.shape[1]

    sentence_input = Input(shape=(sent_len,), dtype='int32', name="text_input")
    gate_vector_input = Input(shape=(300,), dtype='float32', name="gate_vectors_each_sentence")  # hard-coded to 300-dim topic vectors
    embedded_layer = Embedding(embeddings_lookup.shape[0], embeddings_lookup.shape[1], mask_zero=True,
                               trainable=train_embeddings, input_length=sent_len,
                               weights=[embeddings_lookup])(sentence_input)


    bilstm_layer = Bidirectional(custom_LSTM_fo(lstm_size))([embedded_layer, gate_vector_input])


    dropout_layer = Dropout(dropout)(bilstm_layer)
    output_layer = Dense(num_labels, activation='softmax')(dropout_layer)
    model = Model(inputs=[sentence_input, gate_vector_input], outputs=output_layer)

    adam = Adam(lr=learning_rate)
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

    #e = EarlyStopping(monitor=monitor, mode='auto')
    e = ModelCheckpoint(model_file, monitor=monitor, verbose=0, save_best_only=True, save_weights_only=True,
                        mode='auto', period=1)
    model.fit([X_train, X_topic_train], y_train, batch_size=batch_size, epochs=epochs,
              validation_data=([X_dev, X_topic_dev], y_dev), callbacks=[e], verbose=1)
    model.load_weights(model_file)

    if return_model:
        return model
    else:
        test_predictions = model.predict([X_test, X_topic_test], verbose=False)
        val_predictions = model.predict([X_dev, X_topic_dev], verbose=False)
        if not return_probs:
            test_predictions = [np.argmax(pred) for pred in test_predictions]
            val_predictions = [np.argmax(pred) for pred in val_predictions]
        return test_predictions, val_predictions
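The keys read from kwargs['model_settings'] above define the calling contract shared by all of these examples. A minimal, hypothetical invocation sketch (dictionary values, paths, and the contents of data are placeholders, not taken from the original project) could look like this:

model_settings = {
    "dropout": 0.3, "lstm_size": 32, "monitor": "val_loss",
    "batch_size": 32, "epochs": 20, "learning_rate": 0.001,
    "train_embeddings": False, "model_file_suffix": "weights.hdf5",
    "current_seed": 0, "word_embeddings": ("glove", "_300d"),
}
# data is assumed to hold padded index matrices and one-hot labels:
#   X_train/X_dev/X_test -> int32 arrays of shape [samples, sent_len]
#   y_train/y_dev/y_test -> float arrays of shape [samples, num_labels]
test_preds, val_preds = train_model(data, "death_penalty",
                                    PROCESSED_DIR="processed/",
                                    SEED_FOLDER="results/seed_0/",
                                    model_settings=model_settings)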
Example #2
def train_model(data, topic, PROCESSED_DIR, SEED_FOLDER, **kwargs):
    dropout = kwargs['model_settings']["dropout"]
    lstm_size = kwargs['model_settings']["lstm_size"]
    monitor = kwargs['model_settings']["monitor"]
    batch_size = kwargs['model_settings']["batch_size"]
    epochs = kwargs['model_settings']["epochs"]
    learning_rate = kwargs['model_settings']["learning_rate"]
    train_embeddings = kwargs['model_settings']["train_embeddings"]
    # model file eg: 'results/only_sub_and_inst/model_runs/EvLSTM/seed_0/death_penalty_threelabel_crossdomain_monitor-f1_macro_do-0.3_lsize-32_bs-32_epochs-20_lr-0.001_trainemb-False_kl-only_sub_and_inst'
    model_file = SEED_FOLDER+topic+"_"+kwargs['model_settings']["model_file_suffix"]
    seed = kwargs['model_settings']['current_seed']

    # clear default graph (new model now)
    #tf.reset_default_graph()

    # set configs for memory usage and reproducibility: https://stackoverflow.com/questions/38469632/tensorflow-non-repeatable-results
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    rn.seed(seed)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = False
    config.gpu_options.per_process_gpu_memory_fraction = 0.3
    #graph_level_seed = seed
    operation_level_seed = seed
    #tf.set_random_seed(graph_level_seed)

    # load embeddings
    emb_sents = np.load(PROCESSED_DIR + "index_to_vec_we"+kwargs['model_settings']['word_embeddings'][1]+".npy")
    emb_knowledge = np.load(PROCESSED_DIR + "index_to_vec_kge"+kwargs['model_settings']['kg_embeddings'][1]+".npy")

    # load data
    X_train, X_dev, X_test = data["X_train"], data["X_dev"], data["X_test"] # [samples, sent_len]
    kX_train, kX_dev, kX_test = data["kX_train"], data["kX_dev"], data["kX_test"] # [samples, sent_len, max_concepts]
    y_train, y_dev, y_test = data["y_train"], data["y_dev"], data["y_test"]

    # load vocab we and get indices for topic
    vocab_we = load_from_pickle(PROCESSED_DIR + "vocab_we.pkl")

    # generate topic data
    X_topic_train = get_avg_embedding_for_topic_list(data['X_topic_train'], emb_sents, vocab_we)
    X_topic_dev = get_avg_embedding_for_topic_list(data['X_topic_dev'], emb_sents, vocab_we)
    X_topic_test = get_avg_embedding_for_topic_list(data['X_topic_test'], emb_sents, vocab_we)

    # some constants
    sent_len = X_train.shape[1]
    max_concepts = kX_train.shape[2]
    num_labels = y_train.shape[1]
    attention_size = kwargs['model_settings'].get('attention_size', emb_sents.shape[1])
    topic_vector_dim = emb_sents.shape[1]

    ############################
    #   KNOWLEDGE PROCESSING   #
    ############################

    # input for all concepts of a sentence
    sentence_inputs = Input(shape=(sent_len, ), dtype='int32', name="sentence_inputs")
    topic_vector_input = Input(shape=(emb_sents.shape[1],), dtype='float32', name="topic_vector_sent_wise")
    knowledge_inputs = Input(shape=(sent_len, max_concepts,), dtype='int32', name="knowledge_inputs")

    emb_knowledge_ids = Embedding(emb_knowledge.shape[0], emb_knowledge.shape[1], mask_zero=True,
                               weights=[emb_knowledge], trainable=train_embeddings)(knowledge_inputs) # [samples, sent_len, max_concepts, kge_dim]

    embedded_word_ids = Embedding(emb_sents.shape[0], emb_sents.shape[1], mask_zero=True,
                               weights=[emb_sents], trainable=train_embeddings,
                               input_length=sent_len)(sentence_inputs) # [samples, sent_len, we_dim]

    attended_knowledge = attention_knowledge(embedded_word_ids, topic_vector_input, attention_size, return_alphas=False, summed_up=True)(emb_knowledge_ids)

    concat_sequences = Lambda(lambda x: tf.concat([x[0], x[1]], axis=-1))([embedded_word_ids, attended_knowledge])

    # define bilstm + dropout => input_shape=(None, sent_len, attention_size+emb_sents.shape[1])
    sent_bilstm = Bidirectional(LSTM(lstm_size))(concat_sequences)
    sent_bilstm_dropout = Dropout(dropout)(sent_bilstm)

    output_layer = Dense(num_labels, activation='softmax')(sent_bilstm_dropout)

    model = Model(inputs=[sentence_inputs, knowledge_inputs, topic_vector_input], outputs=output_layer)

    adam = Adam(lr=learning_rate)
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

    #e = EarlyStopping(monitor=monitor, mode='auto')
    e = ModelCheckpoint(model_file, monitor=monitor, verbose=0, save_best_only=True, save_weights_only=True,
                        mode='auto', period=1)
    model.fit([X_train, kX_train, X_topic_train], y_train, batch_size=batch_size, epochs=epochs,
              validation_data=([X_dev, kX_dev, X_topic_dev], y_dev), callbacks=[e], verbose=1)
    model.load_weights(model_file)

    y_pred_test = model.predict([X_test, kX_test, X_topic_test], verbose=False)
    y_pred_dev = model.predict([X_dev, kX_dev, X_topic_dev], verbose=False)

    return [np.argmax(pred) for pred in y_pred_test], [np.argmax(pred) for pred in y_pred_dev]

    # NOTE: everything below this point is unreachable scratch code from development
    # (it sits after the return above and references names that are not defined here).
    # attention
    # initial_outputs = tf.TensorArray(dtype=tf.float32, size=sent_len, element_shape=(None, 100), infer_shape=False)
    # initial_outputs = tf.Variable([])

    # def slice_t(x):
    #   return x[:, 0,:,:]

    # def slice_zero(x):
    #    return x[:,0,:]

    # slize_t_layer = Lambda(slice_t)
    # slize_zero_layer = Lambda(slice_zero)
    """
    @autograph.convert()
    def loop(tensor):
        initial_outputs = []
        for t in range(sent_len):
            kX_sliced = tensor[:, 0,:,:] # [samples, token at t, concepts, concept_embs]
            initial_outputs.append(kX_sliced[:,0,:])
        return K.stack(initial_outputs, 0)

    layer = tf.keras.layers.Lambda(loop, output_shape=(sent_len, None, 100))
    initial_outputs = layer(concept_embedded_layer)


    initial_t = tf.constant(0)
    initial_outputs = tf.TensorArray(dtype=tf.float32, size=sent_len)
    #shit = tf.placeholder(tf.float32, [None, emb_knowledge.shape[1]])
    condition = lambda t, _: tf.less(t, sent_len)
    #body = lambda i: tf.add(i, 1)
    def body(t, outputs_):
        kX_sliced = Lambda(lambda x: x[:, t, :, :], output_shape=(None, max_concepts, 100))(concept_embedded_layer)
        kX_sliced_temp = Lambda(lambda x: x[:, 0, :], output_shape=(None, 100))(
            kX_sliced)
        outputs_ = outputs_.write(t, kX_sliced_temp) #[ [samples, kge], [samples, kge], ...] =>[sent_len, samples, kge]
        return t + 1, outputs_

    # do the loop:
    _, outputs = tf.while_loop(condition, body, loop_vars=[initial_t, initial_outputs])#,

    initial_outputs = []
    for t in range(sent_len):
        if t == 0:
            with tf.variable_scope("tf", reuse=False):
                #kX_sliced = concept_embedded_layer[:, t, :, :]  # [samples, token at t, concepts, concept_embs]
                kX_sliced = Lambda(lambda x: x[:, t, :, :], output_shape=(None, max_concepts, 100))(concept_embedded_layer)
                X_token = sent_embedded[:, t, :]
                # temp = AttentionHerrmann_2(representation_claim=X_token, only_attended_vector=True, topic_size=50,
                #                  summed_up=True, self_attention=False, name='attention',
                #                  bias_1=True)(kX_sliced)
                # temp = attention_knowledge(kX_sliced, X_token, attention_size, return_alphas=False)
                kX_sliced_temp = Lambda(lambda x: x[:, 0, :], output_shape=(None, 100))(
                    kX_sliced)
                initial_outputs.append(kX_sliced_temp)
        else:
            with tf.variable_scope("tf", reuse=True):
                #kX_sliced = concept_embedded_layer[:, t, :, :]  # [samples, token at t, concepts, concept_embs]
                kX_sliced = Lambda(lambda x: x[:, t, :, :], output_shape=(None, max_concepts, 100))(
                    concept_embedded_layer)
                X_token = sent_embedded[:, t, :]
                # temp = AttentionHerrmann_2(representation_claim=X_token, only_attended_vector=True, topic_size=50,
                #                 summed_up=True, self_attention=False, name='attention',
                #                  bias_1=True)(kX_sliced)
                # temp = attention_knowledge(kX_sliced, X_token, attention_size, return_alphas=False)
                kX_sliced_temp = Lambda(lambda x: x[:, 0, :], output_shape=(None, 100))(
                    kX_sliced)
                initial_outputs.append(kX_sliced_temp)
    initial_outputs = K.stack(initial_outputs, 0)
    attended_knowledge = K.permute_dimensions(initial_outputs, (1, 0, 2))


    #shit = tf.placeholder(tf.float32, [None, emb_knowledge.shape[1]])
    condition = lambda t, _: tf.less(t, sent_len)
    #body = lambda i: tf.add(i, 1)
    def body(t, outputs_):
        kX_sliced = concept_embedded_layer[:, t,:,:] # [samples, token at t, concepts, concept_embs]
        X_token = sent_embedded[:, t,:]
        #outputs_=outputs_[t].assign(attention_knowledge(kX_sliced, X_token, attention_size, return_alphas=False))
        outputs_ = outputs_.write(t, attention_knowledge(kX_sliced, X_token, attention_size, return_alphas=False)) #[ [samples, kge], [samples, kge], ...] =>[sent_len, samples, kge]
        return t + 1, outputs_

    # do the loop:
    _, outputs = tf.scan(body, (), loop_vars=[initial_t, initial_outputs],
                               #shape_invariants=[initial_t.get_shape(), tf.TensorShape([None, sent_len])]
                               )#,"""

    # stack = initial_outputs.stack() # [sent_len, samples, kge_dim]
    # attended_knowledge = tf.transpose(initial_outputs, [1,0,2]) #[samples, sent_len, kge_dim]
    # attended_knowledge = Lambda(lambda x: tf.transpose(x, [1,0,2]))(initial_outputs) #[samples, sent_len, kge_dim]
    # initial_outputs = Lambda(lambda x: x.stack(), output_shape=(60, None, 100))(outputs)
    # initial_outputs = initial_outputs.stack()
    # initial_outputs = K.stack(initial_outputs, 0)
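The helpers get_avg_embedding (Example #1) and get_avg_embedding_for_topic_list (Example #2) are project utilities that are not shown here. Judging only from how they are called (topic tokens, the embedding matrix, and the word-embedding vocab), a plausible minimal sketch is the following; the actual implementation may handle OOV tokens or normalisation differently:

import numpy as np

def get_avg_embedding_sketch(topic_tokens, embeddings_lookup, vocab_we):
    # Hypothetical re-implementation: look up each topic token in the
    # word-embedding vocab (assumed to be a token -> row-index dict) and
    # average the corresponding embedding rows; unknown tokens are skipped.
    vectors = [embeddings_lookup[vocab_we[tok]] for tok in topic_tokens if tok in vocab_we]
    if not vectors:
        return np.zeros(embeddings_lookup.shape[1], dtype=np.float32)
    return np.mean(vectors, axis=0)

# Example #1 calls its helper as: get_avg_embedding(topic.split('_'), embeddings_lookup, vocab_we)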
Example #3
def train_model(data, topic, PROCESSED_DIR, SEED_FOLDER, **kwargs):
    """
    Trains a BiLSTM on DIP2016 and UKP in an alternating fashion. Re-implements the idea of the shared-private model from
    "Adversarial Multi-task Learning for Text Classification" by Pengfei Liu et al. Also includes ideas from https://jg8610.github.io/Multi-Task/.
    Consists of 2 private models (UKP, DIP) and 1 shared model.

    Basic BiRNN implementation based on:
    Links:
        [Long Short Term Memory](http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf)
        [MNIST Dataset](http://yann.lecun.com/exdb/mnist/).
    Author: Aymeric Damien
    Project: https://github.com/aymericdamien/TensorFlow-Examples/
    """
    # example from https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/3_NeuralNetworks/bidirectional_rnn.py
    dropout = kwargs['model_settings']["dropout"]
    lstm_size = kwargs['model_settings']["lstm_size"]
    monitor = kwargs['model_settings']["monitor"]
    batch_size = kwargs['model_settings']["batch_size"]
    epochs = kwargs['model_settings']["epochs"]
    learning_rate = kwargs['model_settings']["learning_rate"]
    train_embeddings = kwargs['model_settings']["train_embeddings"]
    # model file eg: 'results/only_sub_and_inst/model_runs/EvLSTM/seed_0/death_penalty_threelabel_crossdomain_monitor-f1_macro_do-0.3_lsize-32_bs-32_epochs-20_lr-0.001_trainemb-False_kl-only_sub_and_inst'
    model_file = SEED_FOLDER+topic+"_"+kwargs['model_settings']["model_file_suffix"]
    seed = kwargs['model_settings']['current_seed']

    # clear default graph (new model now)
    tf.reset_default_graph()

    # set configs for memory usage and reproducibility: https://stackoverflow.com/questions/38469632/tensorflow-non-repeatable-results
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    rn.seed(seed)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = False
    config.gpu_options.per_process_gpu_memory_fraction = 0.3
    graph_level_seed = seed
    operation_level_seed = seed
    tf.set_random_seed(graph_level_seed)

    # load vocab we and get indices for topic
    vocab_we = load_from_pickle(PROCESSED_DIR+"vocab_we.pkl")

    # load embeddings
    emb_sents = np.load(PROCESSED_DIR + "index_to_vec_we"+kwargs['model_settings']['word_embeddings'][1]+".npy")
    emb_knowledge = np.load(PROCESSED_DIR + "index_to_vec_kge"+kwargs['model_settings']['kg_embeddings'][1]+".npy")

    # load data
    X_train, X_dev, X_test = data["X_train"], data["X_dev"], data["X_test"] # [samples, sent_len]
    kX_train, kX_dev, kX_test = data["kX_train"], data["kX_dev"], data["kX_test"] # [samples, sent_len, max_concepts]
    X_topic_train, X_topic_dev, X_topic_test = get_avg_embedding_for_topics(data["X_topic_train"], emb_sents, vocab_we), \
                                               get_avg_embedding_for_topics(data["X_topic_dev"], emb_sents, vocab_we), \
                                               get_avg_embedding_for_topics(data["X_topic_test"], emb_sents, vocab_we)
    X_topic_cos_train = get_topic_token_cos(X_train, X_topic_train, emb_sents)
    X_topic_cos_dev = get_topic_token_cos(X_dev, X_topic_dev, emb_sents)
    X_topic_cos_test = get_topic_token_cos(X_test, X_topic_test, emb_sents)
    y_train, y_dev, y_test = data["y_train"], data["y_dev"], data["y_test"]
    val_y_non_one_hot = [np.argmax(pred) for pred in y_dev]

    # some constants
    sent_len = X_train.shape[1]
    max_concepts = kX_train.shape[2]
    num_labels = y_train.shape[1]
    attention_size = kwargs['model_settings'].get('attention_size', emb_sents.shape[1])

    # calculate how often the model has to run for 1 epoch, given the batch_size
    periods_ukp = int(len(X_train) / batch_size)
    periods_ukp_val = int(len(X_dev) / batch_size)

    # tf Graph input
    X = tf.placeholder(tf.int32, [None, sent_len])
    X_topic = tf.placeholder(tf.float32, [None, emb_sents.shape[1]])
    X_topic_cos = tf.placeholder(tf.float32, [None, sent_len, 1])
    KX = tf.placeholder(tf.int32, [None, sent_len, max_concepts])
    Y = tf.placeholder(tf.float32, [None, num_labels])
    EMB_SENTS = tf.placeholder(tf.float32, [emb_sents.shape[0], emb_sents.shape[1]]) #https://stackoverflow.com/questions/35687678/using-a-pre-trained-word-embedding-word2vec-or-glove-in-tensorflow
    EMB_KNOWLEDGE = tf.placeholder(tf.float32, [emb_knowledge.shape[0], emb_knowledge.shape[1]]) #https://stackoverflow.com/questions/35687678/using-a-pre-trained-word-embedding-word2vec-or-glove-in-tensorflow
    dropout_const = tf.placeholder(tf.float32)

    # Define weights
    weights = {
        # Hidden layer weights => 2*lstm_size because of forward + backward LSTM cells
        #'dense': tf.Variable(tf.random_normal([(lstm_size*2)+emb_knowledge.shape[1], num_labels])),
        'dense': tf.Variable(tf.random_normal([lstm_size*2, num_labels])),
        'emb_sents': tf.Variable(tf.constant(0.0, shape=[emb_sents.shape[0], emb_sents.shape[1]]),
                trainable=train_embeddings, name="emb_sents"),
        'emb_knowledge': tf.Variable(tf.constant(0.0, shape=[emb_knowledge.shape[0], emb_knowledge.shape[1]]),
                trainable=train_embeddings, name="emb_knowledge")
    }
    biases = {
        'dense': tf.Variable(tf.random_normal([num_labels])),
    }

    # Embed sentences and knowledge
    # Source: https://stackoverflow.com/questions/35687678/using-a-pre-trained-word-embedding-word2vec-or-glove-in-tensorflow
    emb_sents_init = weights['emb_sents'].assign(EMB_SENTS)
    embedded_word_ids = tf.nn.embedding_lookup(emb_sents_init, X) # [samples, sent_len, we_dim]
    emb_knowledge_init = weights['emb_knowledge'].assign(EMB_KNOWLEDGE)
    emb_knowledge_ids = tf.nn.embedding_lookup(emb_knowledge_init, KX) # [samples, sent_len, max_concepts, kge_dim]

    #bilstm, bilstm_last = DynamicBiRNN(embedded_word_ids, sent_len, lstm_size, name="bilstm")
    #bilstm_do_last = tf.nn.dropout(bilstm_last, dropout_const, seed=operation_level_seed)
    #bilstm_do = tf.nn.dropout(bilstm, dropout_const, seed=operation_level_seed)

    # attention
    initial_t = tf.constant(0)
    initial_outputs = tf.TensorArray(dtype=tf.float32, size=sent_len)
    condition = lambda t, _: tf.less(t, sent_len)
    #body = lambda i: tf.add(i, 1)
    def body(t, outputs_):
        kX_sliced = emb_knowledge_ids[:, t,:,:] # [samples, token at t, concepts, concept_embs]
        X_token = embedded_word_ids[:, t,:]
        outputs_ = outputs_.write(t, attention_topic_and_knowledge(kX_sliced, X_topic[:, :], X_token, attention_size, return_alphas=False)) #[ [samples, kge], [samples, kge], ...] =>[sent_len, samples, kge]
        return t + 1, outputs_

    # do the loop:
    _, outputs = tf.while_loop(condition, body, loop_vars=[initial_t, initial_outputs])#,
    stack = outputs.stack() # [sent_len, samples, kge_dim]
    attended_knowledge = tf.transpose(stack, [1,0,2]) #[samples, sent_len, kge_dim]

    concat_sequences = tf.concat([embedded_word_ids, attended_knowledge, X_topic_cos], axis=2)  # only the concat axis may differ in size; all other dims must match

    bilstm_2, bilstm_last_2 = DynamicBiRNN(concat_sequences, sent_len, lstm_size, name="bilstm_2")

    bilstm_do_2 = tf.nn.dropout(bilstm_last_2, dropout_const, seed=operation_level_seed)

    dense = tf.matmul(bilstm_do_2, weights['dense']) + biases['dense']

    prediction_ukp = tf.nn.softmax(dense)

    # Define loss and optimizer (https://stats.stackexchange.com/questions/327348/how-is-softmax-cross-entropy-with-logits-different-from-softmax-cross-entropy-wi)
    loss_ukp = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=dense, labels=Y))  # pass the raw logits; prediction_ukp is already softmaxed
    optimizer_ukp = tf.train.AdamOptimizer(learning_rate=learning_rate) # Adam with 0.01 => no learning, GradientDescent with 0.01 works well
    train_ukp = optimizer_ukp.minimize(loss_ukp)

    # Evaluate model (with test logits, for dropout to be disabled)
    correct_pred_ukp = tf.argmax(prediction_ukp, 1)
    correct_pred_ukp_eq = tf.equal(correct_pred_ukp, tf.argmax(Y, 1))
    accuracy_ukp = tf.reduce_mean(tf.cast(correct_pred_ukp_eq, tf.float32))

    # Initialize the variables (i.e. assign their default value)
    init = tf.global_variables_initializer()

    # init saver
    saver = tf.train.Saver()

    # Start training
    with tf.Session(config=config) as sess:
        best_f1_score = 0

        # Run the initializer
        sess.run(init)

        for epoch in range(epochs): # iterate over epochs
            loss, acc, loss_val, acc_val, pred_list_ukp = 0, 0, 0, 0, np.array([])
            print("====== Start epoch " + str(epoch) + " for UKP ======")
            # https://stackoverflow.com/questions/44565186/how-to-implement-next-batch-function-for-custom-data-in-python
            for p in range(periods_ukp+1): # each period trains samples of length batch_size
                # Run optimization op (backprop)
                _, loss_t, acc_t = sess.run([train_ukp, loss_ukp, accuracy_ukp],
                                          feed_dict={X: X_train[p*batch_size:(p+1)*batch_size],
                                                     KX: kX_train[p * batch_size:(p + 1) * batch_size],
                                                     X_topic: X_topic_train[p * batch_size:(p + 1) * batch_size],
                                                     X_topic_cos: X_topic_cos_train[p * batch_size:(p + 1) * batch_size],
                                                     Y: y_train[p*batch_size:(p+1)*batch_size],
                                                     EMB_SENTS: emb_sents, EMB_KNOWLEDGE: emb_knowledge,
                                                     dropout_const: 1-dropout})
                loss += loss_t
                acc += acc_t
                print("Train_period= "+str(p)+"/"+str(periods_ukp)+", train_loss= " + "{:.4f}".format(loss / p) + ", train_acc= " + "{:.3f}".format(acc/p), end='\r')

            for p in range(periods_ukp_val+1):
                loss_val_t, acc_val_t, pred_list_ukp_t = sess.run([loss_ukp, accuracy_ukp, correct_pred_ukp],
                                                                  feed_dict={X: X_dev[p*batch_size:(p+1)*batch_size],
                                                                             KX: kX_dev[p*batch_size:(p+1)*batch_size],
                                                                             X_topic: X_topic_dev[p * batch_size:(p + 1) * batch_size],
                                                                             X_topic_cos: X_topic_cos_dev[p * batch_size:(p + 1) * batch_size],
                                                                             Y: y_dev[p*batch_size:(p+1)*batch_size],
                                                                             EMB_SENTS: emb_sents, EMB_KNOWLEDGE: emb_knowledge,
                                                                             dropout_const: 1.0})
                loss_val += loss_val_t
                acc_val += acc_val_t
                pred_list_ukp = np.concatenate([pred_list_ukp, pred_list_ukp_t], axis=-1)


            temp_F1_score = f1_score(val_y_non_one_hot, pred_list_ukp, average='macro')
            if temp_F1_score > best_f1_score:
                best_f1_score = temp_F1_score
                # current model is better than the previous best => save it
                save_path = saver.save(sess, model_file,
                                       latest_filename="checkpoint_" + topic + "_" + kwargs['model_settings'][
                                           "model_file_suffix"])

            print("train_loss= " + "{:.4f}".format(loss) + ", train_acc= " + \
                  "{:.3f}".format(acc/(periods_ukp+1)) + ", val_loss= " + "{:.4f}".format(loss_val) + ", val_acc= " + \
                  "{:.3f}".format(acc_val/(periods_ukp_val+1)) + ", val_F1= " + "{:.3f}".format(temp_F1_score)
                  )

            # variables_names = [v.name for v in tf.trainable_variables() if "bilstm_shared/bidirectional_rnn/fw/" in v.name]
            # print(variables_names)
            # values = sess.run(variables_names)
            # for k, v in zip(variables_names, values):
            #    print(k, v)

        saver.restore(sess, model_file)

        return sess.run(correct_pred_ukp, feed_dict={X: X_test, KX: kX_test, X_topic: X_topic_test, X_topic_cos: X_topic_cos_test, Y: y_test,
                                                     EMB_SENTS: emb_sents, EMB_KNOWLEDGE: emb_knowledge, dropout_const: 1.0}).tolist(),\
               sess.run(correct_pred_ukp, feed_dict={X: X_dev, KX: kX_dev, X_topic: X_topic_dev, X_topic_cos: X_topic_cos_dev, Y: y_dev,
                                                     EMB_SENTS: emb_sents, EMB_KNOWLEDGE: emb_knowledge, dropout_const: 1.0}).tolist()
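Example #3 builds the attended-knowledge sequence with a tf.while_loop that writes one attention result per token into a tf.TensorArray and then transposes the stacked result back to batch-major order. A minimal, self-contained sketch of that pattern under the same TF1-style API (with a trivial stand-in for the attention call) looks like this:

import numpy as np
import tensorflow as tf  # TF1-style API, as in the examples above

sent_len, dim = 4, 3
seq = tf.placeholder(tf.float32, [None, sent_len, dim])

t0 = tf.constant(0)
outputs0 = tf.TensorArray(dtype=tf.float32, size=sent_len)
condition = lambda t, _: tf.less(t, sent_len)

def body(t, outputs_):
    token_t = seq[:, t, :]                          # [batch, dim] slice for token t
    return t + 1, outputs_.write(t, token_t * 2.0)  # stand-in for the attention op

_, outputs = tf.while_loop(condition, body, loop_vars=[t0, outputs0])
stacked = outputs.stack()                           # [sent_len, batch, dim]
result = tf.transpose(stacked, [1, 0, 2])           # [batch, sent_len, dim]

with tf.Session() as sess:
    print(sess.run(result, feed_dict={seq: np.ones((2, sent_len, dim))}).shape)  # (2, 4, 3)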
Example #4
def train_model(data, topic, PROCESSED_DIR, SEED_FOLDER, **kwargs):
    def func(x):
        # Reduce each token's knowledge paths to a single vector with the shared
        # `paths_bilstm`; `sent_len` and `paths_bilstm` are defined further down in
        # the enclosing function and are bound when `func` is first called.
        liste = []
        for i in range(sent_len):
            liste.append(TimeDistributed(paths_bilstm)(x[:, i, :, :, :])
                         )  # [bs, max_paths, max_path_len, emb_dim] * sent_len
        stacked = K.stack(liste, axis=1)
        return stacked

    dropout = kwargs['model_settings']["dropout"]
    lstm_size = kwargs['model_settings']["lstm_size"]
    monitor = kwargs['model_settings']["monitor"]
    batch_size = kwargs['model_settings']["batch_size"]
    epochs = kwargs['model_settings']["epochs"]
    learning_rate = kwargs['model_settings']["learning_rate"]
    train_embeddings = kwargs['model_settings']["train_embeddings"]
    # model file eg: 'results/only_sub_and_inst/model_runs/EvLSTM/seed_0/death_penalty_threelabel_crossdomain_monitor-f1_macro_do-0.3_lsize-32_bs-32_epochs-20_lr-0.001_trainemb-False_kl-only_sub_and_inst'
    model_file = SEED_FOLDER + topic + "_" + kwargs['model_settings'][
        "model_file_suffix"]
    seed = kwargs['model_settings']['current_seed']

    # clear default graph (new model now)
    #tf.reset_default_graph()

    # set configs for memory usage and reproducibility: https://stackoverflow.com/questions/38469632/tensorflow-non-repeatable-results
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    rn.seed(seed)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = False
    config.gpu_options.per_process_gpu_memory_fraction = 0.3
    #graph_level_seed = seed
    operation_level_seed = seed
    #tf.set_random_seed(graph_level_seed)

    # load embeddings
    emb_sents = np.load(PROCESSED_DIR + "index_to_vec_we" +
                        kwargs['model_settings']['word_embeddings'][1] +
                        ".npy")
    emb_knowledge = np.load(PROCESSED_DIR + "index_to_vec_kge" +
                            kwargs['model_settings']['kg_embeddings'][1] +
                            ".npy")

    # load data
    X_train, X_dev, X_test = data["X_train"], data["X_dev"], data[
        "X_test"]  # [samples, sent_len]
    kX_train, kX_dev, kX_test = data["kX_train"], data["kX_dev"], data[
        "kX_test"]  # [samples, sent_len, max_concepts]
    y_train, y_dev, y_test = data["y_train"], data["y_dev"], data["y_test"]
    val_y_non_one_hot = [np.argmax(pred) for pred in y_dev]

    # load vocab we and get indices for topic
    vocab_we = load_from_pickle(PROCESSED_DIR + "vocab_we.pkl")

    # generate topic data
    X_topic_train = get_avg_embedding_for_topic_list(data['X_topic_train'],
                                                     emb_sents, vocab_we)
    X_topic_dev = get_avg_embedding_for_topic_list(data['X_topic_dev'],
                                                   emb_sents, vocab_we)
    X_topic_test = get_avg_embedding_for_topic_list(data['X_topic_test'],
                                                    emb_sents, vocab_we)

    # some constants
    sent_len = X_train.shape[1]
    max_paths = kX_train.shape[2]
    max_path_len = kX_train.shape[3]
    num_labels = y_train.shape[1]
    attention_size = kwargs['model_settings'].get('attention_size',
                                                  emb_sents.shape[1])

    ############################
    #   KNOWLEDGE PROCESSING   #
    ############################

    # input for all concepts of a sentence
    sentence_inputs = Input(shape=(sent_len, ),
                            dtype='int32',
                            name="sentence_inputs")
    topic_vector_input = Input(shape=(emb_sents.shape[1], ),
                               dtype='float32',
                               name="topic_vector_sent_wise")
    knowledge_inputs = Input(shape=(
        sent_len,
        max_paths,
        max_path_len,
    ),
                             dtype='int32',
                             name="knowledge_inputs")

    emb_knowledge_ids = Embedding(
        emb_knowledge.shape[0],
        emb_knowledge.shape[1],
        mask_zero=True,
        weights=[emb_knowledge],
        trainable=train_embeddings)(
            knowledge_inputs)  # [samples, sent_len, max_paths, max_path_len, kge_dim]

    embedded_word_ids = Embedding(
        emb_sents.shape[0],
        emb_sents.shape[1],
        mask_zero=True,
        weights=[emb_sents],
        trainable=train_embeddings,
        input_length=sent_len)(sentence_inputs)  # [samples, sent_len, we_dim]

    # function that reduces the paths to a single vector => from there on, the model is equal to the shallow model
    # in: [bs, sent_len, max_paths, max_path_len, kge_dim], out: [bs, sent_len, max_paths, 2*lstm_size]
    paths_bilstm = Bidirectional(
        LSTM(lstm_size))  # define lstm that reduces the paths to one vector
    reduce_paths_to_vector = Lambda(
        func,
        output_shape=(sent_len, max_paths, 2 * lstm_size))(emb_knowledge_ids)

    attended_knowledge = attention_knowledge(
        embedded_word_ids,
        topic_vector_input,
        attention_size,
        return_alphas=False,
        summed_up=True)(reduce_paths_to_vector)

    concat_sequences = Lambda(lambda x: tf.concat([x[0], x[1]], axis=-1))(
        [embedded_word_ids, attended_knowledge])

    # define bilstm + dropout
    sent_bilstm = Bidirectional(
        LSTM(lstm_size,
             input_shape=(None, sent_len, attention_size +
                          emb_sents.shape[1])))(concat_sequences)
    sent_bilstm_dropout = Dropout(dropout)(sent_bilstm)

    output_layer = Dense(num_labels, activation='softmax')(sent_bilstm_dropout)

    model = Model(
        inputs=[sentence_inputs, knowledge_inputs, topic_vector_input],
        outputs=output_layer)

    adam = Adam(lr=learning_rate)
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])

    #e = EarlyStopping(monitor=monitor, mode='auto')
    e = ModelCheckpoint(model_file,
                        monitor=monitor,
                        verbose=0,
                        save_best_only=True,
                        save_weights_only=True,
                        mode='auto',
                        period=1)
    model.fit([X_train, kX_train, X_topic_train],
              y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=([X_dev, kX_dev, X_topic_dev], y_dev),
              callbacks=[e],
              verbose=1)
    model.load_weights(model_file)

    y_pred_test = model.predict([X_test, kX_test, X_topic_test], verbose=False)
    y_pred_dev = model.predict([X_dev, kX_dev, X_topic_dev], verbose=False)

    return [np.argmax(pred)
            for pred in y_pred_test], [np.argmax(pred) for pred in y_pred_dev]
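The Keras variants above (Examples #1, #2 and #4) all rely on ModelCheckpoint(save_best_only=True, save_weights_only=True) followed by model.load_weights() so that predictions come from the best epoch rather than the last one. A standalone sketch of that pattern with a toy model (file name and random data are placeholders, not from the original project):

import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import ModelCheckpoint

model = Sequential([Dense(3, activation='softmax', input_shape=(10,))])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

checkpoint = ModelCheckpoint('best_weights.h5', monitor='val_loss',
                             save_best_only=True, save_weights_only=True)
X = np.random.rand(64, 10)
y = np.eye(3)[np.random.randint(0, 3, 64)]            # one-hot labels
model.fit(X, y, epochs=5, validation_split=0.25, callbacks=[checkpoint], verbose=0)
model.load_weights('best_weights.h5')                 # restore the best epoch
preds = [np.argmax(p) for p in model.predict(X)]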