prepare_data = lambda data: (to_seq(data[0]), to_seq(data[1]), data[2])

training = prepare_data(training)
validation = prepare_data(validation)
test = prepare_data(test)

premise = Input(shape=(MAX_LEN, ), dtype='int32')
hypothesis = Input(shape=(MAX_LEN, ), dtype='int32')

prem_reps = []  # premise sentence representations
hypo_reps = []  # hypothesis sentence representations

# read in embedding and translate
if args.agg_we is not None or args.align_op_we is not None:
    print(" fetching word embedding")
    embedding_matrix = get_embedding_matrix(args.embedding, VOCAB,
                                            EMBED_HIDDEN_SIZE, tokenizer)
    embed = Embedding(VOCAB,
                      EMBED_HIDDEN_SIZE,
                      weights=[embedding_matrix],
                      input_length=MAX_LEN,
                      trainable=False)

    prem = embed(premise)
    hypo = embed(hypothesis)

    if args.timedist:
        translate = TimeDistributed(
            Dense(SENT_HIDDEN_SIZE, activation=ACTIVATION))

        prem = translate(prem)
        hypo = translate(hypo)
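# get_embedding_matrix itself is not shown on this page; below is a minimal
# sketch of such a helper, assuming a GloVe-style text file (token followed by
# its vector on each line). The actual implementation behind these examples may
# differ.
import numpy as np

def get_embedding_matrix(embedding_path, vocab_size, embed_dim, tokenizer):
    # token -> vector, read from the pre-trained embedding file
    vectors = {}
    with open(embedding_path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            vectors[parts[0]] = np.asarray(parts[1:], dtype='float32')

    # Row i holds the vector for the token the tokenizer mapped to index i;
    # out-of-vocabulary tokens keep an all-zero row.
    matrix = np.zeros((vocab_size, embed_dim))
    for word, idx in tokenizer.word_index.items():
        if idx < vocab_size and word in vectors:
            matrix[idx] = vectors[word]
    return matrix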
Example #2
EMBEDDING_DIM = 200
MAX_SEQUENCE_LENGTH = 200
MAX_JACCARD_LENGTH = 30
INC_BATCH_SIZE = 80000

BASE_DIR = ''
# W2V_MODEL_DIR = '/Users/knight/Desktop/GodClassDetection/embedding_model/new_model6.bin'
W2V_MODEL_DIR = '/Users/knight/Desktop/GodClassDetection/embedding_model/new_model6_nltk.bin'
TRAIN_SET_DIR = '/Users/knight/Desktop/GodClassDetection/trainset'  # change this to your own path
FULL_MN_DIR = TRAIN_SET_DIR

tokenizer = preprocess.get_tokenizer(FULL_MN_DIR)
all_word_index = tokenizer.word_index
embedding_matrix = preprocess.get_embedding_matrix(all_word_index,
                                                   W2V_MODEL_DIR,
                                                   dim=EMBEDDING_DIM)

acc_list = []
loss_list = []

print("11111111111111111")
x_train, y_train = preprocess.get_xy_train(TRAIN_SET_DIR + '/finetune',
                                           tokenizer=tokenizer,
                                           mn_maxlen=MAX_SEQUENCE_LENGTH,
                                           embedding_matrix=embedding_matrix)

print('Fine tune model.')

# fine-tune
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
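# The fit call is not shown in this fragment; a minimal fine-tuning sketch.
# The epoch count, batch size, and validation split below are assumptions,
# not values from the original script.
history = model.fit(x_train, y_train,
                    batch_size=32,
                    epochs=5,
                    validation_split=0.1)
acc_list.extend(history.history['acc'])
loss_list.extend(history.history['loss'])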
Example #3
    for i in range(0, m):
        sentences1.append(text_to_word_list(questions1[i]))
        sentences2.append(text_to_word_list(questions2[i]))

    print('Corpus length = %d' % m)

    documents = sentences1 + sentences2

    # Create the tokenizer
    tokenizer = Tokenizer()

    # fit the tokenizer on the documents
    tokenizer.fit_on_texts(documents)

    # Get the word to index dictionary
    word_to_idx = tokenizer.word_index
    print('Vocabulary size = %d' % len(word_to_idx))

    # Generate the embedding matrix
    embedding_matrix, word2vec = get_embedding_matrix(
        word_to_idx, documents, network_config['pre_trained_vector_flag'])

    # Create training, validation and test set
    train_validation_dict = create_train_dev_test_set(tokenizer, sentences1,
                                                      sentences2, sim_score)

    # Train the model
    lstm_network = BiLSTMNetwork()
    lstm_network.train_model(train_validation_dict, embedding_matrix)
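# text_to_word_list is not defined in this fragment; a minimal sketch, assuming
# a simple lowercase-and-split cleaner (the original helper may do more, e.g.
# expanding contractions):
import re

def text_to_word_list(text):
    # Drop everything except word characters and whitespace, lowercase, split.
    return re.sub(r"[^\w\s]", " ", str(text).lower()).split()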
Example #4
OPTIMIZER = 'rmsprop'

to_seq = lambda X: pad_sequences(tokenizer.texts_to_sequences(X),
                                 maxlen=MAX_LEN)
prepare_data = lambda data: (to_seq(data[0]), to_seq(data[1]), data[2])

training = prepare_data(training)
validation = prepare_data(validation)
test = prepare_data(test)

premise = Input(shape=(MAX_LEN, ), dtype='int32')
hypothesis = Input(shape=(MAX_LEN, ), dtype='int32')

# read in embedding and translate
print("> fetching word embedding")
embedding_matrix = get_embedding_matrix(args.embedding, VOCAB,
                                        EMBED_HIDDEN_SIZE, tokenizer)
embed = Embedding(VOCAB,
                  EMBED_HIDDEN_SIZE,
                  weights=[embedding_matrix],
                  input_length=MAX_LEN,
                  trainable=False)

prem = embed(premise)
hypo = embed(hypothesis)

translate = TimeDistributed(Dense(SENT_HIDDEN_SIZE, activation=ACTIVATION))

prem = translate(prem)
hypo = translate(hypo)

alignment = _align(prem, hypo, normalize=True)
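# _align is not defined in this fragment; a plausible sketch, consistent with
# the custom alignment layer fragment in Example #7: optionally L2-normalize
# the two sequences of token vectors, then take all pairwise dot products.
from keras import backend as K
from keras.layers import Lambda

def _align(a, b, normalize=False):
    def _pairwise_dot(inputs):
        x, y = inputs
        if normalize:
            x = K.l2_normalize(x, axis=2)
            y = K.l2_normalize(y, axis=2)
        # (batch, len_a, dim) . (batch, len_b, dim) -> (batch, len_a, len_b)
        return K.batch_dot(x, y, axes=[2, 2])
    return Lambda(_pairwise_dot)([a, b])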
Example #5
PATIENCE = int(args.patience)
BATCH_SIZE = 512
DENSE_NEURON_COUNT = int(args.neurons)
DP = 0.2
L2 = 4e-6

to_seq = lambda X: pad_sequences(tokenizer.texts_to_sequences(X),
                                 maxlen=SENTENCE_MAX_LEN)
prepare_data = lambda data: (to_seq(data[0]), to_seq(data[1]), data[2])

training = prepare_data(training)
validation = prepare_data(validation)
test = prepare_data(test)

logging.info("> fetching embedding")
embedding_matrix = get_embedding_matrix(emb_file, VOCAB_SIZE, WORD_DIM,
                                        tokenizer)

logging.info("> fetching antonym embedding")
ant_embedding_matrix = get_embedding_matrix(ant_emb_file, VOCAB_SIZE,
                                            ANT_WORD_DIM, tokenizer)

embed = Embedding(VOCAB_SIZE,
                  WORD_DIM,
                  weights=[embedding_matrix],
                  input_length=SENTENCE_MAX_LEN,
                  trainable=False)
ant_embed = Embedding(VOCAB_SIZE,
                      ANT_WORD_DIM,
                      weights=[ant_embedding_matrix],
                      input_length=SENTENCE_MAX_LEN,
                      trainable=False)
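# How the word and antonym embeddings are combined is not shown in this
# fragment; one common option (an assumption, not the original code) is to
# concatenate the two vectors per token:
from keras.layers import Input, concatenate

premise = Input(shape=(SENTENCE_MAX_LEN,), dtype='int32')
hypothesis = Input(shape=(SENTENCE_MAX_LEN,), dtype='int32')

prem = concatenate([embed(premise), ant_embed(premise)])
hypo = concatenate([embed(hypothesis), ant_embed(hypothesis)])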
Example #6
import numpy as np
import preprocess as pp
from keras.models import model_from_json

question1, question2 = pp.extract_data("quora-question-pairs/test.csv", 'test')
question1_word_sequences, question2_word_sequences, word_index = pp.tokenize(
    question1, question2)
embeddings_index = pp.get_embeddings("glove.840B.300d/glove.840B.300d.txt")
nb_words, word_embedding_matrix = pp.get_embedding_matrix(
    word_index, embeddings_index)
q1_data, q2_data, word_embedding_matrix, nb_words = pp.process_data(
    question1_word_sequences, question2_word_sequences, word_embedding_matrix,
    nb_words, 'test')

X_train = np.stack((q1_data, q2_data), axis=1)
Q1_train = X_train[:, 0]
Q2_train = X_train[:, 1]

json_file = open('best_weights/model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
model = model_from_json(loaded_model_json)
model.load_weights("best_weights/weights.h5")
print("Loaded model from disk")

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
score = model.predict([Q1_train, Q2_train])
print(score)
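# score holds per-pair duplicate probabilities in [0, 1]; a small follow-up
# sketch (not part of the original script) to turn them into hard labels:
labels = (score > 0.5).astype(int).ravel()
print(labels[:10])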
Example #7
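# Fragment of a custom alignment layer: the tail of its call() method
# (optional L2 normalization followed by a pairwise batch dot product) and its
# compute_output_shape().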
            a = K.l2_normalize(a, axis=2)
            b = K.l2_normalize(b, axis=2)

        return K.batch_dot(a, b, axes=[2, 2])

    def compute_output_shape(self, input_shape):
        a_shape, b_shape = input_shape
        return (a_shape[0], a_shape[1], b_shape[1])


premise = Input(shape=(MAX_LEN, ), dtype='int32')
hypothesis = Input(shape=(MAX_LEN, ), dtype='int32')

# read in embedding and translate
print("> fetching word embedding")
embedding_matrix = get_embedding_matrix(args.embedding, VOCAB, 300, tokenizer)
embed = Embedding(VOCAB,
                  300,
                  weights=[embedding_matrix],
                  input_length=MAX_LEN,
                  trainable=False)

prem = embed(premise)
hypo = embed(hypothesis)

translate = TimeDistributed(Dense(300, activation="relu"))

prem = translate(prem)
hypo = translate(hypo)

perspectives = 5