import os

from keras.layers import Input, LSTM, RepeatVector
from keras.models import Model, model_from_json
from sklearn.model_selection import train_test_split

import kaggle  # project-local helper module

# DATA_DIR, MODEL_DIR, QA_TRAIN_FILE, STORY_FILE, MODEL_ARCH and
# MODEL_WEIGHTS are defined elsewhere in the original script.
QA_TEST_FILE = "8thGr-NDMC-Test.csv"

WORD2VEC_BIN = "GoogleNews-vectors-negative300.bin.gz"
WORD2VEC_EMBED_SIZE = 300

LSTM_SEQLEN = 196  # from original model
NUM_CHOICES = 4  # number of choices for multiple choice

#### Load up the vectorizer
qapairs = kaggle.get_question_answer_pairs(
    os.path.join(DATA_DIR, QA_TRAIN_FILE))
tqapairs = kaggle.get_question_answer_pairs(
    os.path.join(DATA_DIR, QA_TEST_FILE), is_test=True)

word2idx = kaggle.build_vocab([], qapairs, tqapairs)
vocab_size = len(word2idx) + 1  # include mask character 0
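
# Hedged sketch (not part of the original snippet): the WORD2VEC_*
# constants above suggest pretrained embeddings are loaded for this
# vocabulary; one plausible wiring, using gensim, looks like this.
import numpy as np
from gensim.models import KeyedVectors

word2vec = KeyedVectors.load_word2vec_format(
    os.path.join(DATA_DIR, WORD2VEC_BIN), binary=True)
embedding_weights = np.zeros((vocab_size, WORD2VEC_EMBED_SIZE))
for word, idx in word2idx.items():
    if word in word2vec:
        # row 0 stays all zeros for the mask character
        embedding_weights[idx] = word2vec[word]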

#### Load up the model
with open(os.path.join(MODEL_DIR, MODEL_ARCH), "r") as fjson:
    model_json = fjson.read()
model = model_from_json(model_json)
model.load_weights(os.path.join(MODEL_DIR, MODEL_WEIGHTS))

#### Set up a sample question (the correct answer is "B") ####
question = "Which is a distinction between an epidemic and a pandemic?"
answers = [
    "the symptoms of the disease", "the geographical area affected",
    "the species of organisms infected",
    "the season in which the disease spreads"

#### Example 2: LSTM sequence autoencoder over the story corpus ####

EMBED_SIZE = 64
BATCH_SIZE = 256
NBR_EPOCHS = 20

stories = kaggle.get_stories(os.path.join(DATA_DIR, STORY_FILE))
story_maxlen = max([len(words) for words in stories])

# this part is only required to get the maximum sequence length
qapairs = kaggle.get_question_answer_pairs(
    os.path.join(DATA_DIR, QA_TRAIN_FILE))
question_maxlen = max([len(qapair[0]) for qapair in qapairs])
answer_maxlen = max([len(qapair[1]) for qapair in qapairs])
seq_maxlen = max([story_maxlen, question_maxlen, answer_maxlen])

word2idx = kaggle.build_vocab(stories, qapairs, [])
vocab_size = len(word2idx)

Xs = kaggle.vectorize_stories(stories, word2idx, seq_maxlen)
Xstrain, Xstest = train_test_split(Xs, test_size=0.3, random_state=42)
print(Xstrain.shape, Xstest.shape)
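
# The Input shape below, (seq_maxlen, vocab_size), implies that
# kaggle.vectorize_stories one-hot encodes each story. A hypothetical
# stand-in (not the project's actual helper), assuming the 1-based word
# indices produced by build_vocab:
import numpy as np

def one_hot_stories(stories, word2idx, seq_maxlen):
    X = np.zeros((len(stories), seq_maxlen, len(word2idx)))
    for i, words in enumerate(stories):
        for t, word in enumerate(words[:seq_maxlen]):
            X[i, t, word2idx[word] - 1] = 1.0
    return X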

# Sequence autoencoder: compress each one-hot story matrix
# (seq_maxlen x vocab_size) into a single EMBED_SIZE vector, then
# repeat that vector seq_maxlen times and decode it back into a
# sequence of vocab_size-dim outputs.
inputs = Input(shape=(seq_maxlen, vocab_size))
encoded = LSTM(EMBED_SIZE)(inputs)
decoded = RepeatVector(seq_maxlen)(encoded)
decoded = LSTM(vocab_size, return_sequences=True)(decoded)
autoencoder = Model(inputs, decoded)

autoencoder.compile(optimizer="adadelta", loss="binary_crossentropy")

autoencoder.fit(Xstrain, Xstrain,
                batch_size=BATCH_SIZE,
                epochs=NBR_EPOCHS,
                validation_data=(Xstest, Xstest))
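
# Once trained, the encoder half can be reused on its own to map each
# story to a fixed EMBED_SIZE-dim vector (a sketch; this step is not
# shown in the original snippet).
encoder = Model(inputs, encoded)
story_vecs = encoder.predict(Xstest)
print(story_vecs.shape)  # (num_test_stories, EMBED_SIZE)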