Example #1
import os

from sklearn.model_selection import train_test_split

import kaggle

DATA_DIR = "../data/comp_data"
MODEL_DIR = "../data/models"
WORD2VEC_BIN = "GoogleNews-vectors-negative300.bin.gz"
WORD2VEC_EMBED_SIZE = 300

QA_TRAIN_FILE = "8thGr-NDMC-Train.csv"
STORY_FILE = "studystack_qa_cleaner_no_qm.txt"

QA_EMBED_SIZE = 64
BATCH_SIZE = 32
NBR_EPOCHS = 20

## extract data

print("Loading and formatting data...")
qapairs = kaggle.get_question_answer_pairs(
    os.path.join(DATA_DIR, QA_TRAIN_FILE))
question_maxlen = max([len(qapair[0]) for qapair in qapairs])
answer_maxlen = max([len(qapair[1]) for qapair in qapairs])
seq_maxlen = max([question_maxlen, answer_maxlen])

word2idx = kaggle.build_vocab([], qapairs, [])
vocab_size = len(word2idx) + 1  # include mask character 0

Xq, Xa, Y = kaggle.vectorize_qapairs(qapairs, word2idx, seq_maxlen)
Xqtrain, Xqtest, Xatrain, Xatest, Ytrain, Ytest = \
    train_test_split(Xq, Xa, Y, test_size=0.3, random_state=42)
print(Xqtrain.shape, Xqtest.shape, Xatrain.shape, Xatest.shape, Ytrain.shape,
      Ytest.shape)

# get embeddings from word2vec
# see https://github.com/fchollet/keras/issues/853
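# The snippet is truncated here. What follows is a minimal sketch, assuming
# gensim's KeyedVectors and the constants above, of how the word2vec vectors
# might be turned into an initial weight matrix for a Keras Embedding layer
# (the technique discussed in the issue linked above); it is an illustration,
# not the author's original code.
import numpy as np
from gensim.models import KeyedVectors

# Load the pretrained 300-dimensional GoogleNews vectors.
word2vec = KeyedVectors.load_word2vec_format(
    os.path.join(DATA_DIR, WORD2VEC_BIN), binary=True)

# Row 0 stays reserved for the mask character; words missing from
# word2vec keep a small random initialization.
embedding_weights = np.random.uniform(
    -0.05, 0.05, (vocab_size, WORD2VEC_EMBED_SIZE))
for word, idx in word2idx.items():
    if word in word2vec:
        embedding_weights[idx] = word2vec[word]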

Example #2

import os

from keras.models import model_from_json

import kaggle

MODEL_DIR = "../data/models"
#MODEL_WEIGHTS = "qa-lstm-model-best.hdf5"
MODEL_ARCH = "qa-lstm-fem-attn.json"
MODEL_WEIGHTS = "qa-lstm-fem-attn-final.h5"

DATA_DIR = "../data/comp_data"
QA_TRAIN_FILE = "8thGr-NDMC-Train.csv"
QA_TEST_FILE = "8thGr-NDMC-Test.csv"

WORD2VEC_BIN = "GoogleNews-vectors-negative300.bin.gz"
WORD2VEC_EMBED_SIZE = 300

LSTM_SEQLEN = 196  # from original model
NUM_CHOICES = 4  # number of choices for multiple choice

#### Load up the vectorizer
qapairs = kaggle.get_question_answer_pairs(
    os.path.join(DATA_DIR, QA_TRAIN_FILE))
tqapairs = kaggle.get_question_answer_pairs(
    os.path.join(DATA_DIR, QA_TEST_FILE), is_test=True)

word2idx = kaggle.build_vocab([], qapairs, tqapairs)
vocab_size = len(word2idx) + 1  # include mask character 0

#### Load up the model
# Keras expects the architecture as a JSON string, so read in text mode;
# also avoid shadowing the stdlib json module.
with open(os.path.join(MODEL_DIR, MODEL_ARCH), "r") as fjson:
    model_json = fjson.read()
model = model_from_json(model_json)
model.load_weights(os.path.join(MODEL_DIR, MODEL_WEIGHTS))

#### read in the data ####
#### correct_answer = "B"
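# The snippet breaks off here. A minimal sketch of how the loaded model might
# score the NUM_CHOICES candidate answers per question; the vectorize_qapairs
# call on the test pairs and the A-D labeling are assumptions for
# illustration, not code from the original file.
Xq, Xa, Y = kaggle.vectorize_qapairs(tqapairs, word2idx, LSTM_SEQLEN)
probs = model.predict([Xq, Xa])[:, 1]  # P(answer is correct) for each pair
for i in range(0, len(probs), NUM_CHOICES):
    best = int(probs[i:i + NUM_CHOICES].argmax())
    print("predicted answer: %s" % chr(ord("A") + best))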
Example #3
File: qa-lstm.py Project: onezqz/qa
import kaggle

from sklearn.model_selection import train_test_split

MODEL_DIR = "data/qa-lstm.json"
WORD2VEC_BIN = "data/corpusWord2Vec.bin"
WORD2VEC_EMBED_SIZE = 300
filepath = 'data/model_lstm.h5'
QA_TRAIN_FILE = "data/train_data2.txt"

QA_EMBED_SIZE = 64
BATCH_SIZE = 32
NBR_EPOCHS = 1

## extract data

print("Loading and formatting data...")
qapairs = kaggle.get_question_answer_pairs(QA_TRAIN_FILE)

question_maxlen = max([len(qapair[0]) for qapair in qapairs])
answer_maxlen = max([len(qapair[1]) for qapair in qapairs])
seq_maxlen = max([question_maxlen, answer_maxlen])

word2idx = kaggle.build_vocab([], qapairs, [])
vocab_size = len(word2idx) + 1  # include mask character 0
print(vocab_size)
Xq, Xa, Y = kaggle.vectorize_qapairs(qapairs, word2idx, seq_maxlen)  ## pad questions and answers to the same length

Xqtrain, Xqtest, Xatrain, Xatest, Ytrain, Ytest = \
    train_test_split(Xq, Xa, Y, test_size=0.3, random_state=42)
print(Xqtrain.shape, Xqtest.shape, Xatrain.shape, Xatest.shape, Ytrain.shape,
      Ytest.shape)
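
# The file ends before the model is defined and trained. Below is a minimal
# sketch, not the project's actual architecture, of how the remaining
# constants might be used: two LSTM encoders over a shared embedding, a
# two-class softmax (assuming Y is the correct/incorrect one-hot produced by
# vectorize_qapairs), and a ModelCheckpoint writing to filepath.
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Embedding, Input, LSTM, concatenate
from keras.models import Model

qin = Input(shape=(seq_maxlen,))
ain = Input(shape=(seq_maxlen,))
embed = Embedding(vocab_size, WORD2VEC_EMBED_SIZE, mask_zero=True)
qenc = LSTM(QA_EMBED_SIZE)(embed(qin))  # question encoder
aenc = LSTM(QA_EMBED_SIZE)(embed(ain))  # answer encoder
pred = Dense(2, activation="softmax")(concatenate([qenc, aenc]))

model = Model(inputs=[qin, ain], outputs=pred)
model.compile(optimizer="adam", loss="categorical_crossentropy",
              metrics=["accuracy"])

# Checkpoint the best weights (by validation loss) to filepath.
checkpoint = ModelCheckpoint(filepath, save_best_only=True)
model.fit([Xqtrain, Xatrain], Ytrain,
          batch_size=BATCH_SIZE, epochs=NBR_EPOCHS,
          validation_data=([Xqtest, Xatest], Ytest),
          callbacks=[checkpoint])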