# ---------------------------------------------------------------------------
# QA-LSTM training-data preparation: load question/answer pairs, build the
# vocabulary, vectorize to fixed-length index sequences, and split 70/30 into
# train/test sets.
# ---------------------------------------------------------------------------
DATA_DIR = "../data/comp_data"
MODEL_DIR = "../data/models"
WORD2VEC_BIN = "GoogleNews-vectors-negative300.bin.gz"
WORD2VEC_EMBED_SIZE = 300
QA_TRAIN_FILE = "8thGr-NDMC-Train.csv"
STORY_FILE = "studystack_qa_cleaner_no_qm.txt"
QA_EMBED_SIZE = 64
BATCH_SIZE = 32
NBR_EPOCHS = 20

## extract data
print("Loading and formatting data...")
qapairs = kaggle.get_question_answer_pairs(
    os.path.join(DATA_DIR, QA_TRAIN_FILE))

# The longest question/answer (in tokens) determines the shared padding
# length used for both inputs.  Generator expressions avoid building
# throwaway lists just to take a max.
question_maxlen = max(len(qapair[0]) for qapair in qapairs)
answer_maxlen = max(len(qapair[1]) for qapair in qapairs)
seq_maxlen = max(question_maxlen, answer_maxlen)

word2idx = kaggle.build_vocab([], qapairs, [])
vocab_size = len(word2idx) + 1  # include mask character 0

Xq, Xa, Y = kaggle.vectorize_qapairs(qapairs, word2idx, seq_maxlen)
Xqtrain, Xqtest, Xatrain, Xatest, Ytrain, Ytest = \
    train_test_split(Xq, Xa, Y, test_size=0.3, random_state=42)
print(Xqtrain.shape, Xqtest.shape, Xatrain.shape, Xatest.shape,
      Ytrain.shape, Ytest.shape)

# get embeddings from word2vec
# see https://github.com/fchollet/keras/issues/853
# ---------------------------------------------------------------------------
# Load a trained QA-LSTM attention model (architecture JSON + weights) and
# rebuild the vocabulary so word indices line up with the saved embedding.
# ---------------------------------------------------------------------------
#MODEL_WEIGHTS = "qa-lstm-model-best.hdf5"
MODEL_ARCH = "qa-lstm-fem-attn.json"
MODEL_WEIGHTS = "qa-lstm-fem-attn-final.h5"
DATA_DIR = "../data/comp_data"
QA_TRAIN_FILE = "8thGr-NDMC-Train.csv"
QA_TEST_FILE = "8thGr-NDMC-Test.csv"
WORD2VEC_BIN = "GoogleNews-vectors-negative300.bin.gz"
WORD2VEC_EMBED_SIZE = 300
LSTM_SEQLEN = 196  # from original model
NUM_CHOICES = 4    # number of choices for multiple choice

#### Load up the vectorizer
# Vocabulary must be rebuilt from the same train+test pairs the model was
# trained on so that word->index assignments match the saved weights.
qapairs = kaggle.get_question_answer_pairs(
    os.path.join(DATA_DIR, QA_TRAIN_FILE))
tqapairs = kaggle.get_question_answer_pairs(os.path.join(
    DATA_DIR, QA_TEST_FILE), is_test=True)
word2idx = kaggle.build_vocab([], qapairs, tqapairs)
vocab_size = len(word2idx) + 1  # include mask character 0

#### Load up the model
# NOTE(review): MODEL_DIR is not defined in this script — confirm it is set
# earlier in the file before this point.
# Open in text mode: model_from_json() expects a JSON *string*, and "rb"
# would hand it bytes.  The local is named model_json rather than json to
# avoid shadowing the stdlib json module.
with open(os.path.join(MODEL_DIR, MODEL_ARCH), "r") as fjson:
    model_json = fjson.read()
model = model_from_json(model_json)
model.load_weights(os.path.join(MODEL_DIR, MODEL_WEIGHTS))

#### read in the data ####
#### correct_answer = "B"
# ---------------------------------------------------------------------------
# QA-LSTM training-data preparation (local word2vec corpus variant): load
# QA pairs, build vocabulary, vectorize, and split 70/30 into train/test.
# ---------------------------------------------------------------------------
import kaggle

# NOTE(review): despite the name, MODEL_DIR holds the path of the model
# architecture JSON *file*, not a directory — confirm before renaming, as
# other code may reference this constant.
MODEL_DIR = "data/qa-lstm.json"
WORD2VEC_BIN = "data/corpusWord2Vec.bin"
WORD2VEC_EMBED_SIZE = 300
filepath = 'data/model_lstm.h5'
QA_TRAIN_FILE = "data/train_data2.txt"
QA_EMBED_SIZE = 64
BATCH_SIZE = 1 if False else 32  # BATCH_SIZE = 32
NBR_EPOCHS = 1

## extract data
print("Loading and formatting data...")
qapairs = kaggle.get_question_answer_pairs(QA_TRAIN_FILE)

# Longest question/answer (in tokens) fixes the common padding length.
# Generator expressions avoid materializing intermediate lists.
question_maxlen = max(len(qapair[0]) for qapair in qapairs)
answer_maxlen = max(len(qapair[1]) for qapair in qapairs)
seq_maxlen = max(question_maxlen, answer_maxlen)

word2idx = kaggle.build_vocab([], qapairs, [])
vocab_size = len(word2idx) + 1  # include mask character 0
print(vocab_size)

# Pad/truncate every sequence to the same length (seq_maxlen).
Xq, Xa, Y = kaggle.vectorize_qapairs(qapairs, word2idx, seq_maxlen)
Xqtrain, Xqtest, Xatrain, Xatest, Ytrain, Ytest = \
    train_test_split(Xq, Xa, Y, test_size=0.3, random_state=42)
print(Xqtrain.shape, Xqtest.shape, Xatrain.shape, Xatest.shape,
      Ytrain.shape, Ytest.shape)