def binary_test(paths, weights_file=None):
    assert isinstance(paths, list)

    # Read test data.
    test_data = []
    for path in paths:
        test_data += read_data_as_json(path)
    if DEBUG:
        print_data_stats(test_data, "Binary accuracy")

    # Fit a tokenizer on all data. Each word gets assigned a number
    # between 1 and num_words.
    tokenizer = tokenizers.SpacyTokenizer()
    tokenizer.fit_on_texts(all_sentences(test_data))
    if DEBUG:
        print("Num words: {}".format(len(tokenizer.word_counts())))

    # Load char embeddings.
    ce_loader = CharEmbeddings(CHAR_EMBEDDINGS_PATH, CHAR_EMBEDDINGS_DIM)

    # Convert data into list of sequences of indices. Oversampling keeps
    # the two classes balanced so the reported accuracy is meaningful.
    (test_data, test_labels) = preprocess_data(test_data, tokenizer,
                                               ce_loader, shuffle=False,
                                               oversample=True)

    embeddings_matrix = build_embeddings_matrix(tokenizer)
    num_words = len(tokenizer.word_counts())

    model = define_model(num_words, embeddings_matrix, ce_loader,
                         "test", WORD_EMBEDDINGS_DIM)
    model.summary()

    if weights_file is None:
        weights_file = pick_best_model_from_dir()
        if DEBUG:
            print("Best model detected: {}".format(weights_file))
    model.load_weights(weights_file, by_name=True)

    num_tests = test_labels.shape[0]
    y = model.predict(test_data, batch_size=64)
    assert y.shape[0] == num_tests

    # Count predictions that match the expected labels.
    correct = 0
    total = 0
    for i in range(num_tests):
        expected = np.argmax(test_labels[i])
        predicted = np.argmax(y[i])
        if predicted == expected:
            correct += 1
        total += 1
    assert total == num_tests

    # Guard against division by zero on an empty test set.
    if total == 0:
        total = 1
    print("\nEvaluated on {} questions.".format(num_tests))
    print("Accuracy: {0:.3f}%".format(100.0 * correct / total))
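
# Usage sketch (added for illustration; the input paths and the explicit
# checkpoint file name are assumptions, not part of the original module):
def _example_binary_test():
    # Auto-select the best checkpoint from the models/ directory.
    binary_test(TEST_DATA_PATH)
    # Or pin an explicit checkpoint (hypothetical file name, following the
    # "model.{val_acc:.3f}-{epoch:03d}.hdf5" pattern used by train()).
    binary_test(TEST_DATA_PATH, weights_file="models/model.0.850-042.hdf5")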
def predict_batch(request, weights_file, verbose=False):
    assert isinstance(request, list)
    assert isinstance(weights_file, str)
    assert isinstance(verbose, bool)

    num_questions = len(request)
    if verbose:
        print_data_stats(request, "QA Predict batch request")

    # Fit a tokenizer on all data. Each word gets assigned a number
    # between 1 and num_words.
    tokenizer = tokenizers.SpacyTokenizer()
    tokenizer.fit_on_texts(all_sentences(request))
    if verbose:
        print("Num words: {}".format(len(tokenizer.word_counts())))

    # Load char embeddings.
    ce_loader = CharEmbeddings(CHAR_EMBEDDINGS_PATH, CHAR_EMBEDDINGS_DIM)

    # Convert data into list of sequences of indices.
    (test_data, _) = preprocess_data(request, tokenizer, ce_loader,
                                     shuffle=False, oversample=False)

    embeddings_matrix = build_embeddings_matrix(tokenizer)
    num_words = len(tokenizer.word_counts())

    model = define_model(num_words, embeddings_matrix, ce_loader,
                         "test", WORD_EMBEDDINGS_DIM)
    model.load_weights(weights_file, by_name=True)
    # model.summary()

    # Each question expands into exactly four (question, answer) pairs.
    num_tests = num_questions * 4
    y = model.predict(test_data, batch_size=64)
    assert y.shape[0] == num_tests
    assert num_tests % 4 == 0

    # Collect the positive-class probability of each of the four candidate
    # answers, one row per question.
    res = np.zeros((num_questions, 4))
    idx = 0
    for i in range(0, num_tests, 4):
        # Sanity check: all four rows of a question share the same inputs.
        for j in range(1, 4):
            assert np.allclose(test_data["q_input"][i],
                               test_data["q_input"][i + j])
            assert np.allclose(test_data["q_char_input"][i],
                               test_data["q_char_input"][i + j])
        assert i % 4 == 0
        res[idx] = y[i:i + 4, 1]
        assert idx == (i >> 2)
        idx += 1
    assert idx == num_questions

    print("\n[QA] Batch predicted {} questions.".format(num_questions))
    assert res.shape == (num_questions, 4)
    return res
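
# Usage sketch (added for illustration): score a few questions straight from
# a dataset file and compare the model's top-ranked answer against the label.
# Only fields this module actually reads ("question", "answers", "isCorrect")
# are assumed; the slice size and the use of pick_best_model_from_dir() are
# arbitrary choices.
def _example_predict_batch(dataset_path):
    request = read_data_as_json(dataset_path)[:8]  # small demo batch
    scores = predict_batch(request, pick_best_model_from_dir())
    for entry, row in zip(request, scores):
        predicted = int(np.argmax(row))
        expected = [a["isCorrect"] for a in entry["answers"]].index(True)
        print("{} -> predicted answer {}, expected {}".format(
            entry["question"], predicted, expected))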
def _get_unknown_embeddings_list(dataset_paths):
    assert isinstance(dataset_paths, list)

    data = []
    for path in dataset_paths:
        data += read_data_as_json(path)
    print_data_stats(data, "Combined")

    tokenizer = tokenizers.SpacyTokenizer()
    tokenizer.fit_on_texts(all_sentences(data))
    print("\nNum words: {}".format(len(tokenizer.word_counts())), flush=True)
    word_index = tokenizer.word_index()

    embedder = WordEmbeddings(WORD_EMBEDDINGS_PATH, WORD_EMBEDDINGS_DIM, True)
    dim = embedder.get_embedding_len()

    emb_found = 0
    num_processed = 0
    with open("words_not_found.txt", "w") as g:
        for word in word_index:
            if num_processed % 100 == 1:
                print("Processed {} out of {} words.".format(
                    num_processed, len(word_index)))
            num_processed += 1

            # Try the word as-is, then lowercased, then spell-corrected.
            w_vector = embedder.get_vector(word)
            if w_vector is None:
                w_vector = embedder.get_vector(word.lower())
            if w_vector is None:
                corrected_word = spell(word)
                assert isinstance(corrected_word, str)
                w_vector = embedder.get_vector(corrected_word)

            if w_vector is not None:
                emb_found += 1
                assert w_vector.shape[0] == dim
            else:
                g.write(word + "\n")
                g.flush()

    num_words = len(word_index)
    assert num_words == num_processed
    # Guard against division by zero on an empty vocabulary.
    if num_words == 0:
        num_words = 1
    print("Found {0:.2f}% embeddings.".format(100.0 * emb_found / num_words))
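
# Usage sketch (added for illustration; assumes the *_DATA_PATH constants
# are plain lists, as their use in train() suggests): report embedding
# coverage over every split at once and dump misses to words_not_found.txt.
def _example_embedding_coverage():
    _get_unknown_embeddings_list(
        TRAIN_DATA_PATH + VALIDATE_DATA_PATH + TEST_DATA_PATH)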
def plot_attention_heatmap(paths, choice="smallest_context",
                           weights_file=None):
    assert isinstance(paths, list)
    assert choice in ["smallest_context", "random"]

    # Read test data.
    test_data = []
    for path in paths:
        test_data += read_data_as_json(path)

    # Pick one question: the one with the shortest context, or a random one.
    data = None
    if choice == "smallest_context":
        min_len = None
        for entry in test_data:
            assert len(entry["answers"]) == 4
            context = entry["answers"][0]["context"]
            for i in range(4):
                assert context == entry["answers"][i]["context"]
            if min_len is None or len(context) < min_len:
                min_len = len(context)
                data = entry
    elif choice == "random":
        import random
        data = random.choice(test_data)
    assert data is not None

    context = data["answers"][0]["context"]
    question_text = data["question"]
    test_data = [data]

    # Fit a tokenizer on all data. Each word gets assigned a number
    # between 1 and num_words.
    tokenizer = tokenizers.SpacyTokenizer()
    tokenizer.fit_on_texts(all_sentences(test_data))
    if DEBUG:
        print("Num words: {}".format(len(tokenizer.word_counts())))

    # Load char embeddings.
    ce_loader = CharEmbeddings(CHAR_EMBEDDINGS_PATH, CHAR_EMBEDDINGS_DIM)

    (test_data, labels) = preprocess_data(test_data, tokenizer, ce_loader,
                                          shuffle=False, oversample=False)

    # Find the index of the correct answer among the four candidates.
    idx = None
    for i in range(4):
        if labels[i][1] >= 0.9:
            idx = i
            break
    assert data["answers"][idx]["isCorrect"] is True

    q = np.reshape(test_data["q_input"][idx], (1, -1))
    a = np.reshape(test_data["a_input"][idx], (1, -1))
    c = np.reshape(test_data["c_input"][idx], (1, -1))

    embeddings_matrix = build_embeddings_matrix(tokenizer)
    num_words = len(tokenizer.word_counts())

    model = attention_heatmap(num_words, embeddings_matrix,
                              "attention_heatmap", WORD_EMBEDDINGS_DIM)
    model.summary()

    if weights_file is None:
        weights_file = pick_best_model_from_dir()
        if DEBUG:
            print("Best model detected: {}".format(weights_file))
    model.load_weights(weights_file, by_name=True)

    # Recover the context words from their token indices.
    inv_index = {}
    index = tokenizer.word_index()
    for word in index:
        inv_index[index[word]] = word
    seq = tokenizer.texts_to_sequences([context])[0]
    context = []
    for token_id in seq:  # renamed from idx to avoid shadowing the answer index
        context.append(inv_index[token_id])

    y = model.predict({'q_input': q, 'a_input': a, 'c_input': c})
    assert y.shape[0] == 1
    y = y[0]

    # Keep the 20 highest-scoring context words.
    scores = []
    for i in range(len(context)):
        scores.append((context[i], y[i][0]))
    scores.sort(key=lambda x: x[1], reverse=True)
    scores = scores[:20]

    print("\nQuestion: " + question_text + "\n")
    print(' '.join(context))

    x = [score[0] for score in scores]
    y = [score[1] for score in scores]
    # Imported locally so matplotlib is only needed when plotting.
    import matplotlib.pyplot as plt
    plt.figure()
    plt.bar(range(len(x)), y, align='center')
    plt.title("Attention scores")
    plt.xticks(range(len(x)), x, rotation=45, horizontalalignment='right')
    plt.tight_layout()
    plt.show()
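
# Usage sketch (added for illustration; TEST_DATA_PATH as input is an
# assumption): plot the heatmap for the shortest-context question, then
# for a randomly chosen one.
def _example_attention_heatmap():
    plot_attention_heatmap(TEST_DATA_PATH)
    plot_attention_heatmap(TEST_DATA_PATH, choice="random")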
def train():
    # Read train data.
    train_data = []
    for path in TRAIN_DATA_PATH:
        train_data += read_data_as_json(path)

    # Read validation data.
    val_data = []
    for path in VALIDATE_DATA_PATH:
        val_data += read_data_as_json(path)

    # Read test data.
    test_data = []
    for path in TEST_DATA_PATH:
        test_data += read_data_as_json(path)

    # Uncomment to train on a small subset while debugging.
    # train_data = train_data[0:250]
    # val_data = val_data[0:50]

    if DEBUG:
        print_data_stats(train_data, "Train")
        print_data_stats(val_data, "Validate")
        print_data_stats(test_data, "Test")

    # Fit a tokenizer on all data. Each word gets assigned a number
    # between 1 and num_words.
    tokenizer = tokenizers.SpacyTokenizer()
    tokenizer.fit_on_texts(
        all_sentences(train_data) +
        all_sentences(val_data) +
        all_sentences(test_data))
    if DEBUG:
        print("Num words: {}\n".format(len(tokenizer.word_counts())))

    # Load char embeddings.
    ce_loader = CharEmbeddings(CHAR_EMBEDDINGS_PATH, CHAR_EMBEDDINGS_DIM)

    # Convert data into list of sequences of indices.
    (train_data, train_labels) = preprocess_data(train_data, tokenizer,
                                                 ce_loader)
    if DEBUG:
        print("Train data preprocessing complete.", flush=True)
    (val_data, val_labels) = preprocess_data(val_data, tokenizer, ce_loader)
    if DEBUG:
        print("Val data preprocessing complete.", flush=True)
    (test_data, test_labels) = preprocess_data(test_data, tokenizer,
                                               ce_loader)
    if DEBUG:
        print("Test data preprocessing complete.\n", flush=True)

    embeddings_matrix = build_embeddings_matrix(tokenizer)
    num_words = len(tokenizer.word_counts())

    model = define_model(num_words, embeddings_matrix, ce_loader,
                         "train", WORD_EMBEDDINGS_DIM)
    model.summary()

    # Warm-start from the best checkpoint found so far.
    weights_file = pick_best_model_from_dir()
    print("Pretrain from {}".format(weights_file))
    model.load_weights(weights_file, by_name=True)

    # Checkpoint the model whenever validation accuracy improves.
    filepath = "models/" + "model.{val_acc:.3f}-{epoch:03d}.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=0,
                                 save_best_only=True, mode='max')

    model.fit(train_data, train_labels,
              batch_size=170,
              epochs=350,
              verbose=1,
              validation_data=(val_data, val_labels),
              callbacks=[checkpoint],
              shuffle=True)  # Shuffles training data before each epoch.

    score = model.evaluate(test_data, test_labels, verbose=0)
    if score:
        print('Test loss:', score[0])
        print('Test accuracy:', score[1])
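
# Entry-point sketch (an assumption; how the original module is invoked is
# not shown here):
if __name__ == "__main__":
    train()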