def predict(self, test_data, weights_file=None):
    assert isinstance(test_data, list)
    assert isinstance(weights_file, str) or weights_file is None

    # Extract ids.
    ids = []
    for entry in test_data:
        ids.append(entry["id"])

    # Augment data.
    test_data = self.augment_data(test_data)
    if DEBUG:
        print_data_stats(test_data, "Binary accuracy")

    tokenizer = tokenizers.SpacyTokenizer()
    tokenizer.fit_on_texts(all_sentences(test_data))
    if DEBUG:
        print("Num words: {}\n".format(len(tokenizer.word_counts())))

    test_data, _ = self.preprocess_data(test_data, tokenizer, "Predict",
                                        oversample=False)
    embeddings_matrix = Cerebro.build_embeddings_matrix(tokenizer)
    num_words = len(tokenizer.word_counts())
    assert embeddings_matrix.shape[0] == num_words + 1

    model = self.define_model(embeddings_matrix, scope="test")
    if weights_file is None:
        weights_file = pick_best_model_from_dir()
        if DEBUG:
            print("Best model detected: {}".format(weights_file))
    model.load_weights(weights_file, by_name=True)
    model.summary()

    # Each question contributes 4 consecutive (question, choice) rows.
    num_tests = len(ids) * 4
    y = model.predict(test_data)
    assert y.shape[0] == num_tests
    assert num_tests % 4 == 0

    total = 0
    correct_answers = []
    for i in range(0, num_tests, 4):
        # Column 1 holds the probability that the choice is correct; the
        # predicted answer is the choice with the highest such probability.
        predicted = y[i:i + 4, 1]
        predicted = np.argmax(predicted)
        correct_answers.append(predicted)
        total += 1
    assert total == len(correct_answers)
    assert len(ids) == len(correct_answers)
    assert total == num_tests // 4

    rez = list(zip(ids, correct_answers))
    rez = sorted(rez, key=lambda x: x[0])
    return rez
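# A minimal usage sketch for predict(). How `Cerebro` is constructed and how
# the question JSON is loaded are assumptions here (`load_questions` is a
# hypothetical helper, not part of this module):
#
#   cerebro = Cerebro()
#   test_data = load_questions("questions.json")  # hypothetical loader
#   for question_id, answer_index in cerebro.predict(test_data):
#       print("{},{}".format(question_id, chr(ord('A') + answer_index)))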
def test_4way(self, test_data, weights_file=None):
    assert isinstance(test_data, list)
    assert isinstance(weights_file, str) or weights_file is None

    test_data = self.augment_data(test_data)
    if DEBUG:
        print_data_stats(test_data, "Binary accuracy")

    tokenizer = tokenizers.SpacyTokenizer()
    tokenizer.fit_on_texts(all_sentences(test_data))
    if DEBUG:
        print("Num words: {}\n".format(len(tokenizer.word_counts())))

    test_data, test_labels = self.preprocess_data(test_data, tokenizer,
                                                  "Binary acc",
                                                  oversample=False)
    embeddings_matrix = Cerebro.build_embeddings_matrix(tokenizer)
    num_words = len(tokenizer.word_counts())
    assert embeddings_matrix.shape[0] == num_words + 1

    model = self.define_model(embeddings_matrix, scope="test")
    if weights_file is None:
        weights_file = pick_best_model_from_dir()
        if DEBUG:
            print("Best model detected: {}".format(weights_file))
    model.load_weights(weights_file, by_name=True)
    model.summary()

    num_tests = test_labels.shape[0]
    y = model.predict(test_data)
    assert y.shape[0] == num_tests
    assert num_tests % 4 == 0

    # Score each question by comparing the argmax over its 4 choices with
    # the argmax of the one-hot labels.
    correct = 0
    total = 0
    for i in range(0, num_tests, 4):
        expected = test_labels[i:i + 4, 1]
        # Exactly one of the four choices should be labeled correct.
        assert np.allclose(np.sum(expected), 1.0)
        expected = np.argmax(expected)

        predicted = y[i:i + 4, 1]
        predicted = np.argmax(predicted)

        if predicted == expected:
            correct += 1
        total += 1
    assert total == num_tests // 4

    # Guard against division by zero on an empty test set.
    if total == 0:
        total = 1
    print("\nEvaluated on {} questions.".format(total))
    print("Accuracy: {0:.3f}%".format(100.0 * correct / total))
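# To illustrate the 4-way scoring used above: the model emits one softmax row
# [P(wrong), P(correct)] per (question, choice) pair, and every block of four
# consecutive rows belongs to one question. A self-contained toy check
# (sketch, no model involved):
#
#   import numpy as np
#   y = np.array([[0.9, 0.1],   # choice A
#                 [0.2, 0.8],   # choice B  <- highest P(correct)
#                 [0.7, 0.3],   # choice C
#                 [0.6, 0.4]])  # choice D
#   assert np.argmax(y[0:4, 1]) == 1  # the question is answered with choice B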
def print_diff(self, data, weights_file=None):
    assert isinstance(data, list)
    assert isinstance(weights_file, str) or weights_file is None

    data = self.augment_data(data)
    if DEBUG:
        print_data_stats(data, "Print Diff")

    tokenizer = tokenizers.SpacyTokenizer()
    tokenizer.fit_on_texts(all_sentences(data))
    if DEBUG:
        print("Num words: {}\n".format(len(tokenizer.word_counts())))

    test_data, test_labels = self.preprocess_data(data, tokenizer,
                                                  "Print Diff",
                                                  oversample=False)
    embeddings_matrix = Cerebro.build_embeddings_matrix(tokenizer)
    num_words = len(tokenizer.word_counts())
    assert embeddings_matrix.shape[0] == num_words + 1

    model = self.define_model(embeddings_matrix, scope="test")
    if weights_file is None:
        weights_file = pick_best_model_from_dir()
        if DEBUG:
            print("Best model detected: {}".format(weights_file))
    model.load_weights(weights_file, by_name=True)
    model.summary()

    num_tests = test_labels.shape[0]
    y = model.predict(test_data)
    assert y.shape[0] == num_tests
    assert num_tests % 4 == 0

    for i in range(0, num_tests, 4):
        expected = test_labels[i:i + 4, 1]
        assert np.allclose(np.sum(expected), 1.0)
        expected = np.argmax(expected)

        predicted = y[i:i + 4, 1]
        predicted = np.argmax(predicted)

        if predicted == expected:
            # The model got this question right. Print it if the tf-idf
            # baseline would have picked a different (wrong) choice.
            entry = data[i // 4]
            question_text = entry["question"]
            tf_idf_scores = [x["tfIdfScore"] for x in entry["answers"]]
            assert len(tf_idf_scores) == 4
            assert abs(sum(tf_idf_scores) - 1.0) <= 0.001
            if np.argmax(tf_idf_scores) != predicted:
                print(question_text)
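# What print_diff() surfaces: questions the network answers correctly while
# the tf-idf baseline ranks a different choice first. A toy illustration of
# the disagreement test (sketch; values are made up):
#
#   tf_idf_scores = [0.4, 0.3, 0.2, 0.1]   # tf-idf favors choice 0
#   predicted = 2                          # network picked choice 2 (correct)
#   np.argmax(tf_idf_scores) != predicted  # True -> the question is printed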
def predict_batch(self, test_data, weights_file=None):
    assert isinstance(test_data, list)
    assert isinstance(weights_file, str) or weights_file is None

    # Extract ids.
    ids = []
    for entry in test_data:
        ids.append(entry["id"])

    # Augment data.
    test_data = self.augment_data(test_data)

    tokenizer = tokenizers.SpacyTokenizer()
    tokenizer.fit_on_texts(all_sentences(test_data))

    test_data, _ = self.preprocess_data(test_data, tokenizer, "Predict",
                                        oversample=False)
    embeddings_matrix = Cerebro.build_embeddings_matrix(tokenizer)
    num_words = len(tokenizer.word_counts())
    assert embeddings_matrix.shape[0] == num_words + 1

    model = self.define_model(embeddings_matrix, scope="test")
    if weights_file is None:
        weights_file = pick_best_model_from_dir()
        if DEBUG:
            print("Best model detected: {}".format(weights_file))
    model.load_weights(weights_file, by_name=True)
    model.summary()

    num_tests = len(ids) * 4
    y = model.predict(test_data)
    assert y.shape[0] == num_tests
    assert num_tests % 4 == 0

    rez = []
    for i in range(0, num_tests, 4):
        predicted = y[i:i + 4, 1].tolist()
        # Renormalize the per-choice "correct" probabilities with a
        # sharpened softmax (temperature 1/2) so each question's four
        # scores sum to 1.
        scores = [np.exp(2.0 * x) for x in predicted]
        scores = [1.0 * x / sum(scores) for x in scores]
        assert len(scores) == 4
        assert np.allclose(sum(scores), 1.0)
        rez = rez + scores

    if SHOW_PER_SYSTEM_STATS:
        show_per_system_stats(test_data)

    assert isinstance(rez, list)
    return rez
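# The score normalization in predict_batch() is a softmax with temperature
# 1/2 over the per-choice "correct" probabilities. An equivalent vectorized
# form (sketch; `logits` stands for y[i:i + 4, 1]):
#
#   import numpy as np
#   def sharpened_softmax(logits, temperature=0.5):
#       z = np.exp(np.asarray(logits, dtype=np.float64) / temperature)
#       return z / z.sum()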
def output_cvs_predictions(self, test_data, weights_file=None):
    assert isinstance(test_data, list)
    assert isinstance(weights_file, str) or weights_file is None

    # Extract ids.
    ids = []
    for entry in test_data:
        ids.append(entry["id"])

    test_data = self.augment_data(test_data)
    if DEBUG:
        print_data_stats(test_data, "Binary accuracy")

    tokenizer = tokenizers.SpacyTokenizer()
    tokenizer.fit_on_texts(all_sentences(test_data))
    if DEBUG:
        print("Num words: {}\n".format(len(tokenizer.word_counts())))

    test_data, test_labels = self.preprocess_data(test_data, tokenizer,
                                                  "Binary acc",
                                                  oversample=False)
    embeddings_matrix = Cerebro.build_embeddings_matrix(tokenizer)
    num_words = len(tokenizer.word_counts())
    assert embeddings_matrix.shape[0] == num_words + 1

    model = self.define_model(embeddings_matrix, scope="test")
    if weights_file is None:
        weights_file = pick_best_model_from_dir()
        if DEBUG:
            print("Best model detected: {}".format(weights_file))
    model.load_weights(weights_file, by_name=True)
    model.summary()

    num_tests = test_labels.shape[0]
    y = model.predict(test_data)
    assert y.shape[0] == num_tests
    assert num_tests % 4 == 0
    assert num_tests == 4 * len(ids)

    rez = {}
    for i in range(0, num_tests, 4):
        predicted = y[i:i + 4, 1]
        predicted = np.argmax(predicted)
        rez[ids[i // 4]] = predicted

    # Some questions in the ARC corpus expect 1,2,3,4 instead of A,B,C,D.
    # Look for their ids and make sure we print in the desired format.
    # NYSEDREGENTS_* want 1,2,3,4.
    want_digit = {
        "NYSEDREGENTS_2015_8_28", "NYSEDREGENTS_2015_8_21", "NYSEDREGENTS_2010_8_2",
        "NYSEDREGENTS_2010_8_13", "NYSEDREGENTS_2010_8_14", "NYSEDREGENTS_2015_8_24",
        "NYSEDREGENTS_2013_8_12", "NYSEDREGENTS_2008_8_15", "NYSEDREGENTS_2012_8_5",
        "NYSEDREGENTS_2013_8_9", "NYSEDREGENTS_2012_8_9", "NYSEDREGENTS_2015_8_33",
        "NYSEDREGENTS_2010_8_12", "NYSEDREGENTS_2008_8_10", "NYSEDREGENTS_2015_8_2",
        "NYSEDREGENTS_2012_8_26", "NYSEDREGENTS_2015_8_20", "NYSEDREGENTS_2013_8_27",
        "NYSEDREGENTS_2013_8_36", "NYSEDREGENTS_2012_8_6", "NYSEDREGENTS_2010_8_34",
        "NYSEDREGENTS_2012_8_27", "NYSEDREGENTS_2015_8_31", "NYSEDREGENTS_2010_8_9",
        "NYSEDREGENTS_2015_8_45", "NYSEDREGENTS_2010_8_28", "NYSEDREGENTS_2008_8_24",
        "NYSEDREGENTS_2012_8_3", "NYSEDREGENTS_2010_8_30", "NYSEDREGENTS_2010_8_15",
        "NYSEDREGENTS_2015_8_19", "NYSEDREGENTS_2010_8_7", "NYSEDREGENTS_2013_8_16",
        "NYSEDREGENTS_2013_8_43", "NYSEDREGENTS_2013_8_23", "NYSEDREGENTS_2013_8_13",
        "NYSEDREGENTS_2013_8_8", "NYSEDREGENTS_2015_8_25", "NYSEDREGENTS_2008_8_33",
        "NYSEDREGENTS_2010_8_8", "NYSEDREGENTS_2008_8_18", "NYSEDREGENTS_2015_8_1",
        "NYSEDREGENTS_2008_8_26", "NYSEDREGENTS_2015_8_34", "NYSEDREGENTS_2010_8_6",
        "NYSEDREGENTS_2013_8_19", "NYSEDREGENTS_2013_8_7", "NYSEDREGENTS_2010_8_31",
        "NYSEDREGENTS_2013_8_40", "NYSEDREGENTS_2013_8_11", "NYSEDREGENTS_2015_8_8",
        "NYSEDREGENTS_2013_8_35", "NYSEDREGENTS_2013_8_21", "NYSEDREGENTS_2008_8_37",
        "NYSEDREGENTS_2015_8_30", "NYSEDREGENTS_2015_8_32", "NYSEDREGENTS_2008_8_2",
        "NYSEDREGENTS_2008_8_12", "NYSEDREGENTS_2015_8_6", "NYSEDREGENTS_2013_8_22",
        "NYSEDREGENTS_2012_8_31", "NYSEDREGENTS_2012_8_30", "NYSEDREGENTS_2012_8_15",
        "NYSEDREGENTS_2012_8_13", "NYSEDREGENTS_2008_8_16", "NYSEDREGENTS_2013_8_14",
        "NYSEDREGENTS_2010_8_27", "NYSEDREGENTS_2013_8_37", "NYSEDREGENTS_2013_8_5",
        "NYSEDREGENTS_2013_8_41", "NYSEDREGENTS_2008_8_28", "NYSEDREGENTS_2015_8_5",
        "NYSEDREGENTS_2013_8_6", "NYSEDREGENTS_2015_8_16", "NYSEDREGENTS_2012_8_18",
        "NYSEDREGENTS_2012_8_17", "NYSEDREGENTS_2015_8_26", "NYSEDREGENTS_2012_8_11",
        "NYSEDREGENTS_2008_8_14", "NYSEDREGENTS_2012_8_43", "NYSEDREGENTS_2015_8_35",
        "NYSEDREGENTS_2012_8_32", "NYSEDREGENTS_2010_8_18", "NYSEDREGENTS_2010_8_41",
        "NYSEDREGENTS_2012_8_16", "NYSEDREGENTS_2008_8_25", "NYSEDREGENTS_2012_8_40",
        "NYSEDREGENTS_2013_8_26", "NYSEDREGENTS_2008_8_4", "NYSEDREGENTS_2010_8_32",
        "NYSEDREGENTS_2008_8_7", "NYSEDREGENTS_2012_8_12", "NYSEDREGENTS_2015_8_22",
        "NYSEDREGENTS_2012_8_14", "NYSEDREGENTS_2008_8_29", "NYSEDREGENTS_2010_8_17",
        "NYSEDREGENTS_2010_8_39"
    }

    rez = list(rez.items())
    # rez.sort(key=lambda x: x[0])
    with open("predict.csv", "w") as g:
        for x, y in rez:
            assert y in [0, 1, 2, 3]
            if x in want_digit:
                g.write("{},{}\n".format(x, y + 1))
            else:
                g.write("{},{}\n".format(x, chr(ord('A') + y)))
        g.flush()
def train(self, train_data, val_data, test_data):
    assert isinstance(train_data, list)
    assert isinstance(val_data, list)
    assert isinstance(test_data, list)

    # train_data = train_data[0:50]
    # val_data = val_data[0:5]
    # test_data = test_data[0:5]

    # Augment all splits together, then slice them back apart (augmentation
    # preserves the number of entries, so the split boundaries still hold).
    all_data = self.augment_data(train_data + val_data + test_data)
    train_data = all_data[0:len(train_data)]
    val_data = all_data[len(train_data):len(train_data) + len(val_data)]
    test_data = all_data[len(train_data) + len(val_data):]
    assert len(train_data + val_data + test_data) == len(all_data)

    if DEBUG:
        print_data_stats(train_data, "Train")
        print_data_stats(val_data, "Val")
        print_data_stats(test_data, "Test")

    # Fit a tokenizer on all data. Each word gets assigned a number
    # between 1 and num_words.
    tokenizer = tokenizers.SpacyTokenizer()
    tokenizer.fit_on_texts(
        all_sentences(train_data) +
        all_sentences(val_data) +
        all_sentences(test_data))
    if DEBUG:
        print("Num words: {}\n".format(len(tokenizer.word_counts())))

    train_data, train_labels = self.preprocess_data(train_data, tokenizer,
                                                    "train", oversample=True)
    val_data, val_labels = self.preprocess_data(val_data, tokenizer,
                                                "val", oversample=True)
    test_data, test_labels = self.preprocess_data(test_data, tokenizer,
                                                  "test", oversample=True)

    embeddings_matrix = Cerebro.build_embeddings_matrix(tokenizer)
    num_words = len(tokenizer.word_counts())
    assert embeddings_matrix.shape[0] == num_words + 1

    model = self.define_model(embeddings_matrix, scope="train")
    model.summary()

    # Checkpoint the weights whenever validation accuracy improves.
    filepath = "models/" + "model.{val_acc:.3f}-{epoch:03d}.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=0,
                                 mode='max', save_best_only=True,
                                 save_weights_only=True)

    # The batch size is sampled at random for each training run.
    model.fit(train_data, train_labels,
              batch_size=random.randint(50, 1000),
              epochs=300,
              verbose=2,
              validation_data=(val_data, val_labels),
              callbacks=[checkpoint],
              shuffle=True)

    score = model.evaluate(test_data, test_labels, verbose=0)
    if score:
        print('Test loss:', score[0])
        print('Test accuracy:', score[1])
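# End-to-end usage sketch (how the three splits are loaded is an assumption;
# `load_splits` is a hypothetical helper):
#
#   cerebro = Cerebro()
#   train_split, val_split, test_split = load_splits()
#   cerebro.train(train_split, val_split, test_split)  # checkpoints -> models/
#   cerebro.test_4way(test_split)  # evaluates with the best checkpoint found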