    def predict(self, test_data, weights_file=None):
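        """Predict an answer for every question in `test_data`.

        If `weights_file` is None, the best checkpoint found by
        pick_best_model_from_dir() is loaded. Returns a list of
        (question_id, predicted_answer_index) pairs sorted by question id.
        """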
        assert (isinstance(test_data, list))
        assert (isinstance(weights_file, str) or weights_file is None)

        # Extract ids.
        ids = []
        for entry in test_data:
            ids.append(entry["id"])

        # Augment data.
        test_data = self.augment_data(test_data)

        if DEBUG:
            print_data_stats(test_data, "Predict")

        tokenizer = tokenizers.SpacyTokenizer()
        tokenizer.fit_on_texts(all_sentences(test_data))
        if DEBUG:
            print("Num words: {}\n".format(len(tokenizer.word_counts())))

        test_data, _ = self.preprocess_data(test_data,
                                            tokenizer,
                                            "Predict",
                                            oversample=False)

        embeddings_matrix = Cerebro.build_embeddings_matrix(tokenizer)
        num_words = len(tokenizer.word_counts())
        assert (embeddings_matrix.shape[0] == num_words + 1)

        model = self.define_model(embeddings_matrix, scope="test")
        if weights_file is None:
            weights_file = pick_best_model_from_dir()
            if DEBUG:
                print("Best model detected: {}".format(weights_file))
        model.load_weights(weights_file, by_name=True)
        model.summary()

        num_tests = len(ids) * 4
        y = model.predict(test_data)
        assert (y.shape[0] == num_tests)
        assert (num_tests % 4 == 0)

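        # Each consecutive block of four rows in `y` holds the scores for one
        # question's answers; pick the answer with the highest score.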
        total = 0
        correct_answers = []
        for i in range(0, num_tests, 4):
            predicted = y[i:i + 4, 1]
            predicted = np.argmax(predicted)
            correct_answers.append(predicted)
            total += 1

        assert (total == len(correct_answers))
        assert (len(ids) == len(correct_answers))
        assert (total == num_tests // 4)

        rez = list(zip(ids, correct_answers))
        rez = sorted(rez, key=lambda x: x[0])
        return rez

    def test_4way(self, test_data, weights_file=None):
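        """Measure 4-way multiple-choice accuracy on `test_data`.

        Each consecutive block of four rows holds one question's answers; a
        question counts as correct when the argmax of the predicted scores
        matches the argmax of the one-hot labels.
        """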
        assert (isinstance(test_data, list))
        assert (isinstance(weights_file, str) or weights_file is None)

        test_data = self.augment_data(test_data)

        if DEBUG:
            print_data_stats(test_data, "Binary accuracy")

        tokenizer = tokenizers.SpacyTokenizer()
        tokenizer.fit_on_texts(all_sentences(test_data))
        if DEBUG:
            print("Num words: {}\n".format(len(tokenizer.word_counts())))

        test_data, test_labels = self.preprocess_data(test_data,
                                                      tokenizer,
                                                      "Binary acc",
                                                      oversample=False)

        embeddings_matrix = Cerebro.build_embeddings_matrix(tokenizer)
        num_words = len(tokenizer.word_counts())
        assert (embeddings_matrix.shape[0] == num_words + 1)

        model = self.define_model(embeddings_matrix, scope="test")
        if weights_file is None:
            weights_file = pick_best_model_from_dir()
            if DEBUG:
                print("Best model detected: {}".format(weights_file))
        model.load_weights(weights_file, by_name=True)
        model.summary()

        num_tests = test_labels.shape[0]
        y = model.predict(test_data)
        assert (y.shape[0] == num_tests)
        assert (num_tests % 4 == 0)

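        # Score question by question: each block of four rows pairs a one-hot
        # label vector with the model's scores for that question's answers.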
        correct = 0
        total = 0
        for i in range(0, num_tests, 4):
            expected = test_labels[i:i + 4, 1]
            assert (np.allclose(np.sum(expected), 1.0))
            expected = np.argmax(expected)

            predicted = y[i:i + 4, 1]
            predicted = np.argmax(predicted)

            if predicted == expected:
                correct += 1
            total += 1

        assert (total == num_tests // 4)
        print("\nEvaluated on {} questions.".format(total))
        # Guard against division by zero on an empty test set.
        print("Accuracy: {0:.3f}%".format(100.0 * correct / max(total, 1)))

    def print_diff(self, data, weights_file=None):
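        """Print the questions that the model answers correctly but that the
        tf-idf baseline (the per-answer `tfIdfScore` field) gets wrong.
        """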
        assert (isinstance(data, list))
        assert (isinstance(weights_file, str) or weights_file is None)

        data = self.augment_data(data)

        if DEBUG:
            print_data_stats(data, "Print Diff")

        tokenizer = tokenizers.SpacyTokenizer()
        tokenizer.fit_on_texts(all_sentences(data))
        if DEBUG:
            print("Num words: {}\n".format(len(tokenizer.word_counts())))

        test_data, test_labels = self.preprocess_data(data,
                                                      tokenizer,
                                                      "Print Diff",
                                                      oversample=False)

        embeddings_matrix = Cerebro.build_embeddings_matrix(tokenizer)
        num_words = len(tokenizer.word_counts())
        assert (embeddings_matrix.shape[0] == num_words + 1)

        model = self.define_model(embeddings_matrix, scope="test")
        if weights_file is None:
            weights_file = pick_best_model_from_dir()
            if DEBUG:
                print("Best model detected: {}".format(weights_file))
        model.load_weights(weights_file, by_name=True)
        model.summary()

        num_tests = test_labels.shape[0]
        y = model.predict(test_data)
        assert (y.shape[0] == num_tests)
        assert (num_tests % 4 == 0)

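        # Walk the questions in blocks of four and compare the model's pick
        # against both the gold label and the tf-idf baseline.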
        for i in range(0, num_tests, 4):
            expected = test_labels[i:i + 4, 1]
            assert (np.allclose(np.sum(expected), 1.0))
            expected = np.argmax(expected)

            predicted = y[i:i + 4, 1]
            predicted = np.argmax(predicted)

            if predicted == expected:
                entry = data[i // 4]
                question_text = entry["question"]
                tf_idf_scores = [x["tfIdfScore"] for x in entry["answers"]]
                assert (len(tf_idf_scores) == 4)
                assert (abs(sum(tf_idf_scores) - 1.0) <= 0.001)

                if np.argmax(tf_idf_scores) != predicted:
                    print(question_text)

    def predict_batch(self, test_data, weights_file=None):
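        """Return a flat list of confidence scores, four per question.

        The raw network outputs for each question are renormalized with a
        sharpened softmax so that every block of four scores sums to 1.
        """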
        assert (isinstance(test_data, list))
        assert (isinstance(weights_file, str) or weights_file is None)

        # Extract ids.
        ids = []
        for entry in test_data:
            ids.append(entry["id"])

        # Augment data.
        test_data = self.augment_data(test_data)

        tokenizer = tokenizers.SpacyTokenizer()
        tokenizer.fit_on_texts(all_sentences(test_data))

        test_data, _ = self.preprocess_data(test_data,
                                            tokenizer,
                                            "Predict",
                                            oversample=False)

        embeddings_matrix = Cerebro.build_embeddings_matrix(tokenizer)
        num_words = len(tokenizer.word_counts())
        assert (embeddings_matrix.shape[0] == num_words + 1)

        model = self.define_model(embeddings_matrix, scope="test")
        if weights_file is None:
            weights_file = pick_best_model_from_dir()
            if DEBUG:
                print("Best model detected: {}".format(weights_file))
        model.load_weights(weights_file, by_name=True)
        model.summary()

        num_tests = len(ids) * 4
        y = model.predict(test_data)
        assert (y.shape[0] == num_tests)
        assert (num_tests % 4 == 0)

        rez = []
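        # Each block of four rows belongs to one question; renormalize its raw
        # scores with a sharpened softmax (temperature 1/2, i.e. exp(2 * x)).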
        for i in range(0, num_tests, 4):
            predicted = y[i:i + 4, 1].tolist()
            scores = [np.exp(2.0 * x) for x in predicted]
            norm = sum(scores)
            scores = [x / norm for x in scores]
            assert (len(scores) == 4)
            assert (np.allclose(sum(scores), 1.0))
            rez.extend(scores)

        if SHOW_PER_SYSTEM_STATS:
            show_per_system_stats(test_data)

        assert (isinstance(rez, list))
        return rez

    def output_csv_predictions(self, test_data, weights_file=None):
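        """Write one "<question_id>,<answer>" line per question to predict.csv.

        Answers are written as letters A-D, except for the question ids in
        `want_digit` below, which expect the digits 1-4 instead.
        """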
        assert (isinstance(test_data, list))
        assert (isinstance(weights_file, str) or weights_file is None)

        # Extract ids.
        ids = []
        for entry in test_data:
            ids.append(entry["id"])

        test_data = self.augment_data(test_data)

        if DEBUG:
            print_data_stats(test_data, "CSV predictions")

        tokenizer = tokenizers.SpacyTokenizer()
        tokenizer.fit_on_texts(all_sentences(test_data))
        if DEBUG:
            print("Num words: {}\n".format(len(tokenizer.word_counts())))

        test_data, test_labels = self.preprocess_data(test_data,
                                                      tokenizer,
                                                      "CSV predictions",
                                                      oversample=False)

        embeddings_matrix = Cerebro.build_embeddings_matrix(tokenizer)
        num_words = len(tokenizer.word_counts())
        assert (embeddings_matrix.shape[0] == num_words + 1)

        model = self.define_model(embeddings_matrix, scope="test")
        if weights_file is None:
            weights_file = pick_best_model_from_dir()
            if DEBUG:
                print("Best model detected: {}".format(weights_file))
        model.load_weights(weights_file, by_name=True)
        model.summary()

        num_tests = test_labels.shape[0]
        y = model.predict(test_data)
        assert (y.shape[0] == num_tests)
        assert (num_tests % 4 == 0)
        assert (num_tests == 4 * len(ids))

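        # One block of four consecutive rows per question; map the argmax of
        # each block back to the corresponding question id.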
        rez = {}
        for i in range(0, num_tests, 4):
            predicted = y[i:i + 4, 1]
            predicted = int(np.argmax(predicted))
            rez[ids[i // 4]] = predicted

        # Some questions in the ARC corpus expect answers labeled 1,2,3,4
        # instead of A,B,C,D. Look for their ids and make sure we print in
        # the desired format (the NYSEDREGENTS_* ids below want 1,2,3,4).
        want_digit = {
            "NYSEDREGENTS_2015_8_28", "NYSEDREGENTS_2015_8_21",
            "NYSEDREGENTS_2010_8_2", "NYSEDREGENTS_2010_8_13",
            "NYSEDREGENTS_2010_8_14", "NYSEDREGENTS_2015_8_24",
            "NYSEDREGENTS_2013_8_12", "NYSEDREGENTS_2008_8_15",
            "NYSEDREGENTS_2012_8_5", "NYSEDREGENTS_2013_8_9",
            "NYSEDREGENTS_2012_8_9", "NYSEDREGENTS_2015_8_33",
            "NYSEDREGENTS_2010_8_12", "NYSEDREGENTS_2008_8_10",
            "NYSEDREGENTS_2015_8_2", "NYSEDREGENTS_2012_8_26",
            "NYSEDREGENTS_2015_8_20", "NYSEDREGENTS_2013_8_27",
            "NYSEDREGENTS_2013_8_36", "NYSEDREGENTS_2012_8_6",
            "NYSEDREGENTS_2010_8_34", "NYSEDREGENTS_2012_8_27",
            "NYSEDREGENTS_2015_8_31", "NYSEDREGENTS_2010_8_9",
            "NYSEDREGENTS_2015_8_45", "NYSEDREGENTS_2010_8_28",
            "NYSEDREGENTS_2008_8_24", "NYSEDREGENTS_2012_8_3",
            "NYSEDREGENTS_2010_8_30", "NYSEDREGENTS_2010_8_15",
            "NYSEDREGENTS_2015_8_19", "NYSEDREGENTS_2010_8_7",
            "NYSEDREGENTS_2013_8_16", "NYSEDREGENTS_2013_8_43",
            "NYSEDREGENTS_2013_8_23", "NYSEDREGENTS_2013_8_13",
            "NYSEDREGENTS_2013_8_8", "NYSEDREGENTS_2015_8_25",
            "NYSEDREGENTS_2008_8_33", "NYSEDREGENTS_2010_8_8",
            "NYSEDREGENTS_2008_8_18", "NYSEDREGENTS_2015_8_1",
            "NYSEDREGENTS_2008_8_26", "NYSEDREGENTS_2015_8_34",
            "NYSEDREGENTS_2010_8_6", "NYSEDREGENTS_2013_8_19",
            "NYSEDREGENTS_2013_8_7", "NYSEDREGENTS_2010_8_31",
            "NYSEDREGENTS_2013_8_40", "NYSEDREGENTS_2013_8_11",
            "NYSEDREGENTS_2015_8_8", "NYSEDREGENTS_2013_8_35",
            "NYSEDREGENTS_2013_8_21", "NYSEDREGENTS_2008_8_37",
            "NYSEDREGENTS_2015_8_30", "NYSEDREGENTS_2015_8_32",
            "NYSEDREGENTS_2008_8_2", "NYSEDREGENTS_2008_8_12",
            "NYSEDREGENTS_2015_8_6", "NYSEDREGENTS_2013_8_22",
            "NYSEDREGENTS_2012_8_31", "NYSEDREGENTS_2012_8_30",
            "NYSEDREGENTS_2012_8_15", "NYSEDREGENTS_2012_8_13",
            "NYSEDREGENTS_2008_8_16", "NYSEDREGENTS_2013_8_14",
            "NYSEDREGENTS_2010_8_27", "NYSEDREGENTS_2013_8_37",
            "NYSEDREGENTS_2013_8_5", "NYSEDREGENTS_2013_8_41",
            "NYSEDREGENTS_2008_8_28", "NYSEDREGENTS_2015_8_5",
            "NYSEDREGENTS_2013_8_6", "NYSEDREGENTS_2015_8_16",
            "NYSEDREGENTS_2012_8_18", "NYSEDREGENTS_2012_8_17",
            "NYSEDREGENTS_2015_8_26", "NYSEDREGENTS_2012_8_11",
            "NYSEDREGENTS_2008_8_14", "NYSEDREGENTS_2012_8_43",
            "NYSEDREGENTS_2015_8_35", "NYSEDREGENTS_2012_8_32",
            "NYSEDREGENTS_2010_8_18", "NYSEDREGENTS_2010_8_41",
            "NYSEDREGENTS_2012_8_16", "NYSEDREGENTS_2008_8_25",
            "NYSEDREGENTS_2012_8_40", "NYSEDREGENTS_2013_8_26",
            "NYSEDREGENTS_2008_8_4", "NYSEDREGENTS_2010_8_32",
            "NYSEDREGENTS_2008_8_7", "NYSEDREGENTS_2012_8_12",
            "NYSEDREGENTS_2015_8_22", "NYSEDREGENTS_2012_8_14",
            "NYSEDREGENTS_2008_8_29", "NYSEDREGENTS_2010_8_17",
            "NYSEDREGENTS_2010_8_39"
        }
        rez = list(rez.items())
        # rez.sort(key=lambda x: x[0])
        with open("predict.csv", "w") as g:
            # Distinct loop names avoid shadowing the model output `y` above.
            for question_id, answer in rez:
                assert (answer in [0, 1, 2, 3])
                if question_id in want_digit:
                    g.write("{},{}\n".format(question_id, answer + 1))
                else:
                    g.write("{},{}\n".format(question_id,
                                             chr(ord('A') + answer)))

    def train(self, train_data, val_data, test_data):
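        """Train the model, checkpointing the best weights seen so far by
        validation accuracy, then report loss and accuracy on the test set.
        """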
        assert (isinstance(train_data, list))
        assert (isinstance(val_data, list))
        assert (isinstance(test_data, list))

        # Uncomment to train on a small subset for quick debugging:
        # train_data = train_data[0:50]
        # val_data = val_data[0:5]
        # test_data = test_data[0:5]

        # Record the split sizes before rebinding the names, so the slices
        # below cannot drift if augmentation ever changes the entry count.
        num_train, num_val = len(train_data), len(val_data)
        all_data = self.augment_data(train_data + val_data + test_data)
        train_data = all_data[:num_train]
        val_data = all_data[num_train:num_train + num_val]
        test_data = all_data[num_train + num_val:]
        assert (len(train_data) + len(val_data) + len(test_data) == len(all_data))

        if DEBUG:
            print_data_stats(train_data, "Train")
            print_data_stats(val_data, "Val")
            print_data_stats(test_data, "Test")

        # Fit a tokenizer on all data. Each word gets assigned a number
        # between 1 and num_words.
        tokenizer = tokenizers.SpacyTokenizer()
        tokenizer.fit_on_texts(
            all_sentences(train_data) + all_sentences(val_data) +
            all_sentences(test_data))
        if DEBUG:
            print("Num words: {}\n".format(len(tokenizer.word_counts())))

        train_data, train_labels = self.preprocess_data(train_data,
                                                        tokenizer,
                                                        "train",
                                                        oversample=True)
        val_data, val_labels = self.preprocess_data(val_data,
                                                    tokenizer,
                                                    "val",
                                                    oversample=True)
        test_data, test_labels = self.preprocess_data(test_data,
                                                      tokenizer,
                                                      "test",
                                                      oversample=True)

        embeddings_matrix = Cerebro.build_embeddings_matrix(tokenizer)
        num_words = len(tokenizer.word_counts())
        assert (embeddings_matrix.shape[0] == num_words + 1)

        model = self.define_model(embeddings_matrix, scope="train")
        model.summary()

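        # Keep only the best weights seen so far (by validation accuracy),
        # presumably so pick_best_model_from_dir() can recover them later.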
        filepath = "models/model.{val_acc:.3f}-{epoch:03d}.hdf5"
        checkpoint = ModelCheckpoint(filepath,
                                     monitor='val_acc',
                                     verbose=0,
                                     mode='max',
                                     save_best_only=True,
                                     save_weights_only=True)
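        # Note: the batch size is drawn at random for every run, presumably as
        # a crude form of hyperparameter exploration across training runs.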
        model.fit(train_data,
                  train_labels,
                  batch_size=random.randint(50, 1000),
                  epochs=300,
                  verbose=2,
                  validation_data=(val_data, val_labels),
                  callbacks=[checkpoint],
                  shuffle=True)
        score = model.evaluate(test_data, test_labels, verbose=0)
        if score:
            print('Test loss:', score[0])
            print('Test accuracy:', score[1])