Example #1
0
    mlp_model = MLPLearningModel(name="default",
                                 hidden_layer_sizes=(80, ), max_iter=500)
    training_pdf = args.symbols_file
    labels_file = args.labels_file

    if args.reset and os.path.isdir(state_dir):
        shutil.rmtree(state_dir)

    if not os.path.isfile(training_pdf):
        raise IOError("Symbols file %s not found." % training_pdf)

    if not os.path.isfile(labels_file):
        raise IOError("Labels file %s not found." % labels_file)

    training_file = fe.TrainingFileSpec(
            training_pdf, fe.DEFAULT_GRID_SPEC,
            labels_file, dpi=96, white_threshold=fe.DEFAULT_WHITE_THRESHOLD)
    feature_matrix, feature_labels = \
        FeatureExtractor.extract_labelled_feature_vectors(
                training_file, rotations=np.zeros(1))

    if not os.path.isdir(state_dir):
        mlp_model.train(feature_matrix, feature_labels)
        mlp_model.dump(state_dir)
        print("Model trained. Program finished.")
    else:
        mlp_model.load(state_dir)
        mlp_model.improve(feature_matrix, feature_labels)
        mlp_model.dump(state_dir)
        print("Model improved. Program finished.")
    def run_benchmarks(training_data_dir: str, results_dir: str) -> None:
        """
        Runs benchmark tests and records observations.

        Any existing `results_dir` is deleted and an empty one is created
        before the inputs are checked. The labelled feature vectors are
        extracted with rotations of -5 to 4 degrees (`np.arange(-5, 5, 1)`),
        split 67/33 into train and test sets with a fixed random state, and a
        set of MLP and KNN models is trained and evaluated on that split.

        :param training_data_dir:   The directory which contains the training
                                    grid file (`training_data.pdf`) and labels
                                    (`training_labels.txt`).
        :param results_dir:         The directory into which results should be
                                    written. The results are recorded in a file
                                    named `benchmark.csv` with each model's MCC,
                                    training time (column `TestTime`), and
                                    prediction time (column `PredTime`)
                                    recorded. The method will also produce a
                                    confusion matrix PNG for each model
                                    benchmarked for debugging.
        :raises IOError:            If the training PDF or the labels file is
                                    missing from `training_data_dir`.
        """
        # Always start from an empty results directory. The directory must be
        # created even when it did not exist before, otherwise the savefig()
        # and to_csv() calls below have nowhere to write.
        if os.path.isdir(results_dir):
            shutil.rmtree(results_dir)
        os.makedirs(results_dir, exist_ok=True)

        training_pdf = os.path.join(training_data_dir, "training_data.pdf")
        labels_file = os.path.join(training_data_dir,
                                   "training_labels.txt")
        conf_mat_figsize = (14, 14)
        if not os.path.isfile(training_pdf):
            raise IOError("Training input PDF not found.")
        if not os.path.isfile(labels_file):
            raise IOError("Training labels file not found.")

        training_file = fe.TrainingFileSpec(
                training_pdf, fe.DEFAULT_GRID_SPEC,
                labels_file, dpi=96, white_threshold=fe.DEFAULT_WHITE_THRESHOLD)

        angles = np.arange(-5, 5, 1)
        feature_vectors, feature_labels = \
            FeatureExtractor.extract_labelled_feature_vectors(training_file,
                                                              rotations=angles)

        # Fixed random_state so every model sees the same split on every run.
        train_ft, test_ft, train_lb, test_lb = train_test_split(feature_vectors,
                                                                feature_labels,
                                                                test_size=0.33,
                                                                random_state=81)
        all_labels = list(np.unique(feature_labels))

        models = []

        # 1 hidden layer with various sizes
        for i in [26, 52, 80, 100]:
            models.append(MLPLearningModel(name="MLP (%i)" % i,
                                           hidden_layer_sizes=(i,),
                                           max_iter=1000))

        # 2 hidden layers with various sizes
        for i in [10, 26, 100]:
            for j in [10, 26, 100]:
                models.append(MLPLearningModel(name="MLP (%i, %i)" % (i, j),
                                               hidden_layer_sizes=(i, j,),
                                               max_iter=1000))

        # KNN models with various k values
        for k in [1, 5, 10, 15, 30]:
            models.append(KNNLearningModel(name="KNN (k=%i)" % k, k=k))

        # Characters of a model name that are replaced by "_" in file names.
        file_name_table = str.maketrans(" ,()=", "_____")

        # Benchmark rows are collected here and turned into a DataFrame once
        # at the end; DataFrame.append() no longer exists in pandas >= 2.0.
        bench_rows = []

        for model in models:
            # perf_counter() is monotonic and high resolution, unlike time().
            begin = time.perf_counter()
            model.train(train_ft, train_lb)
            train_time = time.perf_counter() - begin

            begin = time.perf_counter()
            predictions = model.predict(test_ft)
            pred_time = time.perf_counter() - begin

            mcc = matthews_corrcoef(y_pred=predictions, y_true=test_lb)
            # NOTE: 'TestTime' holds the *training* duration; the column name
            # is kept unchanged for consumers of benchmark.csv.
            bench_rows.append({'Model': model.name,
                               'MCC': mcc,
                               'TestTime': train_time,
                               'PredTime': pred_time})

            # Pin the label order so the matrix axes always line up with the
            # labels handed to the plot, even if a label happens to be absent
            # from the test split.
            conf_mat = confusion_matrix(y_pred=predictions, y_true=test_lb,
                                        labels=all_labels, normalize='true')

            plot_title = "Confusion matrix for %s" % model.name
            fig = plothelp.plot_confusion_matrix(data=conf_mat,
                                                 title=plot_title,
                                                 figsize=conf_mat_figsize,
                                                 dpi=120,
                                                 labels=all_labels)
            plot_file_name = model.name.translate(file_name_table) + ".png"
            fig.savefig(os.path.join(results_dir, plot_file_name))
            # Close the figure so open figures do not pile up across models.
            pyplot.close(fig)

        bench = pd.DataFrame(bench_rows,
                             columns=['Model', 'MCC', 'TestTime', 'PredTime'])
        bench.to_csv(
                path_or_buf=os.path.join(results_dir, "benchmark.csv"),
                header=True, index=False)