import logging
import os
import shutil
import sys

# Datasets, plot_data, plot_metric, plot_training_results, print_results,
# generate_pdf, dump_results and _get_algorithm_class are assumed to be
# project-local helpers importable from the surrounding package.


# run() is a method of the experiment-runner class, shown here with the
# module-level imports it needs.
def run(self):
    # Rotate any previous output aside as "<output_dir>_1", dropping a
    # stale backup first.
    if os.path.exists(self.output_dir):
        backup_dir = "%s_1" % self.output_dir
        if os.path.exists(backup_dir):
            shutil.rmtree(backup_dir)
        shutil.move(self.output_dir, backup_dir)

    os.mkdir(self.output_dir)

    for dataset in self.datasets:
        if dataset not in self.data_conf:
            logging.error("Dataset %s not found" % dataset)
            sys.exit(1)  # exit non-zero: a missing dataset is an error

        dataset_dir = os.path.join(self.output_dir, dataset)
        os.mkdir(dataset_dir)

        if self.shall_analyze:
            self.analyze(dataset, dataset_dir)

        for algorithm in self.algorithms:
            algo_dir = os.path.join(dataset_dir, algorithm)
            os.mkdir(algo_dir)

            results = []
            for training_size in self.conf.get('training_sizes', [.4]):
                data_conf = self.data_conf[dataset]
                data = self.data_class.load_dataset(dataset, training_size)
                result = self.run_algorithm(algorithm, data, data_conf, training_size)
                results.append(result)  # collect one result set per training size

                if self.conf.get('evaluate', True):
                    if self.output == "print":
                        self.print_results(training_size, algorithm, dataset, result)

                    if self.shall_plot:
                        for metric, y_test, score in result:
                            metric_plot_path = os.path.join(algo_dir, "metric-%s-%s_%s_size_%d.png" % (metric, dataset, algorithm, training_size * 100))
                            plot_metric(data['type'], y_test, data['y_test'], dataset, algorithm, training_size * 100, metric_plot_path)
                else:
                    # append this run's scores as one CSV row (join needs strings)
                    with open(os.path.join(algo_dir, "result.csv"), 'a+') as result_file:
                        result_file.write(",".join(str(score) for _, _, score in result) + "\n")
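
# Minimal usage sketch (hypothetical: assumes run() is a method of a Runner
# class whose __init__ sets output_dir, datasets, data_conf, algorithms,
# conf, output, data_class, shall_plot and shall_analyze):
#
#     runner = Runner(conf)
#     runner.run()
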
def run_algorithms(algorithms, datasets, metrics, output, conf):
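    """Train and evaluate each algorithm on each dataset; optionally plot and report."""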
    dts = Datasets()
    shall_plot = conf.get("plot_data")
    if shall_plot:
        plot_dir = conf.get("plot_dir", "../plots")

        # stage plots in a scratch dir; it is promoted to plot_dir at the end
        tmp_plot_dir = "%s_1" % plot_dir
        if os.path.exists(tmp_plot_dir):
            shutil.rmtree(tmp_plot_dir)

        os.mkdir(tmp_plot_dir)

        orig_data_dir = os.path.join(tmp_plot_dir, "original")
        os.mkdir(orig_data_dir)
        for dataset in datasets:
            plot_data(os.path.join(orig_data_dir, "%s-orig.png" % dataset), "%s-orig" % dataset, dataset)

    if output == 'dump_text' and not os.path.exists("../dumps"):
        os.mkdir("../dumps")

    for algorithm in algorithms:

        if shall_plot:
            algo_dir = os.path.join(tmp_plot_dir, algorithm)
            os.mkdir(algo_dir)

        algo_conf = conf["algorithms"].get(algorithm, None)

        if not algo_conf:
            logging.error("Algorithm %s not found in conf file" % algorithm)
            sys.exit(1)

        algo_conf['name'] = algorithm
        learn_class = _get_algorithm_class(algorithm)
        learn = learn_class(**algo_conf)
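        # configure optional cross-validation before training (method, metric, params)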
        learn._set_cross_validation(conf.get("cv_method", None), conf.get("cv_metric", None), conf.get("cv_params", None))
        results = []
        for dataset in datasets:
            if dataset not in conf["datasets"]:
                logging.error("Dataset %s not found" % dataset)
                sys.exit(1)

            cv_dir = None
            if shall_plot:
                dataset_dir = os.path.join(algo_dir, dataset)
                os.mkdir(dataset_dir)

                if algo_conf.get("cross_validate", True):
                    cv_dir = os.path.join(dataset_dir, "cv")
                    os.mkdir(cv_dir)

            training_sizes = conf.get("training_size", [0.40])
            scores = []
            for training_size in training_sizes:
                data = dts.load_dataset(dataset, training_size)

                learn.set_dataset(dataset, training_size*100, cv_dir)
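                # proceed only if this learner supports the dataset's problem type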
                if learn.check_type(data["type"]):
                    eval_metrics = []
                    if metrics:
                        eval_metrics.extend(metrics)
                    else:
                        eval_metrics.extend(algo_conf["allowed_metrics"])

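                    # fit on the training split, then score each metric on the test split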
                    learn.train(data["x_train"], data["y_train"])
                    result_tups = learn.evaluate(data["x_test"], data["y_test"], eval_metrics)

                    print_results(training_size, algorithm, dataset, result_tups)
                    results.append((algorithm, dataset, training_size, result_tups))

                    if shall_plot:
                        decision_plot_path = os.path.join(dataset_dir, "decision-%s_%s_size_%d.png" % (dataset, algorithm, training_size * 100))
                        learn.plot_results(decision_plot_path, dataset, training_size, data['x_train'], data['x_test'], data['y_train'], data['y_test'])

                        for metric, y_test, score in result_tups:
                            metric_plot_path = os.path.join(dataset_dir, "metric-%s-%s_%s_size_%d.png" % (metric, dataset, algorithm, training_size * 100))
                            plot_metric(metric_plot_path, data['type'], y_test, data['y_test'], dataset, algorithm, training_size * 100)
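                    # record the first metric's score for the training-size curve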
                    scores.append(result_tups[0][2])
            if shall_plot:
                train_plot_path = os.path.join(dataset_dir, "train_vs_acc-%s_%s.png" % (algorithm, dataset))
                plot_training_results(train_plot_path, [train_size * 100 for train_size in training_sizes], scores)

        if output == "pdf":
            generate_pdf(results)
        elif output == "dump_text":
            dump_results(algorithm, results)
    if conf.get("plot_data", False):
        shutil.rmtree(plot_dir)
        shutil.move(tmp_plot_dir, plot_dir)
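
# Hypothetical driver (conf keys inferred from the lookups above; passing
# metrics=None falls back to each algorithm's allowed_metrics):
#
#     conf = yaml.safe_load(open("conf.yml"))  # assumes a YAML conf file
#     run_algorithms(["svm"], ["iris"], None, "pdf", conf)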