def run(self):
    """Run every configured algorithm on every configured dataset.

    A fresh ``self.output_dir`` is created first; if one already exists it
    is moved to ``<output_dir>_1`` (replacing any earlier backup). Below it,
    one directory is created per dataset and, inside that, one per
    algorithm.

    For each size in ``self.conf['training_sizes']`` (default ``[.4]``) the
    dataset is loaded via ``self.data_class.load_dataset`` and passed to
    ``self.run_algorithm``. When ``self.conf['evaluate']`` is truthy (the
    default) the result is printed if ``self.output == "print"`` and metric
    plots are drawn if ``self.shall_plot`` is set; otherwise ``result.csv``
    in the algorithm directory is appended to.

    Raises:
        SystemExit: with status 1 when a name in ``self.datasets`` is not a
            key of ``self.data_conf``.
    """
    # Keep exactly one previous run around as "<output_dir>_1".
    backup_dir = "%s%s" % (self.output_dir, "_1")
    if os.path.exists(self.output_dir):
        if os.path.exists(backup_dir):
            shutil.rmtree(backup_dir)
        shutil.move(self.output_dir, backup_dir)
    os.mkdir(self.output_dir)

    for dataset in self.datasets:
        if dataset not in self.data_conf:
            logging.error("Dataset %s not found", dataset)
            # Non-zero status so shells and callers can detect the failure.
            sys.exit(1)
        data_conf = self.data_conf[dataset]

        dataset_dir = os.path.join(self.output_dir, dataset)
        os.mkdir(dataset_dir)
        if self.shall_analyze:
            self.analyze(dataset, dataset_dir)

        for algorithm in self.algorithms:
            algo_dir = os.path.join(dataset_dir, algorithm)
            os.mkdir(algo_dir)
            results = []

            for training_size in self.conf.get('training_sizes', [.4]):
                data = self.data_class.load_dataset(dataset, training_size)
                result = self.run_algorithm(algorithm, data, data_conf,
                                            training_size)

                # NOTE(review): the branch below is taken to pair with this
                # 'evaluate' check; confirm against the intended control flow.
                if self.conf.get('evaluate', True):
                    if self.output == "print":
                        self.print_results(training_size, algorithm, dataset,
                                           result)
                    if self.shall_plot:
                        for metric, y_test, _score in result:
                            metric_plot_path = os.path.join(
                                algo_dir,
                                "metric-%s-%s_%s_size_%d.png" % (
                                    metric, dataset, algorithm,
                                    training_size * 100))
                            # NOTE(review): run_algorithms() passes the plot
                            # path as the FIRST argument of plot_metric; here
                            # it is last. Confirm against plot_metric's
                            # signature.
                            plot_metric(data['type'], y_test, data['y_test'],
                                        dataset, algorithm,
                                        training_size * 100, metric_plot_path)
                else:
                    # NOTE(review): `results` is never appended to in this
                    # method, so this currently writes an empty string;
                    # `result` may have been intended. Confirm before changing.
                    result_path = os.path.join(algo_dir, "result.csv")
                    # Context manager closes the handle even if write() fails.
                    with open(result_path, 'a+') as result_file:
                        result_file.write(",".join(results))
def run_algorithms(algorithms, datasets, metrics, output, conf):
    """Train and evaluate each algorithm on each dataset, optionally plotting.

    Args:
        algorithms: names of the algorithms to run; each must have a
            non-empty entry in ``conf["algorithms"]``.
        datasets: names of the datasets to use; each must be present in
            ``conf["datasets"]``.
        metrics: metrics handed to ``learn.evaluate``. When falsy, the
            algorithm's ``allowed_metrics`` config entry is used instead.
        output: ``"pdf"`` calls ``generate_pdf`` and ``"dump_text"`` calls
            ``dump_results`` once per algorithm (``"dump_text"`` also
            creates ``../dumps`` if missing). Other values only print.
        conf: configuration mapping. Keys read here: ``plot_data``,
            ``plot_dir`` (default ``"../plots"``), ``algorithms``,
            ``datasets``, ``cv_method``, ``cv_metric``, ``cv_params`` and
            ``training_size`` (default ``[0.40]``).

    When ``conf["plot_data"]`` is truthy, plots are written to the staging
    directory ``../plots_1`` and moved over ``plot_dir`` once every
    algorithm has finished.

    Note:
        ``conf["algorithms"][name]["name"]`` is set to the algorithm name
        as a side effect, before the config is expanded into the learner's
        constructor.

    Raises:
        SystemExit: with status 1 when an algorithm or dataset is missing
            from ``conf``.
    """
    dts = Datasets()
    shall_plot = conf.get("plot_data")
    training_sizes = conf.get("training_size", [0.40])

    if shall_plot:
        plot_dir = conf.get("plot_dir", "../plots")
        # Plots are staged here and only published to plot_dir at the end.
        tmp_plot_dir = "../plots_1"
        if os.path.exists(tmp_plot_dir):
            shutil.rmtree(tmp_plot_dir)
        os.mkdir(tmp_plot_dir)

        orig_data_dir = os.path.join(tmp_plot_dir, "original")
        os.mkdir(orig_data_dir)
        for dataset in datasets:
            plot_data(os.path.join(orig_data_dir, "%s-orig.png" % dataset),
                      "%s-orig" % dataset, dataset)

    if output == 'dump_text' and not os.path.exists("../dumps"):
        os.mkdir("../dumps")

    for algorithm in algorithms:
        if shall_plot:
            algo_dir = os.path.join(tmp_plot_dir, algorithm)
            os.mkdir(algo_dir)

        algo_conf = conf["algorithms"].get(algorithm, None)
        if not algo_conf:
            logging.error("Algorithm %s not found in conf file", algorithm)
            # Non-zero status so shells and callers can detect the failure.
            sys.exit(1)
        algo_conf['name'] = algorithm

        learn_class = _get_algorithm_class(algorithm)
        learn = learn_class(**algo_conf)
        learn._set_cross_validation(conf.get("cv_method", None),
                                    conf.get("cv_metric", None),
                                    conf.get("cv_params", None))

        # Collected across all datasets for this algorithm.
        results = []
        for dataset in datasets:
            if dataset not in conf["datasets"]:
                logging.error("Dataset %s not found", dataset)
                sys.exit(1)

            cv_dir = None
            if shall_plot:
                dataset_dir = os.path.join(algo_dir, dataset)
                os.mkdir(dataset_dir)
                if algo_conf.get("cross_validate", True):
                    cv_dir = os.path.join(dataset_dir, "cv")
                    os.mkdir(cv_dir)

            scores = []
            for training_size in training_sizes:
                data = dts.load_dataset(dataset, training_size)
                learn.set_dataset(dataset, training_size*100, cv_dir)
                if not learn.check_type(data["type"]):
                    # The learner rejected this data type; skip this size.
                    continue

                # Fresh list per run, as before; explicit metrics win over
                # the algorithm's configured defaults.
                eval_metrics = list(metrics or algo_conf["allowed_metrics"])

                learn.train(data["x_train"], data["y_train"])
                result_tups = learn.evaluate(data["x_test"], data["y_test"],
                                             eval_metrics)
                print_results(training_size, algorithm, dataset, result_tups)
                results.append((algorithm, dataset, training_size,
                                result_tups))

                if shall_plot:
                    decision_plot_path = os.path.join(
                        dataset_dir,
                        "decision-%s_%s_size_%d.png" % (
                            dataset, algorithm, training_size * 100))
                    learn.plot_results(decision_plot_path, dataset,
                                       training_size,
                                       data['x_train'], data['x_test'],
                                       data['y_train'], data['y_test'])
                    for metric, y_test, _score in result_tups:
                        metric_plot_path = os.path.join(
                            dataset_dir,
                            "metric-%s-%s_%s_size_%d.png" % (
                                metric, dataset, algorithm,
                                training_size * 100))
                        plot_metric(metric_plot_path, data['type'], y_test,
                                    data['y_test'], dataset, algorithm,
                                    training_size * 100)

                # Third field of the first result tuple (the tuples unpack
                # as (metric, y_test, score) above).
                scores.append(result_tups[0][2])

            if shall_plot:
                train_plot_path = os.path.join(
                    dataset_dir,
                    "train_vs_acc-%s_%s.png" % (algorithm, dataset))
                # NOTE(review): if check_type() rejected any size, `scores`
                # is shorter than `training_sizes` here; confirm that
                # plot_training_results tolerates that.
                plot_training_results(
                    train_plot_path,
                    [train_size * 100 for train_size in training_sizes],
                    scores)

        if output == "pdf":
            generate_pdf(results)
        elif output == "dump_text":
            dump_results(algorithm, results)

    if shall_plot:
        # rmtree raises FileNotFoundError on a missing path, which would
        # break the very first run before plot_dir has ever been created.
        if os.path.exists(plot_dir):
            shutil.rmtree(plot_dir)
        shutil.move(tmp_plot_dir, plot_dir)