def headquarter_three_class_classifier(jobs=30, input_file=None, output_dir=None):
    """Evaluate every classifier on every merged-class variant of the input table.

    For each entry of ``general.three_class_combinations`` the labels listed in
    that entry are collapsed into a single ``"+"``-joined label, the table is
    split 90/10 (stratified on ``Classification``, ``random_state=0``) and
    ``actual_three_class_classifier`` is dispatched through a process pool once
    per feature-subset number. One CSV per classifier and combination is
    written to ``<output_dir>/<classifier>/<classes joined by '-'>.csv`` and
    the concatenation of all of them to ``<output_dir>/statistics.csv``. Every
    output path is passed through ``general.check_exist`` before writing
    (presumably it prepares the destination -- confirm against ``general``).

    Args:
        jobs: Number of worker processes handed to ``multiprocessing.Pool``.
            ``None`` is accepted and lets the pool choose its own default.
        input_file: CSV file containing a ``Classification`` column and the
            columns named in ``general.whole_values``.
        output_dir: Directory under which all result files are written.

    Raises:
        ValueError: If ``input_file`` or ``output_dir`` is missing, if
            ``input_file`` is not an existing file, or if ``jobs`` is
            smaller than 1.
    """
    if (input_file is None) or (output_dir is None):
        raise ValueError("input_file and output_dir are both required")
    if not os.path.isfile(input_file):
        raise ValueError(input_file)
    # Fail before any heavy work instead of deep inside Pool(); mirrors the
    # check performed by headquarter_ovo_classifier.
    if (jobs is not None) and (jobs < 1):
        raise ValueError(jobs)

    data = pandas.read_csv(input_file)
    data = data[["Classification"] + general.whole_values]
    original_labels = list(data["Classification"])

    # Task numbers: 1 .. 2**A - 1 first, then the multiples of 2**A up to
    # (2**R - 1) * 2**A, where A/R are the lengths of the absolute/relative
    # value lists. NOTE(review): these look like feature-subset bitmasks --
    # confirm against actual_three_class_classifier.
    absolute_span = 2 ** len(general.absolute_values)
    relative_span = 2 ** len(general.relative_values)
    numbers = list(range(1, absolute_span))
    numbers += [i * absolute_span for i in range(1, relative_span)]

    header = ("Number", "balanced_accuracy_score") + general.aggregate_confusion_matrix(None)

    result_data = []
    for selected_class in general.three_class_combinations:
        merged_label = "+".join(selected_class)
        # Build a relabelled frame per combination instead of overwriting and
        # later restoring the shared one.
        merged = data.assign(Classification=[
            merged_label if label in selected_class else label
            for label in original_labels
        ])
        train_data, test_data = sklearn.model_selection.train_test_split(
            merged, test_size=0.1, random_state=0,
            stratify=merged["Classification"])

        class_number = general.class_to_num(selected_class)
        combined_class = "-vs-".join(sorted(set(merged["Classification"])))

        with multiprocessing.Pool(processes=jobs) as pool:
            for name, classifier in classifiers:
                # Pool pickles every argument for the worker process, so each
                # task already receives private frames; explicit per-task
                # copies would only multiply memory use in the parent.
                tasks = [
                    (classifier, train_data, test_data,
                     os.path.join(output_dir, name), number, class_number)
                    for number in numbers
                ]
                results = pandas.DataFrame(
                    pool.starmap(actual_three_class_classifier, tasks),
                    columns=header)
                results["classifier"] = name
                results["combined_class"] = combined_class
                results.to_csv(general.check_exist(
                    os.path.join(output_dir, name, "-".join(selected_class) + ".csv")),
                    index=False)
                result_data.append(results)

    pandas.concat(result_data, ignore_index=True).to_csv(general.check_exist(
        os.path.join(output_dir, "statistics.csv")), index=False)
def headquarter_ovo_classifier(input_file, output_dir, jobs):
    """Evaluate every classifier on every class combination in ``general.two_class_combinations``.

    For each combination the rows whose ``Classification`` belongs to it are
    split 90/10 (stratified, ``random_state=0``) and ``actual_ovo_classifier``
    is run through a process pool once per task number. A CSV per classifier
    and combination is written to
    ``<output_dir>/<classifier>/<classes joined by '-'>.csv`` and all results
    together to ``<output_dir>/statistics.csv``; each path goes through
    ``general.check_exist`` first.

    Args:
        input_file: CSV file with a ``Classification`` column and the columns
            named in ``general.whole_values``.
        output_dir: Directory under which the result files are written.
        jobs: Number of worker processes for ``multiprocessing.Pool``.

    Raises:
        ValueError: If ``input_file`` is not an existing file or ``jobs`` is
            smaller than 1.
    """
    if not os.path.isfile(input_file):
        raise ValueError(input_file)
    if jobs < 1:
        raise ValueError(jobs)

    table = pandas.read_csv(input_file)
    table = table[["Classification"] + general.whole_values]

    collected = []
    for class_pair in general.two_class_combinations:
        pair_rows = table.loc[(table["Classification"].isin(class_pair))]
        train_part, test_part = sklearn.model_selection.train_test_split(
            pair_rows,
            test_size=0.1,
            random_state=0,
            stratify=pair_rows["Classification"],
        )

        with multiprocessing.Pool(processes=jobs) as pool:
            for label, model in classifiers:
                header = ("Number", ) + general.aggregate_confusion_matrix(None)

                # First batch: task numbers 1 .. 2**len(absolute_values) - 1.
                absolute_tasks = []
                for number in range(1, 2 ** len(general.absolute_values)):
                    absolute_tasks.append((
                        model,
                        train_part.copy(),
                        test_part.copy(),
                        os.path.join(output_dir, label),
                        number,
                        general.class_to_num(class_pair),
                    ))
                rows = pool.starmap(actual_ovo_classifier, absolute_tasks)

                # Second batch: multiples of 2**len(absolute_values).
                relative_tasks = []
                for step in range(1, 2 ** len(general.relative_values)):
                    relative_tasks.append((
                        model,
                        train_part.copy(),
                        test_part.copy(),
                        os.path.join(output_dir, label),
                        step * (2 ** len(general.absolute_values)),
                        general.class_to_num(class_pair),
                    ))
                rows += pool.starmap(actual_ovo_classifier, relative_tasks)

                frame = pandas.DataFrame(rows, columns=header)
                frame["classifier"] = label
                frame["combined_class"] = "-vs-".join(
                    sorted(set(pair_rows["Classification"])))

                per_classifier_csv = os.path.join(
                    output_dir, label, "-".join(class_pair) + ".csv")
                frame.to_csv(general.check_exist(per_classifier_csv), index=False)

                collected.append(frame)

    summary_csv = os.path.join(output_dir, "statistics.csv")
    pandas.concat(collected, ignore_index=True).to_csv(
        general.check_exist(summary_csv), index=False)
def _prediction_counts_table(prediction_data):
    """Count identical rows of *prediction_data* and lay them out as prediction x real."""
    # groupby(...).size() with the default as_index yields a Series on every
    # pandas version; naming it explicitly avoids depending on the implicit
    # column name (0 on old releases, "size" on newer ones).
    counts = (
        prediction_data.groupby(list(prediction_data.columns))
        .size()
        .rename("counts")
        .reset_index()
    )
    # Keyword arguments: positional pivot() arguments are rejected by pandas 2.x.
    return counts.pivot(index="prediction", columns="real", values="counts").fillna(0)


def _save_heatmap(table, title, path):
    """Render *table* as an annotated heatmap titled *title* and save it to *path*."""
    seaborn.set(context="poster", style="whitegrid")
    fig, ax = matplotlib.pyplot.subplots(figsize=(24, 24))
    seaborn.heatmap(table, annot=True, ax=ax, robust=True)
    ax.set_title(title)
    fig.savefig(path)
    matplotlib.pyplot.close(fig)


def draw_extreme(csv_file, output_dir):
    """Plot the predictions of the best and worst run for every statistic.

    ``csv_file`` is read as a statistics table with ``combined_class``,
    ``classifier`` and ``Number`` columns plus one column per name returned by
    ``general.aggregate_confusion_matrix(None)``. For every combined class,
    classifier and statistic, the ``Number`` of the row with the minimum and
    of the row with the maximum value is looked up, the matching
    ``Prediction_<Number>[_<class number>].csv`` is read from
    ``<dirname(csv_file)>/<classifier>/`` and drawn as a prediction-vs-real
    count heatmap saved as
    ``<minimum|maximum>_<combined_class>_<classifier>_<statistic>.png`` in
    ``output_dir``. A summary of all selected runs is written to
    ``<output_dir>/Min_Max.csv``. Output paths go through
    ``general.check_exist`` before writing.

    Args:
        csv_file: Path of the statistics CSV.
        output_dir: Directory receiving the PNG files and ``Min_Max.csv``.

    Raises:
        ValueError: If ``csv_file`` is not an existing file.
    """
    if not os.path.isfile(csv_file):
        raise ValueError(csv_file)

    statistics_data = pandas.read_csv(csv_file)
    statistics_names = general.aggregate_confusion_matrix(None)
    base_directory = os.path.dirname(csv_file)
    classifier_names = sorted(set(statistics_data["classifier"]))

    header = ("combined_class", "classifier", "bacteria", "statistics", "type", "value")
    rows = []

    for combined_class in sorted(set(statistics_data["combined_class"])):
        # A "+" marks a label produced by merging several classes; its number
        # becomes part of the prediction file name. 0 means "no merged label".
        merged_labels = [label for label in combined_class.split("-vs-") if "+" in label]
        if merged_labels:
            combined_class_num = general.class_to_num(merged_labels[0].split("+"))
        else:
            combined_class_num = 0

        for classifier in classifier_names:
            prediction_directory = os.path.join(base_directory, classifier)
            # The row filter does not depend on the statistic, so do it once.
            subset = statistics_data.loc[
                (statistics_data["combined_class"] == combined_class)
                & (statistics_data["classifier"] == classifier)
            ]

            for statistics_value in statistics_names:
                column = subset[statistics_value]
                extremes = (
                    ("minimum", subset.loc[column.idxmin(), "Number"]),
                    ("maximum", subset.loc[column.idxmax(), "Number"]),
                )

                for name, value in extremes:
                    if combined_class_num:
                        file_name = "Prediction_%s_%d.csv" % (value, combined_class_num)
                    else:
                        file_name = "Prediction_%s.csv" % (value,)
                    prediction_data = pandas.read_csv(
                        os.path.join(prediction_directory, file_name))

                    _save_heatmap(
                        _prediction_counts_table(prediction_data),
                        combined_class.replace("-", " ") + " with " + statistics_value,
                        general.check_exist(os.path.join(
                            output_dir,
                            name + "_" + combined_class + "_" + classifier + "_" + statistics_value + ".png")),
                    )

                    rows.append((
                        combined_class,
                        classifier,
                        "+".join(general.num_to_bacteria(value)),
                        statistics_value,
                        name,
                        value,
                    ))

    pandas.DataFrame(rows, columns=header).to_csv(
        general.check_exist(os.path.join(output_dir, "Min_Max.csv")), index=False)