def actual_ovo_classifier(classifier, train_data, test_data, output_dir,
                          bacteria_num, class_num):
    """Fit ``classifier`` on one feature subset and evaluate it on the test split.

    The feature columns are the ones returned by
    ``general.num_to_bacteria(bacteria_num)``; the labels are read from the
    "Classification" column of each frame.

    Two CSV files are written under ``output_dir``:

    * ``Probability_<bacteria_num>_<class_num>.csv`` -- the output of
      ``classifier.predict_proba`` for the test rows, one column per class.
    * ``Prediction_<bacteria_num>_<class_num>.csv`` -- the true label
      ("real") next to the predicted label ("prediction").

    Args:
        classifier: estimator exposing ``fit``, ``predict``,
            ``predict_proba`` and, once fitted, ``classes_``.
        train_data: DataFrame holding "Classification" and the feature columns.
        test_data: DataFrame with the same columns as ``train_data``.
        output_dir: directory that receives the two CSV files.
        bacteria_num: number identifying the feature subset.
        class_num: number identifying the class combination (used only in
            the output file names).

    Returns:
        ``(bacteria_num,)`` followed by the values returned by
        ``general.aggregate_confusion_matrix`` for the summed multilabel
        confusion matrix.
    """
    features = general.num_to_bacteria(bacteria_num)
    file_suffix = str(bacteria_num) + "_" + str(class_num) + ".csv"

    # Read the labels instead of popping them so the caller's frames are not
    # modified as a side effect.
    train_answer = train_data["Classification"]
    test_answer = test_data["Classification"]

    train_features = train_data[features]
    test_features = test_data[features]

    classifier.fit(train_features, train_answer)

    # predict_proba orders its columns like classifier.classes_ (the labels
    # seen during fit).  Deriving the header from the test labels breaks as
    # soon as the test split lacks one of the training classes.
    probability = pandas.DataFrame(classifier.predict_proba(test_features),
                                   columns=list(classifier.classes_))
    probability.to_csv(general.check_exist(
        os.path.join(output_dir, "Probability_" + file_suffix)),
                       index=False)

    prediction = classifier.predict(test_features)
    comparison = pandas.DataFrame(list(zip(test_answer, prediction)),
                                  columns=["real", "prediction"])
    comparison.to_csv(general.check_exist(
        os.path.join(output_dir, "Prediction_" + file_suffix)),
                      index=False)

    confusion = numpy.sum(sklearn.metrics.multilabel_confusion_matrix(
        test_answer, prediction),
                          axis=0,
                          dtype=int)
    return (bacteria_num, ) + general.aggregate_confusion_matrix(confusion)
Ejemplo n.º 2
0
def headquarter_three_class_classifier(jobs=30,
                                       input_file=None,
                                       output_dir=None):
    """Run every classifier over every feature subset with merged classes.

    For each entry of ``general.three_class_combinations`` the listed classes
    are merged into a single "+"-joined label, the data is split 90/10
    (stratified, ``random_state=0``) and ``actual_three_class_classifier`` is
    dispatched through a process pool for each feature-subset number.  One
    CSV per classifier and class combination is written, plus an overall
    ``statistics.csv`` in ``output_dir``.

    Args:
        jobs: number of worker processes in the pool.
        input_file: CSV file to read; must exist.
        output_dir: directory receiving all result files.

    Raises:
        ValueError: if ``input_file`` or ``output_dir`` is None, or if
            ``input_file`` is not an existing file.
    """
    if input_file is None or output_dir is None:
        raise ValueError
    if not os.path.isfile(input_file):
        raise ValueError(input_file)

    data = pandas.read_csv(input_file)
    data = data[["Classification"] + general.whole_values]
    collected = []

    # Remembered so the labels can be restored after each merge.
    pristine_labels = list(data["Classification"])

    absolute_span = 2**len(general.absolute_values)
    relative_span = 2**len(general.relative_values)

    for selected_class in general.three_class_combinations:
        merged_label = "+".join(selected_class)
        data["Classification"] = [
            merged_label if label in selected_class else label
            for label in data["Classification"]
        ]
        train_data, test_data = sklearn.model_selection.train_test_split(
            data,
            test_size=0.1,
            random_state=0,
            stratify=data["Classification"])

        class_num = general.class_to_num(selected_class)
        combined_class = "-vs-".join(sorted(set(data["Classification"])))
        report_name = "-".join(selected_class) + ".csv"

        with multiprocessing.Pool(processes=jobs) as pool:
            for name, classifier in classifiers:
                classifier_dir = os.path.join(output_dir, name)
                header = ("Number", "balanced_accuracy_score"
                          ) + general.aggregate_confusion_matrix(None)

                # First batch: subset numbers below the absolute span.
                rows = pool.starmap(
                    actual_three_class_classifier,
                    [(classifier, train_data.copy(), test_data.copy(),
                      classifier_dir, number, class_num)
                     for number in range(1, absolute_span)])
                # Second batch: multiples of the absolute span.
                rows.extend(
                    pool.starmap(
                        actual_three_class_classifier,
                        [(classifier, train_data.copy(), test_data.copy(),
                          classifier_dir, multiplier * absolute_span,
                          class_num)
                         for multiplier in range(1, relative_span)]))

                table = pandas.DataFrame(rows, columns=header)
                table["classifier"] = name
                table["combined_class"] = combined_class
                table.to_csv(general.check_exist(
                    os.path.join(output_dir, name, report_name)),
                             index=False)

                collected.append(table.copy())

        data["Classification"] = pristine_labels

    summary = pandas.concat(collected, ignore_index=True)
    summary.to_csv(general.check_exist(
        os.path.join(output_dir, "statistics.csv")),
                   index=False)
Ejemplo n.º 3
0
def headquarter_regressor(input_file, output_dir, watch, jobs=30):
    """Run every regressor over every feature subset and plot the R2 scores.

    The ``watch`` column is renamed to "answer", the data is split 90/10
    (``random_state=0``) and ``actual_regressor`` is dispatched through a
    process pool for each feature-subset number.  Each regressor gets its own
    ``statistics.csv``; these are then concatenated into
    ``<output_dir>/statistics.csv`` and drawn as a line plot saved to
    ``<output_dir>/Regressor_<watch>.png``.

    Args:
        input_file: CSV file to read.
        output_dir: directory receiving all result files.
        watch: name of the column to predict.
        jobs: number of worker processes in the pool.
    """
    frame = pandas.read_csv(input_file)
    frame = frame[[watch] + general.whole_values].rename(
        columns={watch: "answer"})

    train_data, test_data = sklearn.model_selection.train_test_split(
        frame, test_size=0.1, random_state=0)

    absolute_span = 2**len(general.absolute_values)
    relative_span = 2**len(general.relative_values)

    with multiprocessing.Pool(processes=jobs) as pool:
        for name, regressor in regressors:
            regressor_dir = os.path.join(output_dir, name)

            # First batch: subset numbers below the absolute span.
            scores = pool.starmap(
                actual_regressor,
                [(regressor, train_data.copy(), test_data.copy(),
                  regressor_dir, number)
                 for number in range(1, absolute_span)])
            # Second batch: multiples of the absolute span.
            scores.extend(
                pool.starmap(
                    actual_regressor,
                    [(regressor, train_data.copy(), test_data.copy(),
                      regressor_dir, multiplier * absolute_span)
                     for multiplier in range(1, relative_span)]))

            table = pandas.DataFrame(scores, columns=("Number", "R2_score"))
            table["regressor"] = name
            table["feature_num"] = [
                len(general.num_to_bacteria(number))
                for number in table["Number"]
            ]
            table.to_csv(general.check_exist(
                os.path.join(output_dir, name, "statistics.csv")),
                         index=False)

    # Gather the per-regressor files that were just written.
    per_regressor = []
    for name, regressor in regressors:
        per_regressor.append(
            pandas.read_csv(os.path.join(output_dir, name, "statistics.csv")))
    drawing_data = pandas.concat(per_regressor, ignore_index=True)
    drawing_data.to_csv(general.check_exist(
        os.path.join(output_dir, "statistics.csv")),
                        index=False)

    seaborn.set(context="poster", style="whitegrid")
    fig, ax = matplotlib.pyplot.subplots(figsize=(24, 24))

    regressor_order = sorted(set(drawing_data["regressor"]))
    seaborn.lineplot(data=drawing_data,
                     x="feature_num",
                     y="R2_score",
                     hue="regressor",
                     ax=ax,
                     legend="full",
                     hue_order=regressor_order)

    picture = os.path.join(output_dir, "Regressor_" + watch + ".png")
    fig.savefig(general.check_exist(picture))
    matplotlib.pyplot.close(fig)
def headquarter_ovo_classifier(input_file, output_dir, jobs):
    """Run every classifier over every feature subset for each class pair.

    For each entry of ``general.two_class_combinations`` only the rows whose
    "Classification" is in that entry are kept, the data is split 90/10
    (stratified, ``random_state=0``) and ``actual_ovo_classifier`` is
    dispatched through a process pool for each feature-subset number.  One
    CSV per classifier and class pair is written, plus an overall
    ``statistics.csv`` in ``output_dir``.

    Args:
        input_file: CSV file to read; must exist.
        output_dir: directory receiving all result files.
        jobs: number of worker processes in the pool; must be at least 1.

    Raises:
        ValueError: if ``input_file`` is not an existing file or
            ``jobs`` is smaller than 1.
    """
    if not os.path.isfile(input_file):
        raise ValueError(input_file)
    if jobs < 1:
        raise ValueError(jobs)

    data = pandas.read_csv(input_file)
    data = data[["Classification"] + general.whole_values]
    collected = []

    absolute_span = 2**len(general.absolute_values)
    relative_span = 2**len(general.relative_values)

    for selected_class in general.two_class_combinations:
        tmp_data = data.loc[data["Classification"].isin(selected_class)]
        train_data, test_data = sklearn.model_selection.train_test_split(
            tmp_data,
            test_size=0.1,
            random_state=0,
            stratify=tmp_data["Classification"])

        class_num = general.class_to_num(selected_class)
        combined_class = "-vs-".join(sorted(set(tmp_data["Classification"])))
        report_name = "-".join(selected_class) + ".csv"

        with multiprocessing.Pool(processes=jobs) as pool:
            for name, classifier in classifiers:
                classifier_dir = os.path.join(output_dir, name)
                header = ("Number", ) + general.aggregate_confusion_matrix(None)

                # First batch: subset numbers below the absolute span.
                rows = pool.starmap(
                    actual_ovo_classifier,
                    [(classifier, train_data.copy(), test_data.copy(),
                      classifier_dir, number, class_num)
                     for number in range(1, absolute_span)])
                # Second batch: multiples of the absolute span.
                rows.extend(
                    pool.starmap(
                        actual_ovo_classifier,
                        [(classifier, train_data.copy(), test_data.copy(),
                          classifier_dir, multiplier * absolute_span,
                          class_num)
                         for multiplier in range(1, relative_span)]))

                table = pandas.DataFrame(rows, columns=header)
                table["classifier"] = name
                table["combined_class"] = combined_class
                table.to_csv(general.check_exist(
                    os.path.join(output_dir, name, report_name)),
                             index=False)

                collected.append(table)

    summary = pandas.concat(collected, ignore_index=True)
    summary.to_csv(general.check_exist(
        os.path.join(output_dir, "statistics.csv")),
                   index=False)
Ejemplo n.º 5
0
def get_tsne(csv_file=None, tsne_file=None, random_state=0):
    """Compute a 2-D t-SNE embedding of ``csv_file`` and save it as CSV.

    The columns of the input that appear in ``tsne_columns`` are embedded
    with ``sklearn.manifold.TSNE`` (two components, PCA initialisation).
    Each embedding axis is z-scored, and the "ID" and "Classification"
    columns of the input are copied next to the result.

    Args:
        csv_file: input CSV file; required and must exist.
        tsne_file: output CSV path.  When None, a file named
            ``tsne_<random_state>.csv`` inside ``default_tsne_directory``
            is used.
        random_state: seed forwarded to TSNE (and used in the default
            output file name).

    Returns:
        The path the embedding was written to.

    Raises:
        ValueError: if ``csv_file`` is None or is not an existing file.
    """
    # Validate first and say what is wrong; a bare ValueError gave the caller
    # no hint about which path was rejected.
    if csv_file is None:
        raise ValueError("csv_file must be provided")
    if not os.path.isfile(csv_file):
        raise ValueError(csv_file)

    if tsne_file is None:
        tsne_file = os.path.join(default_tsne_directory,
                                 "tsne_" + str(random_state) + ".csv")

    data = pandas.read_csv(csv_file)

    # Keep the input column order; only columns listed in tsne_columns count.
    feature_columns = [
        column for column in data.columns if column in tsne_columns
    ]
    embedding = sklearn.manifold.TSNE(n_components=2,
                                      random_state=random_state,
                                      init="pca").fit_transform(
                                          data[feature_columns])

    tsne_data = pandas.DataFrame(embedding, columns=["TSNE1", "TSNE2"])
    for column in ("TSNE1", "TSNE2"):
        tsne_data[column] = scipy.stats.zscore(tsne_data[column])
    for column in ("ID", "Classification"):
        tsne_data[column] = data[column]

    tsne_data.to_csv(general.check_exist(tsne_file), index=False)
    return tsne_file
Ejemplo n.º 6
0
def draw_violin(input_file, output_file, watch):
    """Draw a violin plot of ``watch`` per class with significance marks.

    The classes are laid out in the order of ``general.classes`` and every
    pair of neighbouring classes is annotated through
    ``statannot.add_stat_annotation`` (independent t-test, star notation).

    Args:
        input_file: CSV file holding "Classification" and the ``watch`` column.
        output_file: path of the image to write.
        watch: name of the column plotted on the y axis.
    """
    data = pandas.read_csv(input_file)

    seaborn.set(context="poster", style="whitegrid")

    fig, ax = matplotlib.pyplot.subplots(figsize=(24, 24))
    # Draw on the axes created above explicitly instead of relying on
    # pyplot's implicit "current axes" state.
    seaborn.violinplot(data=data,
                       x="Classification",
                       y=watch,
                       order=general.classes,
                       ax=ax)

    # Compare each class with the one that follows it.
    neighbouring_pairs = list(zip(general.classes, general.classes[1:]))
    statannot.add_stat_annotation(ax,
                                  data=data,
                                  x="Classification",
                                  y=watch,
                                  box_pairs=neighbouring_pairs,
                                  test="t-test_ind",
                                  text_format="star",
                                  verbose=0,
                                  order=general.classes)

    fig.savefig(general.check_exist(output_file))
    matplotlib.pyplot.close(fig)
Ejemplo n.º 7
0
def draw_extreme(csv_file, output_dir):
    """Plot confusion heatmaps for the best and worst run of every statistic.

    For each combined class, classifier and statistic listed by
    ``general.aggregate_confusion_matrix(None)``, the "Number" of the row
    with the smallest and the largest statistic is looked up.  The matching
    ``Prediction_*.csv`` file (expected in a directory named after the
    classifier, next to ``csv_file``) is turned into a prediction-vs-real
    count table and drawn as a heatmap.  A summary of all extremes is written
    to ``<output_dir>/Min_Max.csv``.

    Args:
        csv_file: statistics CSV with "combined_class", "classifier",
            "Number" and one column per statistic; must exist.
        output_dir: directory receiving the heatmaps and ``Min_Max.csv``.

    Raises:
        ValueError: if ``csv_file`` is not an existing file.
    """
    if not os.path.isfile(csv_file):
        raise ValueError(csv_file)

    statistics_data = pandas.read_csv(csv_file)
    base_directory = os.path.dirname(csv_file)
    statistics_names = general.aggregate_confusion_matrix(None)
    classifier_names = sorted(set(statistics_data["classifier"]))

    header = ("combined_class", "classifier", "bacteria", "statistics",
              "type", "value")
    rows = []

    for combined_class in sorted(set(statistics_data["combined_class"])):
        # A "+"-joined part marks merged classes; its number is part of the
        # prediction file name.  Without one the short file name is used.
        merged_parts = [
            part for part in combined_class.split("-vs-") if "+" in part
        ]
        if merged_parts:
            combined_class_num = general.class_to_num(
                merged_parts[0].split("+"))
        else:
            combined_class_num = 0

        for classifier in classifier_names:
            prediction_directory = os.path.join(base_directory, classifier)
            selected_rows = statistics_data.loc[
                (statistics_data["combined_class"] == combined_class)
                & (statistics_data["classifier"] == classifier)]
            if selected_rows.empty:
                # Nothing recorded for this pairing; idxmin/idxmax would fail.
                continue

            for statistics_value in statistics_names:
                scores = selected_rows[statistics_value]
                extremes = (
                    ("minimum", selected_rows.loc[scores.idxmin(), "Number"]),
                    ("maximum", selected_rows.loc[scores.idxmax(), "Number"]),
                )

                for name, value in extremes:
                    if combined_class_num:
                        prediction_name = "Prediction_%s_%d.csv" % (
                            value, combined_class_num)
                    else:
                        prediction_name = "Prediction_%s.csv" % (value)
                    prediction_data = pandas.read_csv(
                        os.path.join(prediction_directory, prediction_name))

                    # crosstab counts (prediction, real) pairs directly and
                    # fills absent pairs with 0.  The former
                    # groupby/size/rename/pivot chain depended on how a given
                    # pandas version names the size column and on positional
                    # pivot arguments.
                    confusion = pandas.crosstab(prediction_data["prediction"],
                                                prediction_data["real"])

                    seaborn.set(context="poster", style="whitegrid")
                    fig, ax = matplotlib.pyplot.subplots(figsize=(24, 24))
                    seaborn.heatmap(confusion, annot=True, ax=ax, robust=True)
                    ax.set_title(
                        combined_class.replace("-", " ") + " with " +
                        statistics_value)
                    fig.savefig(
                        general.check_exist(
                            os.path.join(
                                output_dir, name + "_" + combined_class +
                                "_" + classifier + "_" + statistics_value +
                                ".png")))
                    matplotlib.pyplot.close(fig)

                    rows.append((combined_class, classifier,
                                 "+".join(general.num_to_bacteria(value)),
                                 statistics_value, name, value))

    pandas.DataFrame(rows, columns=header).to_csv(general.check_exist(
        os.path.join(output_dir, "Min_Max.csv")),
                                                  index=False)
Ejemplo n.º 8
0
def draw_statistics(csv_file, output_dir):
    """Draw statistic-versus-feature-count line plots for every class group.

    For each combined class and each statistic listed by
    ``general.aggregate_confusion_matrix(None)`` four plots are written to
    ``output_dir``, one line per classifier:

    * ``Median_*`` -- ``estimator="median"`` with ``ci="sd"``
    * ``Mean_*``   -- seaborn's default estimator and interval
    * ``Min_*``    -- ``estimator=min`` without an interval
    * ``Max_*``    -- ``estimator=max`` without an interval

    Args:
        csv_file: statistics CSV with "combined_class", "classifier",
            "Number" and one column per statistic; must exist.
        output_dir: directory receiving the PNG files.

    Raises:
        ValueError: if ``csv_file`` is not an existing file.
    """
    if not os.path.isfile(csv_file):
        raise ValueError(csv_file)

    statistics_data = pandas.read_csv(csv_file)
    statistics_data["feature_num"] = [
        len(general.num_to_bacteria(number))
        for number in statistics_data["Number"]
    ]

    # Same legend order in every figure, taken from the whole file.
    classifier_order = sorted(set(statistics_data["classifier"]))
    statistics_names = sorted(general.aggregate_confusion_matrix(None))

    # (file-name prefix, extra lineplot arguments), in the order the plots
    # are produced.  The four variants differ only in these arguments.
    plot_variants = (
        ("Median", {"estimator": "median", "ci": "sd"}),
        ("Mean", {}),
        ("Min", {"estimator": min, "ci": None}),
        ("Max", {"estimator": max, "ci": None}),
    )

    for combined_class in sorted(set(statistics_data["combined_class"])):
        selected_data = statistics_data.loc[
            statistics_data["combined_class"] == combined_class]
        title = combined_class.replace("-", " ")

        for statistics_value in statistics_names:
            for prefix, extra_arguments in plot_variants:
                seaborn.set(context="poster", style="whitegrid")
                fig, ax = matplotlib.pyplot.subplots(figsize=(24, 24))
                seaborn.lineplot(x="feature_num",
                                 y=statistics_value,
                                 hue="classifier",
                                 ax=ax,
                                 legend="full",
                                 data=selected_data,
                                 hue_order=classifier_order,
                                 **extra_arguments)
                ax.set_title(title)
                fig.savefig(
                    general.check_exist(
                        os.path.join(
                            output_dir, prefix + "_" + combined_class + "_" +
                            statistics_value + ".png")))
                matplotlib.pyplot.close(fig)
Ejemplo n.º 9
0
def draw_scatter(input_file, output_file):
    """Save a scatter plot of the "AL" column against the "PD" column.

    Points are coloured and shaped by "Classification", with the colours
    assigned in the order of ``general.classes``.

    Args:
        input_file: CSV file holding "AL", "PD" and "Classification".
        output_file: path of the image to write.
    """
    seaborn.set(context="poster", style="whitegrid")

    figure, axes = matplotlib.pyplot.subplots(figsize=(24, 24))

    points = pandas.read_csv(input_file)
    scatter_options = {
        "x": "AL",
        "y": "PD",
        "hue": "Classification",
        "style": "Classification",
        "legend": "full",
        "hue_order": general.classes,
    }
    seaborn.scatterplot(data=points, ax=axes, **scatter_options)

    target = general.check_exist(output_file)
    figure.savefig(target)
    matplotlib.pyplot.close(figure)
Ejemplo n.º 10
0
def draw_tsne(tsne_file=None, png_file=None):
    """Save a scatter plot of a t-SNE embedding stored as CSV.

    "TSNE1" is plotted against "TSNE2"; points are coloured and shaped by
    "Classification".

    Args:
        tsne_file: CSV file with "TSNE1", "TSNE2" and "Classification";
            required and must exist.
        png_file: path of the image to write.  When None, ``tsne.png``
            inside ``default_tsne_directory`` is used.

    Raises:
        ValueError: if ``tsne_file`` is None or is not an existing file.
    """
    if png_file is None:
        png_file = os.path.join(default_tsne_directory, "tsne.png")

    # Both failure modes report the offending value the same way.
    if tsne_file is None or not os.path.isfile(tsne_file):
        raise ValueError(tsne_file)

    seaborn.set(context="poster", style="whitegrid")

    figure, axes = matplotlib.pyplot.subplots(figsize=(24, 24))

    embedding = pandas.read_csv(tsne_file)
    scatter_options = {
        "x": "TSNE1",
        "y": "TSNE2",
        "hue": "Classification",
        "style": "Classification",
        "legend": "full",
    }
    seaborn.scatterplot(data=embedding, ax=axes, **scatter_options)

    target = general.check_exist(png_file)
    figure.savefig(target)
    matplotlib.pyplot.close(figure)