Example 1
def save_metrics(results):
    # One result dict becomes a single-row DataFrame
    dataframe = pd.DataFrame.from_dict([results])
    existing_logs = file.get_eval_logs()
    if existing_logs is not None:
        # Append the new row to the previously saved evaluation log
        dataframe = pd.concat([existing_logs, dataframe])
    file.save_eval_logs(dataframe)
    file.save_result_log(results)
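The same append-or-create pattern can be sketched with plain pandas. This is a minimal, self-contained version that assumes CSV storage (the project's file.get_eval_logs/file.save_eval_logs helpers may persist the log differently; the column names are taken from the other examples, the values are placeholders):

import os

import pandas as pd


def append_row(results, path="eval_log.csv"):
    # Build a one-row frame, prepend any existing log, and write everything back
    row = pd.DataFrame([results])
    if os.path.exists(path):
        row = pd.concat([pd.read_csv(path), row])
    row.to_csv(path, index=False)


append_row({"wiki_enabled": True, "window_size": 15,
            "raw_count": "idf_wiki_norm", "threshold": 3, "accuracy": 0.87})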
Example 2
def remove_wrongs(edges):
    for dataset, counts in edges.items():
        # Thresholds above this value are treated as invalid runs
        max_nonzero = len(counts) - 2
        results_log = file.get_eval_logs(dataset=dataset)
        # Re-flag the offending rows as non-wiki and persist the cleaned log
        indices = results_log[results_log["threshold"] > max_nonzero].index
        results_log.loc[indices, "wiki_enabled"] = False
        file.save_eval_logs(results_log, dataset=dataset)
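The core of remove_wrongs is a conditional column update through an index mask; the same pandas pattern in a self-contained toy example (the data values are made up):

import pandas as pd

log = pd.DataFrame({"threshold": [1, 5, 9], "wiki_enabled": [True, True, True]})
max_nonzero = 6

# Rows whose threshold exceeds the cutoff get their flag flipped in place
indices = log[log["threshold"] > max_nonzero].index
log.loc[indices, "wiki_enabled"] = False
print(log)  # only the threshold-9 row now has wiki_enabled == False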
Example 3
def count_model_runs(dataset):
    results = file.get_eval_logs(dataset=dataset)

    # Count how often each configuration was run; keys have the form
    # "wiki_enabled:window_size:raw_count:threshold"
    count_dict = {}
    for _, row in results.iterrows():
        if not row["wiki_enabled"]:
            name = f"{row['wiki_enabled']}:0:empty:0"
        else:
            name = f"{row['wiki_enabled']}:{row['window_size']}:{row['raw_count']}:{row['threshold']}"
        count_dict[name] = count_dict.get(name, 0) + 1

    file.save_result_log_counts(count_dict, dataset)
    return count_dict
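The counting loop amounts to building a collections.Counter over per-row configuration keys. A standalone sketch with a toy frame (column names mirror the log columns used above; the values are invented):

from collections import Counter

import pandas as pd

runs = pd.DataFrame({
    "wiki_enabled": [True, True, False],
    "window_size": [15, 15, 15],
    "raw_count": ["idf", "idf", "count"],
    "threshold": [2, 2, 0],
})


def run_key(row):
    # Non-wiki runs collapse to a single key, mirroring count_model_runs
    if not row["wiki_enabled"]:
        return f"{row['wiki_enabled']}:0:empty:0"
    return f"{row['wiki_enabled']}:{row['window_size']}:{row['raw_count']}:{row['threshold']}"


count_dict = Counter(run_key(row) for _, row in runs.iterrows())
print(count_dict)  # Counter({'True:15:idf:2': 2, 'False:0:empty:0': 1})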
Example 4
def perform_ttest(dataset, count_dict):
    desired_p_val = 0.05
    results_log = file.get_eval_logs(dataset=dataset)
    baseline = results_log[results_log["wiki_enabled"] == False].nlargest(
        10, columns="accuracy")
    base_accuracies = baseline["accuracy"].tolist()
    t_dict = {}
    for key, value in count_dict.items():
        # Keys are formatted as "wiki_enabled:window_size:raw_count:threshold"
        params = key.split(":")
        wiki_enabled = params[0] == "True"
        edge_type = params[2]
        threshold = int(params[3])

        if wiki_enabled and value >= 10:
            test = results_log[(results_log["raw_count"] == edge_type)
                               & (results_log["threshold"] == threshold) &
                               (results_log["wiki_enabled"] == True)]
            test_accuracies = test["accuracy"].tolist()
            assert len(base_accuracies) == len(
                test_accuracies
            ), f"{len(base_accuracies)} != {len(test_accuracies)}"
            # Independent-samples t tests compare scores on the same variable but for two different groups of cases
            t_stat_ind, p_val_ind = stats.ttest_ind(test_accuracies,
                                                    base_accuracies)
            # Paired t-tests compare scores on two different variables but for the same group of cases
            t_stat_rel, p_val_rel = stats.ttest_rel(test_accuracies,
                                                    base_accuracies)
            t_dict[f"{edge_type}:{threshold}"] = {
                "ind":
                [p_val_ind, "True" if p_val_ind < desired_p_val else "False"],
                "rel":
                [p_val_rel, "True" if p_val_rel < desired_p_val else "False"]
            }
            io.write_json(f"{io.get_latex_path(dataset)}/{dataset}_ttest.json",
                          t_dict)
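For reference, the two SciPy calls differ in their pairing assumption: ttest_ind treats the samples as two independent groups, while ttest_rel compares them element-wise, which is why the assert above enforces equal lengths. A standalone toy comparison (the accuracy values are placeholders):

from scipy import stats

baseline = [0.80, 0.81, 0.79, 0.82, 0.80]
candidate = [0.83, 0.84, 0.82, 0.85, 0.83]

# Independent-samples test: no pairing between the two lists is assumed
t_ind, p_ind = stats.ttest_ind(candidate, baseline)
# Paired test: works on element-wise differences, so lengths must match
t_rel, p_rel = stats.ttest_rel(candidate, baseline)

print(f"independent: t={t_ind:.3f}, p={p_ind:.4f}")
print(f"paired:      t={t_rel:.3f}, p={p_rel:.4f}")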
Example 5
def get_base_lowest(dataset, n=5):
    results_log = file.get_eval_logs(dataset=dataset)
    # The n lowest-accuracy baseline (non-wiki) runs
    minimum = results_log[results_log["wiki_enabled"] == False].nsmallest(n, columns="accuracy")
    # Empty placeholder so the function still returns a (maximum, minimum) pair
    maximum = pd.DataFrame()
    return maximum, minimum
Example 6
def get_max_min_values(dataset, type, n=5):
    # `type` is the edge type to filter on (note that it shadows the builtin)
    results_log = file.get_eval_logs(dataset=dataset)
    results_log = results_log[results_log["raw_count"] == type]
    maximum = results_log.nlargest(n, columns="accuracy")
    minimum = results_log.nsmallest(n, columns="accuracy")
    return maximum, minimum
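Both of the last two helpers lean on pandas' nlargest/nsmallest, which return only the n best or worst rows by a column. A self-contained illustration with invented values:

import pandas as pd

df = pd.DataFrame({
    "raw_count": ["idf", "idf", "count", "idf"],
    "accuracy": [0.81, 0.86, 0.79, 0.84],
})

subset = df[df["raw_count"] == "idf"]
print(subset.nlargest(2, columns="accuracy"))   # the two best idf runs
print(subset.nsmallest(2, columns="accuracy"))  # the two worst idf runs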
Example 7
def plot_metric(dataset, metric="accuracy"):
    results = file.get_eval_logs(dataset=dataset)
    results_filtered = file.get_eval_logs(dataset=dataset, version="filtered")
    base = results_filtered[(results_filtered["wiki_enabled"] == False)
                            & (results_filtered["window_size"] == 15)][metric]
    base_mean = base.mean()
    base_std = base.std()

    results = results[results["wiki_enabled"] == True]

    if FLAGS.version == "unfiltered":
        base_results = file.get_eval_logs(dataset=dataset, version="filtered")
        base_results = base_results[base_results["wiki_enabled"] == False]
        base_mean = base_results[metric].mean()

    order = [
        "count", "count_norm", "count_norm_pmi", "idf", "idf_norm",
        "idf_norm_pmi", "idf_wiki", "idf_wiki_norm", "idf_wiki_norm_pmi"
    ]

    g = sns.FacetGrid(data=results,
                      col="raw_count",
                      col_wrap=3,
                      col_order=order,
                      sharex=False,
                      sharey=True)
    g.map(sns.lineplot,
          "threshold",
          metric,
          ci="sd",
          err_style="bars",
          markers=True,
          dashes=False,
          color="black")
    g.set_titles(row_template='{row_name}', col_template='{col_name}')
    max_threshold = results["threshold"].max() + 1
    if dataset == "ohsumed":
        max_threshold = 17
    g.fig.set_figwidth(15)
    g.set_axis_labels("Minimum Relation Count Threshold", "Accuracy")

    color = "black"
    for x in range(0, len(g.axes)):
        ax = g.axes[x]
        title = ax.get_title().title().replace("_", "-").replace(
            "Idf", "TF-IDF").replace("Pmi", "PMI")
        ax.set_title(title)
        ax.set_xticks(range(1, max_threshold))
        # ax.text(x=1, y=base_mean, s='textGCN average', alpha=0.7, color=color)
        ax.axhline(y=base_mean,
                   color=color,
                   linewidth=1.5,
                   alpha=.3,
                   ls="--",
                   label="textGCN baseline")
        # ax.axhline(y=base_mean + base_std, color=color, linewidth=1, alpha=.3, ls="--")
        # ax.axhline(y=base_mean - base_std, color=color, linewidth=1, alpha=.3, ls="--")

    g.savefig(
        f"{io.get_basic_plots_path(dataset)}/{dataset}_{metric}_{FLAGS.version}.png"
    )
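A stripped-down version of the same FacetGrid pattern, run on synthetic data instead of the evaluation log (the column names and the baseline value are placeholders), shows the plotting skeleton on its own:

import numpy as np
import pandas as pd
import seaborn as sns

rng = np.random.default_rng(0)
toy = pd.DataFrame({
    "raw_count": np.repeat(["count", "idf"], 20),
    "threshold": list(range(1, 11)) * 4,
    "accuracy": 0.8 + 0.01 * rng.standard_normal(40),
})

# One facet per edge type, a line per threshold, plus a dashed baseline
g = sns.FacetGrid(data=toy, col="raw_count", col_wrap=2, sharey=True)
g.map(sns.lineplot, "threshold", "accuracy", color="black")
g.set_axis_labels("Minimum Relation Count Threshold", "Accuracy")
for ax in g.axes.flat:
    ax.axhline(y=0.8, color="black", ls="--", alpha=0.3, label="baseline")
g.savefig("toy_facet.png")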
Example 8
def get_results_statistics(dataset, metric="accuracy"):
    results_log = file.get_eval_logs(dataset=dataset)
    thresholds = set(
        results_log[results_log["wiki_enabled"]]["threshold"].tolist())
    types = [
        "count", "count_norm", "count_norm_pmi", "idf", "idf_norm",
        "idf_norm_pmi", "idf_wiki", "idf_wiki_norm", "idf_wiki_norm_pmi"
    ]
    t_vals = io.read_json(f"{io.get_latex_path(dataset)}/{dataset}_ttest.json")

    max_mean = 0
    max_key = ""
    for t in thresholds:
        for r in types:
            mean = results_log[(results_log["wiki_enabled"] == True)
                               & (results_log["window_size"] == 15) &
                               (results_log["threshold"] == t) &
                               (results_log["raw_count"] == r)][metric].mean()
            if mean > max_mean:
                max_mean = mean
                max_key = f"{r.lower().replace('-', '_')}:{t}"

    results_dict = {}
    for t in thresholds:
        averages = []
        for r in types:
            t_results = results_log[(results_log["wiki_enabled"] == True)
                                    & (results_log["window_size"] == 15) &
                                    (results_log["threshold"] == t) &
                                    (results_log["raw_count"] == r)][metric]
            average = "%.4f" % round(t_results.mean(), 4)

            std_dev = "%.4f" % round(t_results.std(), 4)
            key = f"{r.lower().replace('-', '_')}:{t}"
            if key in t_vals:
                is_significant = t_vals[key]["rel"][1] == "True"
            else:
                is_significant = True
            is_max = key == max_key
            if is_max:
                averages.append("$\mathbf{" + average + " \pm " + std_dev +
                                f"{'' if is_significant else '^*'}" + "}$")
            else:
                averages.append("$" + average + " \pm " + std_dev +
                                f"{'' if is_significant else '^*'}" + "$")
        results_dict[t] = averages

    rows = []
    for threshold, values in results_dict.items():
        rows.append([threshold] + values)

    filtered_results = file.get_eval_logs(dataset=dataset, version="filtered")
    base_avg = "%.4f" % round(
        filtered_results[filtered_results["wiki_enabled"] == False]
        [metric].mean(), 4)
    base_std = "%.4f" % round(
        filtered_results[filtered_results["wiki_enabled"] == False]
        [metric].std(), 4)

    base = ["textKGCN (none)"]
    for x in range(0, 9):
        base.append(f"${base_avg} \pm {base_std}$")
    rows.append(base)

    header = ["Threshold"]
    [
        header.append(t.title().replace("_",
                                        "-").replace("Idf", "TF-IDF").replace(
                                            "Pmi", "PMI")) for t in types
    ]

    get_latex_code(
        header, rows, "c|ccc|ccc|ccc", f"{dataset}_{metric}_table.txt",
        dataset, f"Classification accuracy {dataset.upper()} dataset",
        f"Text classification accuracy of the {dataset.upper()} dataset for different thresholds and edge types. "
        + r"Values marked with $^*$ did not outperform \emph{textKGCN (none)} significantly based on a Student's t-test (p < 0.05)."
    )
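The LaTeX cells above concatenate raw fragments by hand; the same formatting can be wrapped in a small helper that makes the \pm / \mathbf handling explicit. A sketch with placeholder numbers, not part of the original code:

def format_cell(mean, std, bold=False, significant=True):
    # One "$mean \pm std$" cell, optionally bold and/or starred as not significant
    body = rf"{mean:.4f} \pm {std:.4f}" + ("" if significant else "^*")
    return rf"$\mathbf{{{body}}}$" if bold else rf"${body}$"


print(format_cell(0.8731, 0.0042, bold=True))          # $\mathbf{0.8731 \pm 0.0042}$
print(format_cell(0.8514, 0.0051, significant=False))  # $0.8514 \pm 0.0051^*$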