Example #1
    def test_transitive(self):
        original_cache_value = Cache.CACHE_ON
        Cache.CACHE_ON = True
        Cache.cleanup()
        self.assertFalse(
            Cache.is_cached(self.dataset, self.get_transitive_definition()))

        tracer = Tracer()
        tracer.get_metrics(self.dataset.name, self.transitive_technique_name)

        numpy_files_in_cache = list(
            filter(
                lambda f: SIMILARITY_MATRIX_EXTENSION in f,
                os.listdir(Cache.path_to_memory),
            ))

        self.assertEqual(3, len(numpy_files_in_cache))

        def create_name(name: str):
            return self.dataset.name + "_" + name + ".npy"

        self.assertIn(create_name(self.transitive_upper_comp),
                      numpy_files_in_cache)
        self.assertIn(create_name(self.transitive_component_b_name),
                      numpy_files_in_cache)
        self.assertIn(create_name(self.transitive_technique_name),
                      numpy_files_in_cache)

        Cache.cleanup(self.dataset.name)
        Cache.CACHE_ON = original_cache_value
    def test_combined_sampled(self):
        dataset = "SAMPLE_EasyClinic"
        tracer = Tracer()
        Cache.CACHE_ON = True

        metrics_a = tracer.get_metrics(
            dataset, self.combined_sampled_artifacts_technique_name)
        metrics_b = tracer.get_metrics(
            dataset, self.combined_sampled_artifacts_technique_name)

        self.assertNotEqual(metrics_a[0].ap, metrics_b[0].ap)
        self.assertNotEqual(metrics_a[0].auc, metrics_b[0].auc)

        Cache.cleanup(dataset)
    def run(self) -> Table:
        """
        Calculates single-query metrics for the best hybrid and direct techniques on every dataset
        and aggregates them into a single melted metric table.
        :return: metric table with single-query metrics for each technique applied to each dataset
        """
        tracer = Tracer()
        metric_table = MetricTable()

        for dataset_name in DATASET_COLUMN_ORDER:
            hybrid_query_metrics: List[Metrics] = tracer.get_metrics(
                dataset_name, BEST_OVERALL_TECHNIQUE, summary_metrics=False)
            metric_table.add(
                hybrid_query_metrics,
                other={
                    DATASET_COLNAME: dataset_name,
                    TECHNIQUE_TYPE_COLNAME: HYBRID_ID,
                },
                create_index=True,
            )

            direct_query_metrics: List[Metrics] = tracer.get_metrics(
                dataset_name,
                get_best_direct_technique(dataset_name),
                summary_metrics=False,
            )
            metric_table.add(
                direct_query_metrics,
                other={
                    DATASET_COLNAME: dataset_name,
                    TECHNIQUE_TYPE_COLNAME: DIRECT_ID,
                },
                create_index=True,
            )

        individual_queries_aggregate = (metric_table.create_lag_norm_inverted(
            drop_old=True).melt_metrics(
                metric_value_col_name=METRIC_SCORE_COLNAME).sort(
                    DATASET_COLUMN_ORDER).col_values_to_upper(
                        METRIC_COLNAME).save(EXPORT_PATH))

        self.export_paths.append(PATH_TO_INDIVIDUAL_QUERIES_AGG)

        return individual_queries_aggregate
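
The aggregate above is written to EXPORT_PATH; assuming that path points to a .csv file like the other exports in these examples, the result can be loaded back for a quick inspection with plain pandas (a minimal sketch, not part of the original module):

import pandas as pd

aggregate_df = pd.read_csv(EXPORT_PATH)  # EXPORT_PATH is assumed to be a .csv export path
print(aggregate_df.head())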
Example #4
def calculate_technique_metric_table(dataset: str) -> Table:
    """
    Creates a metric table with one row per technique (direct, transitive, and combined), containing
    identifying information for each technique and the default set of accuracy metrics provided by the Tracer engine.
    :param dataset: the name of the dataset
    :return: MetricTable containing the default accuracy metrics for each technique
    """
    tracer = Tracer()
    metric_table = MetricTable()

    techniques = RetrievalTechniques()
    with create_loading_bar(EXPERIMENT_LOADING_MESSAGE,
                            techniques,
                            length=len(techniques)) as techniques:
        for t_name, t_entry in techniques:
            t_entry.update({NAME_COLNAME: t_name})
            t_metrics = tracer.get_metrics(dataset, t_name)
            metric_table.add(t_metrics, t_entry)

    return metric_table
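
A minimal usage sketch for the function above; the dataset name is illustrative, and metric_table.table is assumed to expose the underlying pandas DataFrame as it does in the later examples:

if __name__ == "__main__":
    metric_table = calculate_technique_metric_table("EasyClinic")  # illustrative dataset name
    print(metric_table.table)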
Example #5
    def test_cleanup_deletes_on_dataset(self):
        original_cache_value = Cache.CACHE_ON
        Cache.CACHE_ON = True

        dataset_other_name = "SAMPLE_EasyClinic"
        dataset_other = Dataset(dataset_other_name)

        tracer = Tracer()
        tracer.get_metrics(dataset_other_name, self.direct_technique_name)
        tracer.get_metrics(self.dataset.name, self.direct_technique_name)

        self.assertTrue(
            Cache.is_cached(dataset_other, self.get_direct_definition()))
        self.assertTrue(
            Cache.is_cached(self.dataset, self.get_direct_definition()))

        Cache.cleanup(self.dataset.name)

        self.assertTrue(
            Cache.is_cached(dataset_other, self.get_direct_definition()))
        self.assertFalse(
            Cache.is_cached(self.dataset, self.get_direct_definition()))

        Cache.cleanup(dataset_other_name)

        self.assertFalse(
            Cache.is_cached(dataset_other, self.get_direct_definition()))
        self.assertFalse(
            Cache.is_cached(self.dataset, self.get_direct_definition()))

        Cache.CACHE_ON = original_cache_value
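
The tests above save and restore Cache.CACHE_ON by hand; a small helper like the following (a sketch, not part of the Cache API) would keep the flag from leaking if an assertion fails mid-test:

from contextlib import contextmanager


@contextmanager
def cache_enabled(enabled: bool = True):
    """Temporarily set Cache.CACHE_ON, restoring the previous value on exit."""
    previous_value = Cache.CACHE_ON
    Cache.CACHE_ON = enabled
    try:
        yield
    finally:
        Cache.CACHE_ON = previous_value

Used as "with cache_enabled(): ...", the finally branch restores the original value even when an assertion inside the block raises.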
Example #6
import pandas as pd
from sklearn.preprocessing import minmax_scale

from api.datasets.dataset import Dataset
from api.tracer import Tracer
from experiments.evaluate_paths import change_paths_in_technique
from utilities.constants import PATH_TO_EXPLORATORY
from utilities.technique_extractors import (
    get_best_direct_technique,
    get_best_hybrid_technique,
    get_best_transitive_technique,
)

if __name__ == "__main__":
    dataset_name = "Drone"
    tracer = Tracer()

    direct_technique = get_best_direct_technique(dataset_name)
    transitive_technique = get_best_transitive_technique(dataset_name)
    hybrid_technique = get_best_hybrid_technique(dataset_name)
    new_path = ["0", "2", "1"]
    techniques = [direct_technique, transitive_technique, hybrid_technique]
    techniques = [change_paths_in_technique(t, new_path) for t in techniques]
    matrices = [
        tracer.get_technique_data(dataset_name, t).similarity_matrix
        for t in techniques
    ]
    matrices = list(map(minmax_scale, matrices))

    def get_group(percentile):
        if percentile < 1 / 3:

import numpy as np

from api.tracer import Tracer
from utilities.technique_extractors import (
    get_best_direct_technique,
    get_best_hybrid_technique,
    get_best_transitive_technique,
)

if __name__ == "__main__":
    tracer = Tracer()
    d_name = "EasyClinic"
    direct_technique = get_best_direct_technique(d_name)
    transitive_technique = get_best_transitive_technique(d_name)
    hybrid_technique = get_best_hybrid_technique(d_name)
    """
    Direct
    """
    direct_score = tracer.get_metrics(d_name, direct_technique)[0].ap
    direct_individual_metrics = tracer.get_metrics(d_name,
                                                   direct_technique,
                                                   summary_metrics=False)
    direct_scores = [m.ap for m in direct_individual_metrics]
    print(f"Direct: {direct_score}:{np.mean(direct_scores)}")
    """
    Transitive
    """
    transitive_score = tracer.get_metrics(d_name, transitive_technique)[0].ap
    transitive_individual_metrics = tracer.get_metrics("EasyClinic",
                                                       transitive_technique,
                                                       summary_metrics=False)

from experiments import create_metric_table

if __name__ == "__main__":
    dataset_name = "Drone"

    # techniques
    direct_am = AlgebraicModel.VSM
    transitive_am = AlgebraicModel.VSM
    transitive_scaling = ScalingMethod.INDEPENDENT
    transitive_aggregation = AggregationMethod.MAX
    technique_aggregation = AggregationMethod.MAX
    trace_type = ExperimentTraceType.NONE
    n_top_intermediate_artifacts = 3

    # get technique similarity values
    tracer = Tracer()

    # direct technique
    direct_technique_name = create_metric_table.create_direct_definition(
        direct_am)
    direct_technique_data = tracer.get_technique_data(dataset_name,
                                                      direct_technique_name)
    direct_scoring_table = direct_technique_data.get_scoring_table()
    direct_scores = direct_scoring_table.values

    # transitive technique
    transitive_technique_name = create_metric_table.create_transitive_definition(
        transitive_am, transitive_scaling, transitive_aggregation, trace_type)
    transitive_technique_data: TransitiveTechniqueData = tracer.get_technique_data(
        dataset_name, transitive_technique_name)
    transitive_scoring_table = transitive_technique_data.get_scoring_table()
Example #9
import os

import pandas as pd
from sklearn.preprocessing import minmax_scale

from api.datasets.dataset import Dataset
from api.tracer import Tracer
from utilities.constants import PATH_TO_DATA
from utilities.technique_extractors import get_best_transitive_technique

if __name__ == "__main__":
    good_dataset_name = "TrainController"

    EXPORT_PATH = os.path.join(PATH_TO_DATA, "presentation",
                               "similarity_distribution.csv")

    good_transitive_technique = get_best_transitive_technique(
        good_dataset_name)

    tracer = Tracer()
    technique_data = tracer.get_technique_data(good_dataset_name,
                                               good_transitive_technique)
    metrics = tracer.get_metrics(good_dataset_name,
                                 good_transitive_technique,
                                 summary_metrics=False)
    sorted_metrics = sorted(metrics, key=lambda m: m.ap)
    N_QUERIES = 5
    bad_queries = [m.query_id for m in sorted_metrics[:N_QUERIES]]
    good_queries = [m.query_id for m in sorted_metrics[-N_QUERIES:]]
    similarity_matrix = minmax_scale(technique_data.similarity_matrix)
    oracle_matrix = Dataset(good_dataset_name).traced_matrices["0-2"]

    data = pd.DataFrame()

    for g_query in good_queries:
Example #10
    #     #  ExperimentTraceType.NONE,
    #     AlgebraicModel.VSM,
    #     ScalingMethod.INDEPENDENT,
    #     AggregationMethod.MAX,
    #     ExperimentTraceType.NONE,
    #     # AggregationMethod.SUM,
    # )
    technique_name = create_transitive_definition(
        AlgebraicModel.VSM,
        ScalingMethod.INDEPENDENT,
        AggregationMethod.MAX,
        ExperimentTraceType.NONE,
    )

    # get technique similarity values
    tracer = Tracer()
    technique_data = tracer.get_technique_data(dataset_name, technique_name)
    scoring_table = technique_data.get_scoring_table()
    scores = scoring_table.values
    precision, recall, thresholds = precision_recall_curve(
        probas_pred=scores[:, 0], y_true=scores[:, 1], pos_label=1)

    # Create result technique_data frame
    metrics_df = pd.DataFrame()
    metrics_df["precision"] = precision
    metrics_df["recall"] = recall
    metrics_df["thresholds"] = list(thresholds) + [0]
    metrics_df.to_csv("metrics.csv", index=False)

    # get precision at some level or higher
    recall_value = 0.165
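
    # The listing stops here; a plausible continuation (an assumption, not from the source)
    # would report the best precision reachable at recall_value or higher:
    eligible = metrics_df[metrics_df["recall"] >= recall_value]
    print("Precision at recall >= 0.165:", eligible["precision"].max())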

if __name__ == "__main__":
    # TODO: Move this to be an experiment
    datasets = ["WARC", "EBT", "EasyClinic", "Drone", "TrainController"]
    WORD_INTERSECTION_EXPORT_PATH = os.path.join(
        PATH_TO_PRESENTATION, "word_intersection.csv"
    )
    MAX_N_WORDS = 10
    df = pd.DataFrame()
    for d_name in datasets:
        d = Dataset(d_name)
        direct_technique = get_best_direct_technique(d_name)
        transitive_technique = get_best_transitive_technique(d_name)

        tracer = Tracer()
        direct_similarity_matrix = tracer.get_technique_data(
            d_name, direct_technique
        ).similarity_matrix
        transitive_similarity_matrix = tracer.get_technique_data(
            d_name, transitive_technique
        ).similarity_matrix
        for n_intersection_words in range(MAX_N_WORDS):
            non_intersection_artifact_indices = (
                get_artifact_indices_with_word_intersection(d, n_intersection_words)
            )
            if len(non_intersection_artifact_indices) == 0:
                continue

            ranks = compare_metrics_on_queries(
                non_intersection_artifact_indices,
Example #12
        dataset.artifacts[1].iloc[best_intermediate_artifact_indices]["id"])
    print("Most influential intermediate artifacts:",
          best_intermediate_artifact_ids)
    print("Intermediate scores:", sorted_values[:n_artifact])
    print("Intermediate Sum", sum(intermediate_values))
    print("Intermediate Max:", max(intermediate_values))


if __name__ == "__main__":
    Cache.CACHE_ON = False
    d_name = "EBT"
    direct_t_name = get_best_direct_technique(d_name)
    transitive_t_name = get_best_transitive_technique(d_name)
    hybrid_t_name = get_best_hybrid_technique(d_name)

    tracer = Tracer()
    direct_technique_data = tracer.get_technique_data(d_name, direct_t_name)
    transitive_technique_data = tracer.get_technique_data(
        d_name, transitive_t_name)
    hybrid_technique_data = tracer.get_technique_data(d_name, hybrid_t_name)
    hybrid_metrics = tracer.get_metrics(d_name,
                                        hybrid_t_name,
                                        summary_metrics=False)
    data_labels = ["direct", "transitive", "hybrid"]
    data = [
        direct_technique_data, transitive_technique_data, hybrid_technique_data
    ]
    matrices = list(map(lambda d: d.similarity_matrix, data))

    worst_query_index, ap_score = get_worst_query(hybrid_metrics)
    print("Hybrid Technique:", hybrid_t_name)
Example #13
from api.tables.metric_table import MetricTable
from api.tracer import Tracer

if __name__ == "__main__":
    dataset_name = "EasyClinic"
    direct_technique = "(. (LSI NT) (0 2))"
    transitive_technique = "(x (PCA GLOBAL) ((. (LSI NT) (0 1)) (. (LSI NT) (1 2))))"
    hybrid_technique = f"(o (MAX) ({direct_technique} {transitive_technique}))"

    technique_definitions = [
        ("direct", direct_technique),
        ("transitive", transitive_technique),
        ("hybrid", hybrid_technique),
    ]

    metric_table = MetricTable()
    tracer = Tracer()

    for t_name, t_def in technique_definitions:
        t_metrics = tracer.get_metrics(dataset_name, t_def)
        metric_table.add(t_metrics, {"name": t_name})

    print(metric_table.table)
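
    # If the comparison should be persisted rather than only printed, and assuming MetricTable
    # exposes the same save(path) used at the end of the chained calls in the other examples,
    # an illustrative export would be:
    metric_table.save("easyclinic_best_techniques.csv")  # illustrative path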
Example #14
DATASET_NAME = "IllustrativeExample"
TOP_TECHNIQUE_NAME = "(. (VSM NT) (0 1))"
BOTTOM_TECHNIQUE_NAME = "(. (VSM NT) (1 2))"
DIRECT_TECHNIQUE_NAME = "(. (VSM NT) (0 2))"
TECHNIQUE_NAME = "(x (MAX INDEPENDENT) ((. (VSM NT) (0 1)) (. (VSM NT) (1 2))))"
REBUILD = False

if __name__ == "__main__":
    if REBUILD:
        dataset_builder = DatasetBuilder(DATASET_NAME)
        dataset_builder.build()
        dataset_builder.export()

    dataset = Dataset(DATASET_NAME)

    tracer = Tracer()
    top_technique_data = tracer.get_technique_data(DATASET_NAME,
                                                   TOP_TECHNIQUE_NAME)
    bottom_technique_data = tracer.get_technique_data(DATASET_NAME,
                                                      BOTTOM_TECHNIQUE_NAME)
    direct_technique_data = tracer.get_technique_data(DATASET_NAME,
                                                      DIRECT_TECHNIQUE_NAME)

    top_score = top_technique_data.similarity_matrix[0][0]
    bottom_score = bottom_technique_data.similarity_matrix[0][0]
    transitive_score = top_score * bottom_score
    direct_score = direct_technique_data.similarity_matrix[0][0]

    print("TOP:", top_score)
    print("BOTTOM:", bottom_score)
    print("TRANSITIVE:", transitive_score)
    def run(self) -> Table:
        """
        Calculates single-query metrics for the best direct, transitive, and hybrid techniques on a
        user-selected dataset, exports the individual run, and updates the aggregate tables.
        :return: aggregated metric table with single-query metrics for each technique type
        """
        dataset_name = prompt_for_dataset()

        """
        Find best techniques
        """
        direct_best_definition = get_best_direct_technique(dataset_name)
        transitive_best_definition = get_best_transitive_technique(dataset_name)
        combined_best_definition = get_best_hybrid_technique(dataset_name)

        """
        Calculate metrics for individual queries on dataset
        """
        tracer = Tracer()
        metric_table = MetricTable()

        direct_metrics: List[Metrics] = tracer.get_metrics(
            dataset_name, direct_best_definition, summary_metrics=False
        )
        metric_table.add(
            direct_metrics, other={TECHNIQUE_TYPE_COLNAME: DIRECT_ID}, create_index=True
        )

        transitive_metrics: List[Metrics] = tracer.get_metrics(
            dataset_name, transitive_best_definition, summary_metrics=False
        )
        metric_table.add(
            transitive_metrics,
            other={TECHNIQUE_TYPE_COLNAME: TRANSITIVE_ID},
            create_index=True,
        )

        combined_metrics: List[Metrics] = tracer.get_metrics(
            dataset_name, combined_best_definition, summary_metrics=False
        )
        metric_table.add(
            combined_metrics,
            other={TECHNIQUE_TYPE_COLNAME: HYBRID_ID},
            create_index=True,
        )

        """
        Export individual run
        """
        export_path = os.path.join(PATH_TO_INDIVIDUAL_QUERIES, dataset_name + ".csv")
        (metric_table.sort(DATASET_COLUMN_ORDER).save(export_path))
        self.export_paths.append(export_path)

        """
        Update aggregate
        """

        individual_queries_aggregate = (
            MetricTable(
                Table.aggregate_intermediate_files(PATH_TO_INDIVIDUAL_QUERIES).table
            )
            .create_lag_norm_inverted(drop_old=True)
            .melt_metrics(metric_value_col_name=METRIC_SCORE_COLNAME)
            .sort(DATASET_COLUMN_ORDER)
            .col_values_to_upper(METRIC_COLNAME)
            .to_title_case(exclude=METRIC_COLNAME)
            .save(PATH_TO_INDIVIDUAL_QUERIES_AGG)
        )

        individual_queries_aggregate = (
            MetricTable(
                Table.aggregate_intermediate_files(PATH_TO_INDIVIDUAL_QUERIES).table
            )
            .create_lag_norm_inverted(drop_old=True)
            .sort(DATASET_COLUMN_ORDER)
            .save(PATH_TO_INDIVIDUAL_QUERIES_UNMELTED)
        )

        # aggregate_table
        self.export_paths.append(PATH_TO_INDIVIDUAL_QUERIES_AGG)

        return individual_queries_aggregate
Example #16
    def run(self) -> Table:
        tracer = Tracer()

        def get_metrics(d_name, t_def: str):
            return tracer.get_metrics(d_name, t_def)

        def add_metrics(d_name, t_def: str, t_type: str, p_name: str):
            t_metrics = get_metrics(d_name, t_def)
            metric_table.add(
                t_metrics,
                {
                    DATASET_COLNAME: d_name,
                    "path": p_name,
                    "type": t_type,
                    NAME_COLNAME: t_def,
                },
            )

        aggregate_gain = None
        aggregate_metric = None
        for path in POSSIBLE_PATHS:
            metric_table = MetricTable()
            comparison_dict = {}
            path_name = path_to_str(path)

            for dataset_name in DATASET_COLUMN_ORDER:
                source_index = str(path[0])
                intermediate_index = str(path[1])
                target_index = str(path[2])

                new_path = [source_index, intermediate_index, target_index]

                # direct
                direct_technique_def = change_paths_in_technique(
                    get_best_direct_technique(dataset_name), new_path)
                add_metrics(
                    dataset_name,
                    direct_technique_def,
                    DIRECT_ID,
                    path_name,
                )

                # transitive
                transitive_technique_def = change_paths_in_technique(
                    get_best_transitive_technique(dataset_name), new_path)
                add_metrics(
                    dataset_name,
                    transitive_technique_def,
                    TRANSITIVE_ID,
                    path_name,
                )

                # HYBRID
                hybrid_technique_definition = change_paths_in_technique(
                    get_best_hybrid_technique(dataset_name), new_path)
                add_metrics(
                    dataset_name,
                    hybrid_technique_definition,
                    HYBRID_ID,
                    path_name,
                )
                comparison_dict.update({
                    dataset_name:
                    (direct_technique_def, hybrid_technique_definition)
                })
            gain_table = metric_table.calculate_gain_between_techniques(
                comparison_dict)
            gain_table.table["path"] = path_name

            aggregate_gain = (gain_table.table if aggregate_gain is None else
                              pd.concat([gain_table.table, aggregate_gain]))

            aggregate_metric = (metric_table.table
                                if aggregate_metric is None else pd.concat(
                                    [metric_table.table, aggregate_metric]))

            MetricTable(aggregate_metric).create_lag_norm_inverted(
                drop_old=True).melt_metrics().save(METRIC_TABLE_EXPORT_PATH)
            self.export_paths.append(METRIC_TABLE_EXPORT_PATH)

            MetricTable(aggregate_gain).melt_metrics().save(
                GAIN_TABLE_EXPORT_PATH)
            self.export_paths.append(GAIN_TABLE_EXPORT_PATH)
        return aggregate_gain