Example #1
    def test_cleanup_deletes_on_dataset(self):
        """Verifies Cache.cleanup(dataset_name) removes only that dataset's cached entries."""
        original_cache_value = Cache.CACHE_ON
        Cache.CACHE_ON = True

        dataset_other_name = "SAMPLE_EasyClinic"
        dataset_other = Dataset(dataset_other_name)

        tracer = Tracer()
        tracer.get_metrics(dataset_other_name, self.direct_technique_name)
        tracer.get_metrics(self.dataset.name, self.direct_technique_name)

        self.assertTrue(
            Cache.is_cached(dataset_other, self.get_direct_definition()))
        self.assertTrue(
            Cache.is_cached(self.dataset, self.get_direct_definition()))

        Cache.cleanup(self.dataset.name)

        self.assertTrue(
            Cache.is_cached(dataset_other, self.get_direct_definition()))
        self.assertFalse(
            Cache.is_cached(self.dataset, self.get_direct_definition()))

        Cache.cleanup(dataset_other_name)

        self.assertFalse(
            Cache.is_cached(dataset_other, self.get_direct_definition()))
        self.assertFalse(
            Cache.is_cached(self.dataset, self.get_direct_definition()))

        Cache.CACHE_ON = original_cache_value
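The save-and-restore pattern around Cache.CACHE_ON recurs in every test on this page. A minimal sketch of a context manager that encapsulates it; the cache_enabled helper is hypothetical, not part of the Tracer API:

from contextlib import contextmanager

@contextmanager
def cache_enabled(enabled: bool = True):
    # Temporarily toggle Cache.CACHE_ON, restoring the original value on
    # exit even if the wrapped body raises.
    original_cache_value = Cache.CACHE_ON
    Cache.CACHE_ON = enabled
    try:
        yield
    finally:
        Cache.CACHE_ON = original_cache_value

With it, the test above could wrap its body in a single "with cache_enabled():" block instead of doing the bookkeeping by hand.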
Example #2
    def test_transitive(self):
        """Verifies a transitive technique caches a similarity matrix for each component and for itself."""
        original_cache_value = Cache.CACHE_ON
        Cache.CACHE_ON = True
        Cache.cleanup()
        self.assertFalse(
            Cache.is_cached(self.dataset, self.get_transitive_definition()))

        tracer = Tracer()
        tracer.get_metrics(self.dataset.name, self.transitive_technique_name)

        numpy_files_in_cache = list(
            filter(
                lambda f: SIMILARITY_MATRIX_EXTENSION in f,
                os.listdir(Cache.path_to_memory),
            ))

        self.assertEqual(3, len(numpy_files_in_cache))

        def create_name(name: str):
            return self.dataset.name + "_" + name + ".npy"

        self.assertIn(create_name(self.transitive_upper_comp),
                      numpy_files_in_cache)
        self.assertIn(create_name(self.transitive_component_b_name),
                      numpy_files_in_cache)
        self.assertIn(create_name(self.transitive_technique_name),
                      numpy_files_in_cache)

        Cache.cleanup(self.dataset.name)
        Cache.CACHE_ON = original_cache_value
Example #3
    def test_combined_sampled(self):
        """Verifies sampled techniques yield different metrics across runs, even with caching enabled."""
        dataset = "SAMPLE_EasyClinic"
        tracer = Tracer()
        original_cache_value = Cache.CACHE_ON
        Cache.CACHE_ON = True

        metrics_a = tracer.get_metrics(
            dataset, self.combined_sampled_artifacts_technique_name)
        metrics_b = tracer.get_metrics(
            dataset, self.combined_sampled_artifacts_technique_name)

        self.assertNotEqual(metrics_a[0].ap, metrics_b[0].ap)
        self.assertNotEqual(metrics_a[0].auc, metrics_b[0].auc)

        Cache.cleanup(dataset)
        Cache.CACHE_ON = original_cache_value
Example #4
    def run(self) -> Table:
        """
        Calculates single-query metrics for the best hybrid and direct techniques on each dataset.
        :return: metric table with one row per query for each technique applied to each dataset
        """
        tracer = Tracer()
        metric_table = MetricTable()

        for dataset_name in DATASET_COLUMN_ORDER:
            hybrid_query_metrics: List[Metrics] = tracer.get_metrics(
                dataset_name, BEST_OVERALL_TECHNIQUE, summary_metrics=False)
            metric_table.add(
                hybrid_query_metrics,
                other={
                    DATASET_COLNAME: dataset_name,
                    TECHNIQUE_TYPE_COLNAME: HYBRID_ID,
                },
                create_index=True,
            )

            direct_query_metrics: List[Metrics] = tracer.get_metrics(
                dataset_name,
                get_best_direct_technique(dataset_name),
                summary_metrics=False,
            )
            metric_table.add(
                direct_query_metrics,
                other={
                    DATASET_COLNAME: dataset_name,
                    TECHNIQUE_TYPE_COLNAME: DIRECT_ID,
                },
                create_index=True,
            )

        individual_queries_aggregate = (
            metric_table.create_lag_norm_inverted(drop_old=True)
            .melt_metrics(metric_value_col_name=METRIC_SCORE_COLNAME)
            .sort(DATASET_COLUMN_ORDER)
            .col_values_to_upper(METRIC_COLNAME)
            .save(EXPORT_PATH)
        )

        self.export_paths.append(EXPORT_PATH)

        return individual_queries_aggregate
Example #5
def calculate_technique_metric_table(dataset: str) -> Table:
    """
    Creates a metric table for each technique (direct, transitive, and combined) containing identifying information
    for each technique and the default set of accuracy metrics provided by Tracer engine.
    :param dataset: the name of the dataset
    :return: MetricTable - contains default accuracy metrics for techniques
    """
    tracer = Tracer()
    metric_table = MetricTable()

    techniques = RetrievalTechniques()
    with create_loading_bar(EXPERIMENT_LOADING_MESSAGE,
                            techniques,
                            length=len(techniques)) as techniques:
        for t_name, t_entry in techniques:
            t_entry.update({NAME_COLNAME: t_name})
            t_metrics = tracer.get_metrics(dataset, t_name)
            metric_table.add(t_metrics, t_entry)

    return metric_table
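A minimal usage sketch for calculate_technique_metric_table, assuming the EasyClinic dataset referenced elsewhere on this page is available locally:

if __name__ == "__main__":
    # Build the per-technique metric table and print its underlying table,
    # mirroring how Example #9 below prints a MetricTable.
    technique_table = calculate_technique_metric_table("EasyClinic")
    print(technique_table.table)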
Example #6
import numpy as np

from api.tracer import Tracer
from utilities.technique_extractors import (
    get_best_direct_technique,
    get_best_hybrid_technique,
    get_best_transitive_technique,
)

if __name__ == "__main__":
    tracer = Tracer()
    d_name = "EasyClinic"
    direct_technique = get_best_direct_technique(d_name)
    transitive_technique = get_best_transitive_technique(d_name)
    hybrid_technique = get_best_hybrid_technique(d_name)
    """
    Direct
    """
    direct_score = tracer.get_metrics(d_name, direct_technique)[0].ap
    direct_individual_metrics = tracer.get_metrics(d_name,
                                                   direct_technique,
                                                   summary_metrics=False)
    direct_scores = [m.ap for m in direct_individual_metrics]
    print(f"Direct: {direct_score}:{np.mean(direct_scores)}")
    """
    Transitive
    """
    transitive_score = tracer.get_metrics(d_name, transitive_technique)[0].ap
    transitive_individual_metrics = tracer.get_metrics(d_name,
                                                       transitive_technique,
                                                       summary_metrics=False)
    transitive_scores = [m.ap for m in transitive_individual_metrics]
    print(f"Transitive: {transitive_score}:{np.mean(transitive_scores)}")
    """
Example #7
import os

import pandas as pd
from sklearn.preprocessing import minmax_scale

from api.tracer import Tracer
from utilities.technique_extractors import get_best_transitive_technique

if __name__ == "__main__":
    good_dataset_name = "TrainController"

    EXPORT_PATH = os.path.join(PATH_TO_DATA, "presentation",
                               "similarity_distribution.csv")

    good_transitive_technique = get_best_transitive_technique(
        good_dataset_name)

    tracer = Tracer()
    technique_data = tracer.get_technique_data(good_dataset_name,
                                               good_transitive_technique)
    metrics = tracer.get_metrics(good_dataset_name,
                                 good_transitive_technique,
                                 summary_metrics=False)
    sorted_metrics = sorted(metrics, key=lambda m: m.ap)
    N_QUERIES = 5
    bad_queries = [m.query_id for m in sorted_metrics[:N_QUERIES]]
    good_queries = [m.query_id for m in sorted_metrics[-N_QUERIES:]]
    similarity_matrix = minmax_scale(technique_data.similarity_matrix)
    oracle_matrix = Dataset(good_dataset_name).traced_matrices["0-2"]

    data = pd.DataFrame()

    for g_query in good_queries:
        for col_index in range(similarity_matrix.shape[1]):
            score_value = similarity_matrix[g_query][col_index]
            oracle_value = oracle_matrix[g_query][col_index]
            delta_value = score_value - oracle_value
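Example #7 cuts off inside the loop. A hedged sketch of one plausible completion, collecting a row per (query, document) pair for both the good and bad query sets (the snippet computes bad_queries but is truncated before using them); the column names are assumptions, not from the source:

    rows = []
    for query_list, query_label in [(good_queries, "good"), (bad_queries, "bad")]:
        for query_index in query_list:
            for col_index in range(similarity_matrix.shape[1]):
                # One row per (query, document) pair: scaled similarity,
                # oracle trace value, and their difference.
                rows.append({
                    "query_type": query_label,
                    "query_id": query_index,
                    "similarity": similarity_matrix[query_index][col_index],
                    "oracle": oracle_matrix[query_index][col_index],
                    "delta": similarity_matrix[query_index][col_index] -
                             oracle_matrix[query_index][col_index],
                })
    data = pd.DataFrame(rows)
    data.to_csv(EXPORT_PATH, index=False)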
Example #8

from api.tracer import Tracer
from utilities.technique_extractors import (
    get_best_direct_technique,
    get_best_hybrid_technique,
    get_best_transitive_technique,
)

if __name__ == "__main__":
    Cache.CACHE_ON = False
    d_name = "EBT"
    direct_t_name = get_best_direct_technique(d_name)
    transitive_t_name = get_best_transitive_technique(d_name)
    hybrid_t_name = get_best_hybrid_technique(d_name)

    tracer = Tracer()
    direct_technique_data = tracer.get_technique_data(d_name, direct_t_name)
    transitive_technique_data = tracer.get_technique_data(
        d_name, transitive_t_name)
    hybrid_technique_data = tracer.get_technique_data(d_name, hybrid_t_name)
    hybrid_metrics = tracer.get_metrics(d_name,
                                        hybrid_t_name,
                                        summary_metrics=False)
    data_labels = ["direct", "transitive", "hybrid"]
    data = [
        direct_technique_data, transitive_technique_data, hybrid_technique_data
    ]
    matrices = list(map(lambda d: d.similarity_matrix, data))

    worst_query_index, ap_score = get_worst_query(hybrid_metrics)
    print("Hybrid Technique:", hybrid_t_name)
    print("Hybrid AP on worst query:", ap_score)
    """
    Experiments
    """
    print()
    print_trace_link_ranks_per_technique(d_name, matrices, data_labels,
Example #9
from api.tables.metric_table import MetricTable
from api.tracer import Tracer

if __name__ == "__main__":
    dataset_name = "EasyClinic"
    direct_technique = "(. (LSI NT) (0 2))"
    transitive_technique = "(x (PCA GLOBAL) ((. (LSI NT) (0 1)) (. (LSI NT) (1 2))))"
    hybrid_technique = f"(o (MAX) ({direct_technique} {transitive_technique}))"

    technique_definitions = [
        ("direct", direct_technique),
        ("transitive", transitive_technique),
        ("hybrid", hybrid_technique),
    ]

    metric_table = MetricTable()
    tracer = Tracer()

    for t_name, t_def in technique_definitions:
        t_metrics = tracer.get_metrics(dataset_name, t_def)
        metric_table.add(t_metrics, {"name": t_name})

    print(metric_table.table)
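Judging from the definitions above, technique definitions are s-expressions: the "." operator appears to build a direct technique, "x" a transitive composition, and "o" an aggregation of both. A small sketch generalizing the f-string above; the build_hybrid helper is hypothetical:

def build_hybrid(direct_def: str, transitive_def: str, aggregation: str = "MAX") -> str:
    # Compose a hybrid technique definition from a direct and a transitive
    # definition, in the same shape as the hybrid_technique f-string above.
    return f"(o ({aggregation}) ({direct_def} {transitive_def}))"

Called with the direct and transitive definitions above, it reproduces hybrid_technique exactly.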
Example #10
    def run(self) -> Table:
        """
        Calculates single-query metrics for the best direct, transitive, and hybrid techniques
        on the selected dataset, then updates the aggregated exports.
        :return: metric table with one row per query for each technique
        """
        dataset_name = prompt_for_dataset()

        """
        Find best techniques
        """
        direct_best_definition = get_best_direct_technique(dataset_name)
        transitive_best_definition = get_best_transitive_technique(dataset_name)
        combined_best_definition = get_best_hybrid_technique(dataset_name)

        """
        Calculate metrics for individual queries on dataset
        """
        tracer = Tracer()
        metric_table = MetricTable()

        direct_metrics: List[Metrics] = tracer.get_metrics(
            dataset_name, direct_best_definition, summary_metrics=False
        )
        metric_table.add(
            direct_metrics, other={TECHNIQUE_TYPE_COLNAME: DIRECT_ID}, create_index=True
        )

        transitive_metrics: List[Metrics] = tracer.get_metrics(
            dataset_name, transitive_best_definition, summary_metrics=False
        )
        metric_table.add(
            transitive_metrics,
            other={TECHNIQUE_TYPE_COLNAME: TRANSITIVE_ID},
            create_index=True,
        )

        combined_metrics: List[Metrics] = tracer.get_metrics(
            dataset_name, combined_best_definition, summary_metrics=False
        )
        metric_table.add(
            combined_metrics,
            other={TECHNIQUE_TYPE_COLNAME: HYBRID_ID},
            create_index=True,
        )

        """
        Export individual run
        """
        export_path = os.path.join(PATH_TO_INDIVIDUAL_QUERIES, dataset_name + ".csv")
        metric_table.sort(DATASET_COLUMN_ORDER).save(export_path)
        self.export_paths.append(export_path)

        """
        Update aggregate
        """

        individual_queries_aggregate = (
            MetricTable(
                Table.aggregate_intermediate_files(PATH_TO_INDIVIDUAL_QUERIES).table
            )
            .create_lag_norm_inverted(drop_old=True)
            .melt_metrics(metric_value_col_name=METRIC_SCORE_COLNAME)
            .sort(DATASET_COLUMN_ORDER)
            .col_values_to_upper(METRIC_COLNAME)
            .to_title_case(exclude=METRIC_COLNAME)
            .save(PATH_TO_INDIVIDUAL_QUERIES_AGG)
        )

        # Save an unmelted copy without overwriting the aggregate returned below.
        individual_queries_unmelted = (
            MetricTable(
                Table.aggregate_intermediate_files(PATH_TO_INDIVIDUAL_QUERIES).table
            )
            .create_lag_norm_inverted(drop_old=True)
            .sort(DATASET_COLUMN_ORDER)
            .save(PATH_TO_INDIVIDUAL_QUERIES_UNMELTED)
        )

        self.export_paths.append(PATH_TO_INDIVIDUAL_QUERIES_AGG)

        return individual_queries_aggregate