def test_cleanup_deletes_on_dataset(self):
    """Cleaning up a named dataset removes only that dataset's cached files."""
    original_cache_value = Cache.CACHE_ON
    Cache.CACHE_ON = True
    dataset_other_name = "SAMPLE_EasyClinic"
    dataset_other = Dataset(dataset_other_name)

    tracer = Tracer()
    tracer.get_metrics(dataset_other_name, self.direct_technique_name)
    tracer.get_metrics(self.dataset.name, self.direct_technique_name)

    # Both datasets are cached after their metrics have been calculated.
    self.assertTrue(
        Cache.is_cached(dataset_other, self.get_direct_definition()))
    self.assertTrue(
        Cache.is_cached(self.dataset, self.get_direct_definition()))

    # Cleaning up one dataset leaves the other dataset's cache intact.
    Cache.cleanup(self.dataset.name)
    self.assertTrue(
        Cache.is_cached(dataset_other, self.get_direct_definition()))
    self.assertFalse(
        Cache.is_cached(self.dataset, self.get_direct_definition()))

    Cache.cleanup(dataset_other_name)
    self.assertFalse(
        Cache.is_cached(dataset_other, self.get_direct_definition()))
    self.assertFalse(
        Cache.is_cached(self.dataset, self.get_direct_definition()))

    Cache.CACHE_ON = original_cache_value
def test_transitive(self):
    """Evaluating a transitive technique caches both component matrices and the combined matrix."""
    original_cache_value = Cache.CACHE_ON
    Cache.CACHE_ON = True
    Cache.cleanup()
    self.assertFalse(
        Cache.is_cached(self.dataset, self.get_transitive_definition()))

    tracer = Tracer()
    tracer.get_metrics(self.dataset.name, self.transitive_technique_name)
    numpy_files_in_cache = list(
        filter(
            lambda f: SIMILARITY_MATRIX_EXTENSION in f,
            os.listdir(Cache.path_to_memory),
        ))
    self.assertEqual(3, len(numpy_files_in_cache))

    def create_name(name: str):
        return self.dataset.name + "_" + name + ".npy"

    self.assertIn(create_name(self.transitive_upper_comp),
                  numpy_files_in_cache)
    self.assertIn(create_name(self.transitive_component_b_name),
                  numpy_files_in_cache)
    self.assertIn(create_name(self.transitive_technique_name),
                  numpy_files_in_cache)

    Cache.cleanup(self.dataset.name)
    Cache.CACHE_ON = original_cache_value
def test_combined_sampled(self):
    """Sampled-artifact techniques are non-deterministic, so two evaluations should not yield identical metrics."""
    dataset = "SAMPLE_EasyClinic"
    tracer = Tracer()
    Cache.CACHE_ON = True
    metrics_a = tracer.get_metrics(
        dataset, self.combined_sampled_artifacts_technique_name)
    metrics_b = tracer.get_metrics(
        dataset, self.combined_sampled_artifacts_technique_name)
    self.assertNotEqual(metrics_a[0].ap, metrics_b[0].ap)
    self.assertNotEqual(metrics_a[0].auc, metrics_b[0].auc)
    Cache.cleanup(dataset)
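# The tests above save and restore Cache.CACHE_ON by hand. A small context
# manager could factor out that bookkeeping. This is a hedged sketch built
# only on the Cache.CACHE_ON flag used in these tests; cache_enabled is not
# an existing helper in this repo.
from contextlib import contextmanager


@contextmanager
def cache_enabled(enabled: bool = True):
    """Temporarily set Cache.CACHE_ON and restore the previous value on exit."""
    original_cache_value = Cache.CACHE_ON
    Cache.CACHE_ON = enabled
    try:
        yield
    finally:
        Cache.CACHE_ON = original_cache_value
# With such a helper, each test body could be wrapped in `with cache_enabled():`
# so the flag is restored even when an assertion fails mid-test.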
def run(self) -> Table:
    """
    Calculates per-query (non-summary) metrics for the best overall hybrid
    technique and the best direct technique on every dataset in
    DATASET_COLUMN_ORDER, then melts and exports the aggregated table.
    :return: metric table with one row per query, technique type, and metric
    """
    tracer = Tracer()
    metric_table = MetricTable()

    for dataset_name in DATASET_COLUMN_ORDER:
        hybrid_query_metrics: List[Metrics] = tracer.get_metrics(
            dataset_name, BEST_OVERALL_TECHNIQUE, summary_metrics=False)
        metric_table.add(
            hybrid_query_metrics,
            other={
                DATASET_COLNAME: dataset_name,
                TECHNIQUE_TYPE_COLNAME: HYBRID_ID,
            },
            create_index=True,
        )

        direct_query_metrics: List[Metrics] = tracer.get_metrics(
            dataset_name,
            get_best_direct_technique(dataset_name),
            summary_metrics=False,
        )
        metric_table.add(
            direct_query_metrics,
            other={
                DATASET_COLNAME: dataset_name,
                TECHNIQUE_TYPE_COLNAME: DIRECT_ID,
            },
            create_index=True,
        )

    individual_queries_aggregate = (
        metric_table.create_lag_norm_inverted(drop_old=True)
        .melt_metrics(metric_value_col_name=METRIC_SCORE_COLNAME)
        .sort(DATASET_COLUMN_ORDER)
        .col_values_to_upper(METRIC_COLNAME)
        .save(EXPORT_PATH)
    )
    self.export_paths.append(PATH_TO_INDIVIDUAL_QUERIES_AGG)
    return individual_queries_aggregate
def calculate_technique_metric_table(dataset: str) -> Table:
    """
    Creates a metric table for each technique (direct, transitive, and combined)
    containing identifying information for each technique and the default set
    of accuracy metrics provided by the Tracer engine.
    :param dataset: the name of the dataset
    :return: MetricTable - contains default accuracy metrics for techniques
    """
    tracer = Tracer()
    metric_table = MetricTable()
    techniques = RetrievalTechniques()
    with create_loading_bar(EXPERIMENT_LOADING_MESSAGE,
                            techniques,
                            length=len(techniques)) as techniques:
        for t_name, t_entry in techniques:
            t_entry.update({NAME_COLNAME: t_name})
            t_metrics = tracer.get_metrics(dataset, t_name)
            metric_table.add(t_metrics, t_entry)
    return metric_table
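# Hedged usage sketch for calculate_technique_metric_table. The dataset name
# and export file name below are illustrative only; MetricTable.save(path) is
# assumed to behave as in the other scripts in this repo that call it.
if __name__ == "__main__":
    technique_table = calculate_technique_metric_table("EasyClinic")
    technique_table.save("easyclinic_technique_metrics.csv")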
from utilities.technique_extractors import (
    get_best_direct_technique,
    get_best_hybrid_technique,
    get_best_transitive_technique,
)

if __name__ == "__main__":
    tracer = Tracer()
    d_name = "EasyClinic"

    direct_technique = get_best_direct_technique(d_name)
    transitive_technique = get_best_transitive_technique(d_name)
    hybrid_technique = get_best_hybrid_technique(d_name)

    """
    Direct
    """
    direct_score = tracer.get_metrics(d_name, direct_technique)[0].ap
    direct_individual_metrics = tracer.get_metrics(d_name,
                                                   direct_technique,
                                                   summary_metrics=False)
    direct_scores = [m.ap for m in direct_individual_metrics]
    print(f"Direct: {direct_score}:{np.mean(direct_scores)}")

    """
    Transitive
    """
    transitive_score = tracer.get_metrics(d_name, transitive_technique)[0].ap
    transitive_individual_metrics = tracer.get_metrics(d_name,
                                                       transitive_technique,
                                                       summary_metrics=False)
    transitive_scores = [m.ap for m in transitive_individual_metrics]
    print(f"Transitive: {transitive_score}:{np.mean(transitive_scores)}")

    """
from utilities.technique_extractors import get_best_transitive_technique

if __name__ == "__main__":
    good_dataset_name = "TrainController"
    EXPORT_PATH = os.path.join(PATH_TO_DATA, "presentation",
                               "similarity_distribution.csv")
    good_transitive_technique = get_best_transitive_technique(
        good_dataset_name)

    tracer = Tracer()
    technique_data = tracer.get_technique_data(good_dataset_name,
                                               good_transitive_technique)
    metrics = tracer.get_metrics(good_dataset_name,
                                 good_transitive_technique,
                                 summary_metrics=False)
    sorted_metrics = sorted(metrics, key=lambda m: m.ap)

    N_QUERIES = 5
    bad_queries = [m.query_id for m in sorted_metrics[:N_QUERIES]]
    good_queries = [m.query_id for m in sorted_metrics[-N_QUERIES:]]

    similarity_matrix = minmax_scale(technique_data.similarity_matrix)
    oracle_matrix = Dataset(good_dataset_name).traced_matrices["0-2"]

    data = pd.DataFrame()
    for g_query in good_queries:
        for col_index in range(similarity_matrix.shape[1]):
            score_value = similarity_matrix[g_query][col_index]
            oracle_value = oracle_matrix[g_query][col_index]
            delta_value = score_value - oracle_value
if __name__ == "__main__":
    Cache.CACHE_ON = False
    d_name = "EBT"

    direct_t_name = get_best_direct_technique(d_name)
    transitive_t_name = get_best_transitive_technique(d_name)
    hybrid_t_name = get_best_hybrid_technique(d_name)

    tracer = Tracer()
    direct_technique_data = tracer.get_technique_data(d_name, direct_t_name)
    transitive_technique_data = tracer.get_technique_data(
        d_name, transitive_t_name)
    hybrid_technique_data = tracer.get_technique_data(d_name, hybrid_t_name)
    hybrid_metrics = tracer.get_metrics(d_name,
                                        hybrid_t_name,
                                        summary_metrics=False)

    data_labels = ["direct", "transitive", "hybrid"]
    data = [
        direct_technique_data, transitive_technique_data,
        hybrid_technique_data
    ]
    matrices = list(map(lambda d: d.similarity_matrix, data))

    worst_query_index, ap_score = get_worst_query(hybrid_metrics)
    print("Hybrid Technique:", hybrid_t_name)
    print("Hybrid AP on worst query:", ap_score)

    """
    Experiments
    """
    print()
    print_trace_link_ranks_per_technique(d_name, matrices, data_labels,
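# get_worst_query is called above but not shown in this excerpt. Inferred only
# from its usage (it returns an index and an AP score for the lowest-scoring
# query), a minimal sketch could look like the following; this is an
# assumption, not the repo's actual implementation.
def get_worst_query(query_metrics):
    """Return (index, ap) of the query with the lowest average precision."""
    worst_query_index = min(range(len(query_metrics)),
                            key=lambda i: query_metrics[i].ap)
    return worst_query_index, query_metrics[worst_query_index].ap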
from api.tables.metric_table import MetricTable
from api.tracer import Tracer

if __name__ == "__main__":
    dataset_name = "EasyClinic"
    direct_technique = "(. (LSI NT) (0 2))"
    transitive_technique = "(x (PCA GLOBAL) ((. (LSI NT) (0 1)) (. (LSI NT) (1 2))))"
    hybrid_technique = f"(o (MAX) ({direct_technique} {transitive_technique}))"

    technique_definitions = [
        ("direct", direct_technique),
        ("transitive", transitive_technique),
        ("hybrid", hybrid_technique),
    ]

    metric_table = MetricTable()
    tracer = Tracer()
    for t_name, t_def in technique_definitions:
        t_metrics = tracer.get_metrics(dataset_name, t_def)
        metric_table.add(t_metrics, {"name": t_name})

    print(metric_table.table)
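# Hedged alternative to the hard-coded definition strings above: the other
# scripts in this repo select per-dataset best techniques through
# utilities.technique_extractors, so the same comparison could be driven by
# those helpers instead (assuming they cover the dataset in question).
from utilities.technique_extractors import (
    get_best_direct_technique,
    get_best_hybrid_technique,
    get_best_transitive_technique,
)

dataset_name = "EasyClinic"
technique_definitions = [
    ("direct", get_best_direct_technique(dataset_name)),
    ("transitive", get_best_transitive_technique(dataset_name)),
    ("hybrid", get_best_hybrid_technique(dataset_name)),
]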
def run(self) -> Table:
    """
    Calculates per-query (non-summary) metrics for the best direct, transitive,
    and hybrid technique on a user-selected dataset, exports the individual
    run, and updates the aggregate tables across datasets.
    :return: metric table with one row per query, technique type, and metric
    """
    dataset_name = prompt_for_dataset()

    """
    Find best techniques
    """
    direct_best_definition = get_best_direct_technique(dataset_name)
    transitive_best_definition = get_best_transitive_technique(dataset_name)
    combined_best_definition = get_best_hybrid_technique(dataset_name)

    """
    Calculate metrics for individual queries on dataset
    """
    tracer = Tracer()
    metric_table = MetricTable()

    direct_metrics: List[Metrics] = tracer.get_metrics(
        dataset_name, direct_best_definition, summary_metrics=False
    )
    metric_table.add(
        direct_metrics, other={TECHNIQUE_TYPE_COLNAME: DIRECT_ID}, create_index=True
    )

    transitive_metrics: List[Metrics] = tracer.get_metrics(
        dataset_name, transitive_best_definition, summary_metrics=False
    )
    metric_table.add(
        transitive_metrics,
        other={TECHNIQUE_TYPE_COLNAME: TRANSITIVE_ID},
        create_index=True,
    )

    combined_metrics: List[Metrics] = tracer.get_metrics(
        dataset_name, combined_best_definition, summary_metrics=False
    )
    metric_table.add(
        combined_metrics,
        other={TECHNIQUE_TYPE_COLNAME: HYBRID_ID},
        create_index=True,
    )

    """
    Export individual run
    """
    export_path = os.path.join(PATH_TO_INDIVIDUAL_QUERIES, dataset_name + ".csv")
    metric_table.sort(DATASET_COLUMN_ORDER).save(export_path)
    self.export_paths.append(export_path)

    """
    Update aggregate
    """
    individual_queries_aggregate = (
        MetricTable(
            Table.aggregate_intermediate_files(PATH_TO_INDIVIDUAL_QUERIES).table
        )
        .create_lag_norm_inverted(drop_old=True)
        .melt_metrics(metric_value_col_name=METRIC_SCORE_COLNAME)
        .sort(DATASET_COLUMN_ORDER)
        .col_values_to_upper(METRIC_COLNAME)
        .to_title_case(exclude=METRIC_COLNAME)
        .save(PATH_TO_INDIVIDUAL_QUERIES_AGG)
    )
    individual_queries_aggregate = (
        MetricTable(
            Table.aggregate_intermediate_files(PATH_TO_INDIVIDUAL_QUERIES).table
        )
        .create_lag_norm_inverted(drop_old=True)
        .sort(DATASET_COLUMN_ORDER)
        .save(PATH_TO_INDIVIDUAL_QUERIES_UNMELTED)
    )  # aggregate_table
    self.export_paths.append(PATH_TO_INDIVIDUAL_QUERIES_AGG)

    return individual_queries_aggregate
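# Hedged follow-up: the aggregate written above is exported like the other
# per-dataset CSVs, so a quick way to inspect it is to read it back with
# pandas. This assumes PATH_TO_INDIVIDUAL_QUERIES_AGG points to a CSV file, as
# the other export paths in these scripts do.
import pandas as pd

aggregate_df = pd.read_csv(PATH_TO_INDIVIDUAL_QUERIES_AGG)
print(aggregate_df.head())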