def run(self) -> Table:
    """
    Calculates the gain tables (hybrid over direct, transitive over direct,
    and hybrid over transitive) from the aggregate metric table and exports
    each one as a .csv file.
    :return: Table - the aggregate metric table used to calculate the gains
    """
    agg_metric_table = MetricTable(
        path_to_table=PATH_TO_METRIC_TABLE_AGGREGATE)

    H_PATH = os.path.join(PATH_TO_GAIN, "hybrid_over_direct.csv")
    hybrid_gain_table = (
        agg_metric_table.calculate_gain_between_techniques(
            create_comparison_dict()).sort(DATASET_COLUMN_ORDER).save(H_PATH))

    T_PATH = os.path.join(PATH_TO_GAIN, "transitive_over_direct.csv")
    transitive_gain_table = (
        agg_metric_table.calculate_gain_between_techniques(
            create_comparison_dict_transitive()).sort(
                DATASET_COLUMN_ORDER).save(T_PATH))

    H_OVER_T_PATH = os.path.join(PATH_TO_GAIN, "hybrid_over_transitive.csv")
    hybrid_over_transitive_gain = (
        agg_metric_table.calculate_gain_between_techniques(
            create_comparison_dict_hybrid_over_transitive()).sort(
                DATASET_COLUMN_ORDER).save(H_OVER_T_PATH))

    self.export_paths.append(H_PATH)
    self.export_paths.append(T_PATH)
    self.export_paths.append(H_OVER_T_PATH)

    # rq2_gain_df.save(PATH_TO_RQ2_GAIN)
    self.export_paths.append(PATH_TO_RQ2_GAIN)
    return agg_metric_table
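# For reference: the comparison dicts built by create_comparison_dict() and
# its variants map each dataset to an (old, new) technique pair, matching the
# shape consumed by calculate_gain_between_techniques elsewhere in this
# section. A minimal hypothetical sketch of that shape (dataset and technique
# names below are placeholders):
comparison_dict = {
    "DatasetA": ("old_technique_definition", "new_technique_definition"),
    "DatasetB": ("old_technique_definition", "new_technique_definition"),
}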
def run(self) -> Table: """ calculates metric table for all techniques and applies post processing techinques defined in module :return: metric table with metrics """ Cache.CACHE_ON = True dataset_name = prompt_for_dataset() metric_table = calculate_technique_metric_table(dataset_name) metric_table.sort(DATASET_COLUMN_ORDER).save( create_export_path(dataset_name)) # export metric table aggregate_metric_table = MetricTable( Table.aggregate_intermediate_files(PATH_TO_METRIC_TABLES).sort( DATASET_COLUMN_ORDER).table).save( PATH_TO_METRIC_TABLE_AGGREGATE) # create graphable metrics and export table aggregate_metric_table.create_lag_norm_inverted( drop_old=True).melt_metrics().col_values_to_upper( METRIC_COLNAME).save(PATH_TO_GRAPH_METRIC_TABLE_AGGREGATE) self.export_paths.append(create_export_path(dataset_name)) self.export_paths.append(PATH_TO_METRIC_TABLE_AGGREGATE) Cache.cleanup(dataset_name) return metric_table
def test_create_lag_norm_inverted(self):
    metric_table = MetricTable(self.data)
    values = metric_table.create_lag_norm_inverted().table.set_index(
        [DATASET_COLNAME, NAME_COLNAME])
    for d_name in self.DATASETS:
        for name, e_value in self.expected_values:
            value = values.loc[(d_name, name)][LAG_NORMALIZED_INVERTED_COLNAME]
            self.assertEqual(e_value, value)
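# Illustrative sketch of the presumed semantics of create_lag_norm_inverted
# (an assumption, not confirmed by the test above): invert normalized lag so
# that higher values indicate better performance. Column names here are
# placeholders, not the project's real constants.
import pandas as pd

df = pd.DataFrame({"lag_normalized": [0.1, 0.4]})
df["lag_normalized_inverted"] = 1 - df["lag_normalized"]
print(df)  # 0.1 -> 0.9, 0.4 -> 0.6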
def assert_scores(self, table, expected_values):
    metric_table = MetricTable(table)
    gain = metric_table.calculate_gain_between_techniques({
        self.TEST_DATASET_NAME: ("old", "new")
    }).table
    self.assertEqual(1, len(gain))
    gain_entry = gain.iloc[0]
    for m_name, e_value in expected_values:
        self.assertEqual(round(e_value, N_SIG_FIGS), gain_entry[m_name])
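# Hypothetical usage of the assert_scores helper; this would live in the same
# TestCase as the helper above. The metric column names below are
# placeholders, not the project's real columns.
def test_gain_scores(self):
    self.assert_scores(self.data, [("metric_a", 0.25), ("metric_b", 0.10)])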
def test_percent_best(self):
    """
    Tests that for each trace type (T1, T2), each scaling method's (M1, M2)
    vote is distributed equally among both datasets, as displayed in the data.
    :return: None
    """
    percent_best_table = (
        MetricTable(self.data).calculate_percent_best().sort(["D1", "D2"]).table
    )
    percent_best_table = percent_best_table.set_index(
        [TRANSITIVE_TRACE_TYPE_COLNAME, TECHNIQUE_COLNAME]
    )
    self.assertEqual(4, len(percent_best_table))
    self.assertEqual(
        0.5, percent_best_table.loc[("T1", "M1")][PERCENT_BEST_COLNAME]
    )
    self.assertEqual(
        0.5, percent_best_table.loc[("T1", "M2")][PERCENT_BEST_COLNAME]
    )
    self.assertEqual(
        0.5, percent_best_table.loc[("T2", "M1")][PERCENT_BEST_COLNAME]
    )
    self.assertEqual(
        0.5, percent_best_table.loc[("T2", "M2")][PERCENT_BEST_COLNAME]
    )
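# Standalone pandas sketch of the "percent best" idea the test above checks:
# for each (trace_type, technique), the fraction of datasets in which that
# technique achieved the top score. Column names are illustrative, and this
# is an assumed reading of calculate_percent_best, not its implementation.
import pandas as pd

df = pd.DataFrame({
    "dataset": ["D1", "D1", "D2", "D2"],
    "trace_type": ["T1", "T1", "T1", "T1"],
    "technique": ["M1", "M2", "M1", "M2"],
    "score": [1.0, 0.5, 0.5, 1.0],
})
best = df.loc[df.groupby(["dataset", "trace_type"])["score"].idxmax()]
percent_best = (
    best.groupby(["trace_type", "technique"]).size() / df["dataset"].nunique()
)
print(percent_best)  # M1 and M2 each win one of two datasets -> 0.5 each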
def calculate_technique_metric_table(dataset: str) -> Table:
    """
    Creates a metric table for each technique (direct, transitive, and
    combined) containing identifying information for each technique and the
    default set of accuracy metrics provided by the Tracer engine.
    :param dataset: the name of the dataset
    :return: MetricTable - contains default accuracy metrics for techniques
    """
    tracer = Tracer()
    metric_table = MetricTable()
    techniques = RetrievalTechniques()
    with create_loading_bar(EXPERIMENT_LOADING_MESSAGE, techniques,
                            length=len(techniques)) as techniques:
        for t_name, t_entry in techniques:
            t_entry.update({NAME_COLNAME: t_name})
            t_metrics = tracer.get_metrics(dataset, t_name)
            metric_table.add(t_metrics, t_entry)
    return metric_table
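# Minimal sketch of invoking calculate_technique_metric_table directly;
# "EasyClinic" matches the dataset name used in the demo script later in this
# section, but whether it is wired up in a given checkout is an assumption.
if __name__ == "__main__":
    easy_clinic_metrics = calculate_technique_metric_table("EasyClinic")
    print(easy_clinic_metrics.table)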
def run(self) -> Table: """ Calculates percent best on the aggregate metric table. :exports: 1. rq1 best 2. aggregate best (includes RQ1 and RQ2) :return: Table - aggregate best table """ # Read aggregate metric table rq1_aggregate = MetricTable( path_to_table=PATH_TO_METRIC_TABLE_AGGREGATE) best_df = pd.concat([ rq1_aggregate.find_best_direct_techniques().table, rq1_aggregate.find_best_transitive_techniques().table, rq1_aggregate.find_best_hybrid_techniques().table, ]) worst_df = pd.concat([ rq1_aggregate.find_worst_direct_techniques().table, rq1_aggregate.find_worst_transitive_techniques().table, rq1_aggregate.find_worst_hybrid_techniques().table, ]) for title, df in [("name", best_df), ("worst", worst_df)]: print(f"{title}-----------") for i in range(len(df)): entry = df.iloc[i] print( "%20s %10s %10s" % (entry["dataset"], entry["technique_type"], entry["name"])) def get_latex_row(data_df): metric_values = [] for dataset_name in DATASET_COLUMN_ORDER: dataset_entries = data_df[ (data_df[DATASET_COLNAME] == dataset_name) & (data_df[TECHNIQUE_TYPE_COLNAME] == HYBRID_ID)] entry = dataset_entries.iloc[0] for metric_name in BEST_TECHNIQUE_AGGREGATE_METRICS: metric_value = entry[metric_name] metric_values.append(str(round(metric_value, 3))) return "&".join(metric_values) best_row = get_latex_row(best_df) print("-" * 50) worst_row = get_latex_row(worst_df) print("BEST:", best_row) print("WORST:", worst_row)
def test_create_ranks_with_single_dataset(self):
    """
    Tests that ranks are accurate with respect to data containing a single
    dataset with no dataset column.
    :return: None
    """
    single_dataset = self.data.iloc[:4].drop(DATASET_COLNAME, axis=1)
    ranked_table = MetricTable(single_dataset).create_ranks().table
    self.assertEqual(2, ranked_table[RANK_COLNAME][0])
    self.assertEqual(1, ranked_table[RANK_COLNAME][1])
    self.assertEqual(2, ranked_table[RANK_COLNAME][2])
    self.assertEqual(1, ranked_table[RANK_COLNAME][3])
def assert_technique_extracted(self, name: str, expected_scores: List[float]):
    metric_table = MetricTable(self.data)
    for family_name in self.technique_ids:
        table_method = getattr(
            metric_table, f"find_{name}_{family_name.lower()}_techniques"
        )
        technique_query = table_method().table
        self.assertEqual(1, len(technique_query))
        technique_entry = technique_query.iloc[0]
        for m_index, m_name in enumerate(BEST_TECHNIQUE_AGGREGATE_METRICS):
            self.assertEqual(
                expected_scores[m_index],
                technique_entry[m_name],
            )
def test_create_ranks_with_all_datasets(self):
    """
    Tests that ranks are accurate with respect to dataset x trace-type groups.
    :return: None
    """
    ranked_table = MetricTable(self.data).create_ranks().table
    self.assertTrue(RANK_COLNAME in ranked_table.columns)
    self.assertEqual(2, ranked_table[RANK_COLNAME][0])
    self.assertEqual(1, ranked_table[RANK_COLNAME][1])
    self.assertEqual(2, ranked_table[RANK_COLNAME][2])
    self.assertEqual(1, ranked_table[RANK_COLNAME][3])
    self.assertEqual(1, ranked_table[RANK_COLNAME][4])
    self.assertEqual(2, ranked_table[RANK_COLNAME][5])
    self.assertEqual(1, ranked_table[RANK_COLNAME][6])
    self.assertEqual(2, ranked_table[RANK_COLNAME][7])
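# Standalone pandas sketch of grouped ranking as the two tests above expect:
# techniques are ranked within each dataset group (and, when trace types are
# present, within each dataset x trace-type group). The column names and the
# descending order are illustrative assumptions, not create_ranks' actual
# implementation.
import pandas as pd

df = pd.DataFrame({
    "dataset": ["D1", "D1", "D2", "D2"],
    "metric": [0.2, 0.8, 0.9, 0.4],
})
df["rank"] = df.groupby("dataset")["metric"].rank(ascending=False).astype(int)
print(df)  # ranks: 2, 1, 1, 2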
def run(self) -> Table: """ Returns a metric table containing all of the metrics calculated for each technique in df :return: metric table with single query metrics for each technique applied to specified dataset in row """ tracer = Tracer() metric_table = MetricTable() for dataset_name in DATASET_COLUMN_ORDER: hybrid_query_metrics: List[Metrics] = tracer.get_metrics( dataset_name, BEST_OVERALL_TECHNIQUE, summary_metrics=False) metric_table.add( hybrid_query_metrics, other={ DATASET_COLNAME: dataset_name, TECHNIQUE_TYPE_COLNAME: HYBRID_ID, }, create_index=True, ) direct_query_metrics: List[Metrics] = tracer.get_metrics( dataset_name, get_best_direct_technique(dataset_name), summary_metrics=False, ) metric_table.add( direct_query_metrics, other={ DATASET_COLNAME: dataset_name, TECHNIQUE_TYPE_COLNAME: DIRECT_ID, }, create_index=True, ) individual_queries_aggregate = (metric_table.create_lag_norm_inverted( drop_old=True).melt_metrics( metric_value_col_name=METRIC_SCORE_COLNAME).sort( DATASET_COLUMN_ORDER).col_values_to_upper( METRIC_COLNAME).save(EXPORT_PATH)) self.export_paths.append(PATH_TO_INDIVIDUAL_QUERIES_AGG) return individual_queries_aggregate
# NOTE: the modules for DIRECT_ID, TRANSITIVE_ID, and the column-name
# constants below are assumed from the imports used elsewhere in this
# codebase; the original snippet used them without importing them.
from api.constants.processing import (
    DIRECT_ALGEBRAIC_MODEL_COLNAME,
    TRANSITIVE_AGGREGATION_COLNAME,
    TRANSITIVE_ALGEBRAIC_MODEL_COLNAME,
    TRANSITIVE_SCALING_COLNAME,
    TRANSITIVE_TRACE_TYPE_COLNAME,
)
from api.constants.techniques import DIRECT_ID, TRANSITIVE_ID
from api.tables.metric_table import MetricTable
from api.technique.variationpoints.aggregation.aggregation_method import (
    AggregationMethod,
)
from api.technique.variationpoints.algebraicmodel.models import AlgebraicModel
from api.technique.variationpoints.scalers.scaling_method import ScalingMethod
from utilities.constants import DATASET_COLUMN_ORDER, PATH_TO_PRESENTATION
from utilities.technique_extractors import AGGREGATE_METRIC_TABLE

if __name__ == "__main__":
    data = AGGREGATE_METRIC_TABLE.table
    mask = (
        AGGREGATE_METRIC_TABLE.get_technique_type_mask(DIRECT_ID)
        | AGGREGATE_METRIC_TABLE.get_technique_type_mask(TRANSITIVE_ID)
    ) & (data[TRANSITIVE_TRACE_TYPE_COLNAME].isin(["direct", "none"]))
    masked_data = AGGREGATE_METRIC_TABLE.table[mask].reset_index(drop=True)
    table_metrics = (MetricTable(masked_data).create_lag_norm_inverted(
        drop_old=True).table)

    column_orders = {
        DIRECT_ALGEBRAIC_MODEL_COLNAME: [
            AlgebraicModel.VSM.value,
            AlgebraicModel.LSI.value,
        ],
        TRANSITIVE_ALGEBRAIC_MODEL_COLNAME: [
            AlgebraicModel.VSM.value,
            AlgebraicModel.LSI.value,
        ],
        TRANSITIVE_SCALING_COLNAME: [
            ScalingMethod.INDEPENDENT.value,
            ScalingMethod.GLOBAL.value,
        ],
        TRANSITIVE_AGGREGATION_COLNAME: [
from api.constants.processing import DATASET_COLNAME, NAME_COLNAME
from api.constants.techniques import DIRECT_ID, HYBRID_ID, TRANSITIVE_ID
from api.tables.metric_table import MetricTable
from api.technique.definitions.combined.technique import (
    CombinedTechnique,
    create_technique_from_name,
)
from api.technique.definitions.direct.technique import DirectTechnique
from api.technique.definitions.sampled.definition import SAMPLED_COMMAND_SYMBOL
from api.technique.definitions.transitive.definition import TRANSITIVE_COMMAND_SYMBOL
from api.technique.definitions.transitive.technique import TransitiveTechnique
from api.technique.parser.itechnique import ITechnique
from utilities.constants import PATH_TO_METRIC_TABLE_AGGREGATE

AGGREGATE_METRIC_TABLE = MetricTable(path_to_table=PATH_TO_METRIC_TABLE_AGGREGATE)


def get_best_direct_technique(dataset_name: str) -> str:
    """
    Finds the best direct techniques and returns the one corresponding to the
    given dataset.
    :param dataset_name: the dataset whose best technique we are after
    :return: string - technique definition
    """
    best_df = AGGREGATE_METRIC_TABLE.find_best_direct_techniques().table.set_index(
        DATASET_COLNAME
    )
    if dataset_name not in best_df.index:
        raise Exception(
            f"Expected {dataset_name} to have metrics in {PATH_TO_METRIC_TABLE_AGGREGATE}"
        )
    # Assumed completion: the original snippet ended after the guard above;
    # returning the technique definition stored under NAME_COLNAME matches
    # the docstring and this module's imports.
    return best_df.loc[dataset_name][NAME_COLNAME]
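# Hypothetical usage of get_best_direct_technique; "EasyClinic" matches the
# dataset name used in the demo script below, and the printed definition
# would be an s-expression technique string like the ones shown there.
best_direct_definition = get_best_direct_technique("EasyClinic")
print(best_direct_definition)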
def run(self) -> Table:
    tracer = Tracer()

    def get_metrics(d_name, t_def: str):
        return tracer.get_metrics(d_name, t_def)

    def add_metrics(d_name, t_def: str, t_type: str, p_name: str):
        t_metrics = get_metrics(d_name, t_def)
        metric_table.add(
            t_metrics,
            {
                DATASET_COLNAME: d_name,
                "path": p_name,
                "type": t_type,
                NAME_COLNAME: t_def,
            },
        )

    aggregate_gain = None
    aggregate_metric = None
    for path in POSSIBLE_PATHS:
        metric_table = MetricTable()
        comparison_dict = {}
        path_name = path_to_str(path)
        for dataset_name in DATASET_COLUMN_ORDER:
            source_index = str(path[0])
            intermediate_index = str(path[1])
            target_index = str(path[2])
            new_path = [source_index, intermediate_index, target_index]

            # direct
            direct_technique_def = change_paths_in_technique(
                get_best_direct_technique(dataset_name), new_path)
            add_metrics(dataset_name, direct_technique_def, DIRECT_ID,
                        path_name)

            # transitive
            transitive_technique_def = change_paths_in_technique(
                get_best_transitive_technique(dataset_name), new_path)
            add_metrics(dataset_name, transitive_technique_def, TRANSITIVE_ID,
                        path_name)

            # hybrid
            hybrid_technique_definition = change_paths_in_technique(
                get_best_hybrid_technique(dataset_name), new_path)
            add_metrics(dataset_name, hybrid_technique_definition, HYBRID_ID,
                        path_name)

            comparison_dict.update({
                dataset_name:
                (direct_technique_def, hybrid_technique_definition)
            })

        gain_table = metric_table.calculate_gain_between_techniques(
            comparison_dict)
        gain_table.table["path"] = path_name
        aggregate_gain = (gain_table.table if aggregate_gain is None else
                          pd.concat([gain_table.table, aggregate_gain]))
        aggregate_metric = (metric_table.table if aggregate_metric is None
                            else pd.concat(
                                [metric_table.table, aggregate_metric]))

    MetricTable(aggregate_metric).create_lag_norm_inverted(
        drop_old=True).melt_metrics().save(METRIC_TABLE_EXPORT_PATH)
    self.export_paths.append(METRIC_TABLE_EXPORT_PATH)

    MetricTable(aggregate_gain).melt_metrics().save(GAIN_TABLE_EXPORT_PATH)
    self.export_paths.append(GAIN_TABLE_EXPORT_PATH)

    return aggregate_gain
def test_gain(self):
    metric_table = MetricTable(self.data)
    gain = metric_table.calculate_gain_between_techniques(
        {self.TEST_DATASET_NAME: ("old", "new")}
    )
    print(gain)
from api.tables.metric_table import MetricTable
from api.tracer import Tracer

if __name__ == "__main__":
    dataset_name = "EasyClinic"
    direct_technique = "(. (LSI NT) (0 2))"
    transitive_technique = "(x (PCA GLOBAL) ((. (LSI NT) (0 1)) (. (LSI NT) (1 2))))"
    hybrid_technique = f"(o (MAX) ({direct_technique} {transitive_technique}))"

    technique_definitions = [
        ("direct", direct_technique),
        ("transitive", transitive_technique),
        ("hybrid", hybrid_technique),
    ]

    metric_table = MetricTable()
    tracer = Tracer()
    for t_name, t_def in technique_definitions:
        t_metrics = tracer.get_metrics(dataset_name, t_def)
        metric_table.add(t_metrics, {"name": t_name})
    print(metric_table.table)
def run(self) -> Table: """ Returns a metric table containing all of the metrics calculated for each technique in df :return: metric table with single query metrics for each technique applied to specified dataset in row """ dataset_name = prompt_for_dataset() """ Find best techniques """ direct_best_definition = get_best_direct_technique(dataset_name) transitive_best_definition = get_best_transitive_technique(dataset_name) combined_best_definition = get_best_hybrid_technique(dataset_name) """ Calculate metrics for individual queries on dataset """ tracer = Tracer() metric_table = MetricTable() direct_metrics: [Metrics] = tracer.get_metrics( dataset_name, direct_best_definition, summary_metrics=False ) metric_table.add( direct_metrics, other={TECHNIQUE_TYPE_COLNAME: DIRECT_ID}, create_index=True ) transitive_metrics: [Metrics] = tracer.get_metrics( dataset_name, transitive_best_definition, summary_metrics=False ) metric_table.add( transitive_metrics, other={TECHNIQUE_TYPE_COLNAME: TRANSITIVE_ID}, create_index=True, ) combined_metrics: [Metrics] = tracer.get_metrics( dataset_name, combined_best_definition, summary_metrics=False ) metric_table.add( combined_metrics, other={TECHNIQUE_TYPE_COLNAME: HYBRID_ID}, create_index=True, ) """ Export individual run """ export_path = os.path.join(PATH_TO_INDIVIDUAL_QUERIES, dataset_name + ".csv") (metric_table.sort(DATASET_COLUMN_ORDER).save(export_path)) self.export_paths.append(export_path) """ Update aggregate """ individual_queries_aggregate = ( MetricTable( Table.aggregate_intermediate_files(PATH_TO_INDIVIDUAL_QUERIES).table ) .create_lag_norm_inverted(drop_old=True) .melt_metrics(metric_value_col_name=METRIC_SCORE_COLNAME) .sort(DATASET_COLUMN_ORDER) .col_values_to_upper(METRIC_COLNAME) .to_title_case(exclude=METRIC_COLNAME) .save(PATH_TO_INDIVIDUAL_QUERIES_AGG) ) individual_queries_aggregate = ( MetricTable( Table.aggregate_intermediate_files(PATH_TO_INDIVIDUAL_QUERIES).table ) .create_lag_norm_inverted(drop_old=True) .sort(DATASET_COLUMN_ORDER) .save(PATH_TO_INDIVIDUAL_QUERIES_UNMELTED) ) # aggregate_table self.export_paths.append(PATH_TO_INDIVIDUAL_QUERIES_AGG) return individual_queries_aggregate