    def run(self) -> Table:
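        """
        Computes gain tables for hybrid-over-direct, transitive-over-direct,
        and hybrid-over-transitive comparisons on the aggregate metric table,
        exporting each comparison as a CSV.
        :return: Table - the aggregate metric table the gains were computed from
        """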
        agg_metric_table = MetricTable(
            path_to_table=PATH_TO_METRIC_TABLE_AGGREGATE)
        H_PATH = os.path.join(PATH_TO_GAIN, "hybrid_over_direct.csv")
        hybrid_gain_table = (
            agg_metric_table.calculate_gain_between_techniques(
                create_comparison_dict()).sort(DATASET_COLUMN_ORDER).save(
                    H_PATH))

        T_PATH = os.path.join(PATH_TO_GAIN, "transitive_over_direct.csv")
        transitive_gain_table = (
            agg_metric_table.calculate_gain_between_techniques(
                create_comparison_dict_transitive()).sort(
                    DATASET_COLUMN_ORDER).save(T_PATH))

        H_OVER_T_PATH = os.path.join(PATH_TO_GAIN,
                                     "hybrid_over_transitive.csv")
        hybrid_over_transitive_gain = (
            agg_metric_table.calculate_gain_between_techniques(
                create_comparison_dict_hybrid_over_transitive()).sort(
                    DATASET_COLUMN_ORDER).save(H_OVER_T_PATH))

        self.export_paths.append(H_PATH)
        self.export_paths.append(T_PATH)
        self.export_paths.append(H_OVER_T_PATH)

        # rq2_gain_df.save(PATH_TO_RQ2_GAIN)
        self.export_paths.append(PATH_TO_RQ2_GAIN)
        return agg_metric_table
Example #2
    def run(self) -> Table:
        """
        calculates metric table for all techniques and applies post processing techinques defined in module
        :return: metric table with metrics
        """
        Cache.CACHE_ON = True
        dataset_name = prompt_for_dataset()
        metric_table = calculate_technique_metric_table(dataset_name)
        metric_table.sort(DATASET_COLUMN_ORDER).save(
            create_export_path(dataset_name))

        # aggregate intermediate metric tables and export the result
        aggregate_metric_table = MetricTable(
            Table.aggregate_intermediate_files(PATH_TO_METRIC_TABLES).sort(
                DATASET_COLUMN_ORDER).table).save(
                    PATH_TO_METRIC_TABLE_AGGREGATE)

        # create graphable metrics and export table
        aggregate_metric_table.create_lag_norm_inverted(
            drop_old=True).melt_metrics().col_values_to_upper(
                METRIC_COLNAME).save(PATH_TO_GRAPH_METRIC_TABLE_AGGREGATE)

        self.export_paths.append(create_export_path(dataset_name))
        self.export_paths.append(PATH_TO_METRIC_TABLE_AGGREGATE)
        Cache.cleanup(dataset_name)
        return metric_table
Example #3
    def test_create_lag_norm_inverted(self):
        metric_table = MetricTable(self.data)
        values = metric_table.create_lag_norm_inverted().table.set_index(
            [DATASET_COLNAME, NAME_COLNAME])
        for d_name in self.DATASETS:
            for name, e_value in self.expected_values:
                value = values.loc[(d_name,
                                    name)][LAG_NORMALIZED_INVERTED_COLNAME]
                self.assertEqual(e_value, value)
Example #4
    def assert_scores(self, table, expected_values):
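        """
        Asserts that exactly one gain entry is computed between the "old" and
        "new" techniques and that it matches the expected (metric, value) pairs.
        """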
        metric_table = MetricTable(table)
        gain = metric_table.calculate_gain_between_techniques({
            self.TEST_DATASET_NAME: ("old", "new")
        }).table
        self.assertEqual(1, len(gain))
        gain_entry = gain.iloc[0]

        for m_name, e_value in expected_values:
            self.assertEqual(round(e_value, N_SIG_FIGS), gain_entry[m_name])
Example #5
    def test_percent_best(self):
        """
        Tests that for each trace_type (T1, T2), each scaling method (M1, M2) vote is distributed equally
        among both datasets as displayed in the data.
        :return:
        """
        percent_best_table = (
            MetricTable(self.data).calculate_percent_best().sort(["D1", "D2"]).table
        )

        percent_best_table = percent_best_table.set_index(
            [TRANSITIVE_TRACE_TYPE_COLNAME, TECHNIQUE_COLNAME]
        )

        self.assertEqual(4, len(percent_best_table))
        self.assertEqual(
            0.5, percent_best_table.loc[("T1", "M1")][PERCENT_BEST_COLNAME]
        )
        self.assertEqual(
            0.5, percent_best_table.loc[("T1", "M2")][PERCENT_BEST_COLNAME]
        )
        self.assertEqual(
            0.5, percent_best_table.loc[("T2", "M1")][PERCENT_BEST_COLNAME]
        )
        self.assertEqual(
            0.5, percent_best_table.loc[("T2", "M2")][PERCENT_BEST_COLNAME]
        )
Example #6
def calculate_technique_metric_table(dataset: str) -> Table:
    """
    Creates a metric table for each technique (direct, transitive, and combined) containing identifying information
    for each technique and the default set of accuracy metrics provided by the Tracer engine.
    :param dataset: the name of the dataset
    :return: MetricTable - contains default accuracy metrics for techniques
    """
    tracer = Tracer()
    metric_table = MetricTable()

    techniques = RetrievalTechniques()
    with create_loading_bar(EXPERIMENT_LOADING_MESSAGE,
                            techniques,
                            length=len(techniques)) as techniques:
        for t_name, t_entry in techniques:
            t_entry.update({NAME_COLNAME: t_name})
            t_metrics = tracer.get_metrics(dataset, t_name)
            metric_table.add(t_metrics, t_entry)

    return metric_table
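
# A minimal driver sketch for the helper above; the dataset name and export
# path are illustrative assumptions, not part of the original module.
if __name__ == "__main__":
    table = calculate_technique_metric_table("EasyClinic")
    table.sort(DATASET_COLUMN_ORDER).save("easyclinic_metric_table.csv")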
Example #7
    def run(self) -> Table:
        """
        Calculates percent best on the aggregate metric table.

        :exports:
            1. rq1 best
            2. aggregate best (includes RQ1 and RQ2)
        :return: Table - aggregate best table
        """

        # Read aggregate metric table
        rq1_aggregate = MetricTable(
            path_to_table=PATH_TO_METRIC_TABLE_AGGREGATE)
        best_df = pd.concat([
            rq1_aggregate.find_best_direct_techniques().table,
            rq1_aggregate.find_best_transitive_techniques().table,
            rq1_aggregate.find_best_hybrid_techniques().table,
        ])
        worst_df = pd.concat([
            rq1_aggregate.find_worst_direct_techniques().table,
            rq1_aggregate.find_worst_transitive_techniques().table,
            rq1_aggregate.find_worst_hybrid_techniques().table,
        ])

        for title, df in [("best", best_df), ("worst", worst_df)]:
            print(f"{title}-----------")
            for i in range(len(df)):
                entry = df.iloc[i]
                print(
                    "%20s %10s %10s" %
                    (entry["dataset"], entry["technique_type"], entry["name"]))

        def get_latex_row(data_df):
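            """Builds one LaTeX row: the hybrid entry's aggregate metric values per dataset, joined by '&'."""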
            metric_values = []
            for dataset_name in DATASET_COLUMN_ORDER:
                dataset_entries = data_df[
                    (data_df[DATASET_COLNAME] == dataset_name)
                    & (data_df[TECHNIQUE_TYPE_COLNAME] == HYBRID_ID)]
                entry = dataset_entries.iloc[0]
                for metric_name in BEST_TECHNIQUE_AGGREGATE_METRICS:
                    metric_value = entry[metric_name]
                    metric_values.append(str(round(metric_value, 3)))
            return "&".join(metric_values)

        best_row = get_latex_row(best_df)
        print("-" * 50)
        worst_row = get_latex_row(worst_df)

        print("BEST:", best_row)
        print("WORST:", worst_row)
Example #8
    def test_create_ranks_with_single_dataset(self):
        """
        Tests that ranks are accurate with respect to data containing a single dataset
        with no dataset column.
        :return: None
        """
        single_dataset = self.data.iloc[:4].drop(DATASET_COLNAME, axis=1)
        ranked_table = MetricTable(single_dataset).create_ranks().table

        self.assertEqual(2, ranked_table[RANK_COLNAME][0])
        self.assertEqual(1, ranked_table[RANK_COLNAME][1])
        self.assertEqual(2, ranked_table[RANK_COLNAME][2])
        self.assertEqual(1, ranked_table[RANK_COLNAME][3])
Example #9
    def assert_technique_extracted(self, name: str, expected_scores: List[float]):
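        """
        Asserts that each find_{name}_{family}_techniques query returns exactly
        one entry whose aggregate metrics equal expected_scores.
        """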
        metric_table = MetricTable(self.data)
        for family_name in self.technique_ids:
            table_method = getattr(
                metric_table, f"find_{name}_{family_name.lower()}_techniques"
            )
            technique_query = table_method().table
            self.assertEqual(1, len(technique_query))
            technique_entry = technique_query.iloc[0]
            for m_index, m_name in enumerate(BEST_TECHNIQUE_AGGREGATE_METRICS):
                self.assertEqual(
                    expected_scores[m_index],
                    technique_entry[m_name],
                )
Example #10
    def test_create_ranks_with_all_datasets(self):
        """
        Tests that ranks are accurate with respect to dataset x trace type groups.
        :return: None
        """
        ranked_table = MetricTable(self.data).create_ranks().table
        self.assertTrue(RANK_COLNAME in ranked_table.columns)

        self.assertEqual(2, ranked_table[RANK_COLNAME][0])
        self.assertEqual(1, ranked_table[RANK_COLNAME][1])
        self.assertEqual(2, ranked_table[RANK_COLNAME][2])
        self.assertEqual(1, ranked_table[RANK_COLNAME][3])
        self.assertEqual(1, ranked_table[RANK_COLNAME][4])
        self.assertEqual(2, ranked_table[RANK_COLNAME][5])
        self.assertEqual(1, ranked_table[RANK_COLNAME][6])
        self.assertEqual(2, ranked_table[RANK_COLNAME][7])
Example #11
    def run(self) -> Table:
        """
        Returns a metric table containing all of the metrics calculated for each technique in df
        :return: metric table with single query metrics for each technique applied to specified dataset in row
        """
        tracer = Tracer()
        metric_table = MetricTable()

        for dataset_name in DATASET_COLUMN_ORDER:
            hybrid_query_metrics: List[Metrics] = tracer.get_metrics(
                dataset_name, BEST_OVERALL_TECHNIQUE, summary_metrics=False)
            metric_table.add(
                hybrid_query_metrics,
                other={
                    DATASET_COLNAME: dataset_name,
                    TECHNIQUE_TYPE_COLNAME: HYBRID_ID,
                },
                create_index=True,
            )

            direct_query_metrics: List[Metrics] = tracer.get_metrics(
                dataset_name,
                get_best_direct_technique(dataset_name),
                summary_metrics=False,
            )
            metric_table.add(
                direct_query_metrics,
                other={
                    DATASET_COLNAME: dataset_name,
                    TECHNIQUE_TYPE_COLNAME: DIRECT_ID,
                },
                create_index=True,
            )

        individual_queries_aggregate = (metric_table.create_lag_norm_inverted(
            drop_old=True).melt_metrics(
                metric_value_col_name=METRIC_SCORE_COLNAME).sort(
                    DATASET_COLUMN_ORDER).col_values_to_upper(
                        METRIC_COLNAME).save(EXPORT_PATH))

        self.export_paths.append(PATH_TO_INDIVIDUAL_QUERIES_AGG)

        return individual_queries_aggregate
Example #12
# NOTE: the original snippet uses several ID and column-name constants without
# importing them; the two constants imports added below are assumed locations.
from api.constants.processing import (
    DIRECT_ALGEBRAIC_MODEL_COLNAME,
    TRANSITIVE_AGGREGATION_COLNAME,
    TRANSITIVE_ALGEBRAIC_MODEL_COLNAME,
    TRANSITIVE_SCALING_COLNAME,
    TRANSITIVE_TRACE_TYPE_COLNAME,
)
from api.constants.techniques import DIRECT_ID, TRANSITIVE_ID
from api.tables.metric_table import MetricTable
from api.technique.variationpoints.aggregation.aggregation_method import (
    AggregationMethod,
)
from api.technique.variationpoints.algebraicmodel.models import AlgebraicModel
from api.technique.variationpoints.scalers.scaling_method import ScalingMethod
from utilities.constants import DATASET_COLUMN_ORDER, PATH_TO_PRESENTATION
from utilities.technique_extractors import AGGREGATE_METRIC_TABLE

if __name__ == "__main__":
    data = AGGREGATE_METRIC_TABLE.table
    mask = (AGGREGATE_METRIC_TABLE.get_technique_type_mask(DIRECT_ID)
            | AGGREGATE_METRIC_TABLE.get_technique_type_mask(TRANSITIVE_ID)
            ) & (data[TRANSITIVE_TRACE_TYPE_COLNAME].isin(["direct", "none"]))

    masked_data = AGGREGATE_METRIC_TABLE.table[mask].reset_index(drop=True)
    table_metrics = (MetricTable(masked_data).create_lag_norm_inverted(
        drop_old=True).table)

    column_orders = {
        DIRECT_ALGEBRAIC_MODEL_COLNAME: [
            AlgebraicModel.VSM.value,
            AlgebraicModel.LSI.value,
        ],
        TRANSITIVE_ALGEBRAIC_MODEL_COLNAME: [
            AlgebraicModel.VSM.value,
            AlgebraicModel.LSI.value,
        ],
        TRANSITIVE_SCALING_COLNAME: [
            ScalingMethod.INDEPENDENT.value,
            ScalingMethod.GLOBAL.value,
        ],
        TRANSITIVE_AGGREGATION_COLNAME: [
Example #13
from api.constants.processing import DATASET_COLNAME, NAME_COLNAME
from api.constants.techniques import DIRECT_ID, HYBRID_ID, TRANSITIVE_ID
from api.tables.metric_table import MetricTable
from api.technique.definitions.combined.technique import (
    CombinedTechnique,
    create_technique_from_name,
)
from api.technique.definitions.direct.technique import DirectTechnique
from api.technique.definitions.sampled.definition import SAMPLED_COMMAND_SYMBOL
from api.technique.definitions.transitive.definition import TRANSITIVE_COMMAND_SYMBOL
from api.technique.definitions.transitive.technique import TransitiveTechnique
from api.technique.parser.itechnique import ITechnique
from utilities.constants import PATH_TO_METRIC_TABLE_AGGREGATE

AGGREGATE_METRIC_TABLE = MetricTable(path_to_table=PATH_TO_METRIC_TABLE_AGGREGATE)


def get_best_direct_technique(dataset_name: str) -> str:
    """
    Finds the best direct techniques and returns the one corresponding to the given dataset.
    :param dataset_name: the dataset whose best technique we are after
    :return: string - technique definition
    """
    best_df = AGGREGATE_METRIC_TABLE.find_best_direct_techniques().table.set_index(
        DATASET_COLNAME
    )
    if dataset_name not in best_df.index:
        raise Exception(
            f"Expected {dataset_name} to have metrics in {PATH_TO_METRIC_TABLE_AGGREGATE}"
        )
Example #14
    def run(self) -> Table:
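        """
        For every candidate artifact path, computes metrics for the best direct,
        transitive, and hybrid techniques (re-targeted to that path) on each
        dataset, then exports the aggregated metric and gain tables.
        :return: Table - the aggregated gain table
        """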
        tracer = Tracer()

        def get_metrics(d_name, t_def: str):
            return tracer.get_metrics(d_name, t_def)

        def add_metrics(d_name, t_def: str, t_type: str, p_name: str):
            t_metrics = get_metrics(d_name, t_def)
            metric_table.add(
                t_metrics,
                {
                    DATASET_COLNAME: d_name,
                    "path": p_name,
                    "type": t_type,
                    NAME_COLNAME: t_def,
                },
            )

        aggregate_gain = None
        aggregate_metric = None
        for path in POSSIBLE_PATHS:
            metric_table = MetricTable()
            comparison_dict = {}
            path_name = path_to_str(path)

            for dataset_name in DATASET_COLUMN_ORDER:
                source_index = str(path[0])
                intermediate_index = str(path[1])
                target_index = str(path[2])

                new_path = [source_index, intermediate_index, target_index]

                # direct
                direct_technique_def = change_paths_in_technique(
                    get_best_direct_technique(dataset_name), new_path)
                add_metrics(
                    dataset_name,
                    direct_technique_def,
                    DIRECT_ID,
                    path_name,
                )

                # transitive
                transitive_technique_def = change_paths_in_technique(
                    get_best_transitive_technique(dataset_name), new_path)
                add_metrics(
                    dataset_name,
                    transitive_technique_def,
                    TRANSITIVE_ID,
                    path_name,
                )

                # hybrid
                hybrid_technique_definition = change_paths_in_technique(
                    get_best_hybrid_technique(dataset_name), new_path)
                add_metrics(
                    dataset_name,
                    hybrid_technique_definition,
                    HYBRID_ID,
                    path_name,
                )
                comparison_dict.update({
                    dataset_name:
                    (direct_technique_def, hybrid_technique_definition)
                })
            gain_table = metric_table.calculate_gain_between_techniques(
                comparison_dict)
            gain_table.table["path"] = path_name

            aggregate_gain = (gain_table.table if aggregate_gain is None else
                              pd.concat([gain_table.table, aggregate_gain]))

            aggregate_metric = (metric_table.table
                                if aggregate_metric is None else pd.concat(
                                    [metric_table.table, aggregate_metric]))

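            # re-export the running aggregates after each completed path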
            MetricTable(aggregate_metric).create_lag_norm_inverted(
                drop_old=True).melt_metrics().save(METRIC_TABLE_EXPORT_PATH)
            self.export_paths.append(METRIC_TABLE_EXPORT_PATH)

            MetricTable(aggregate_gain).melt_metrics().save(
                GAIN_TABLE_EXPORT_PATH)
            self.export_paths.append(GAIN_TABLE_EXPORT_PATH)
        return aggregate_gain
Example #15
    def test_gain(self):
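        """Smoke test: prints the gain table computed between the "old" and "new" techniques."""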
        metric_table = MetricTable(self.data)
        gain = metric_table.calculate_gain_between_techniques(
            {self.TEST_DATASET_NAME: ("old", "new")}
        )
        print(gain)
Example #16
from api.tables.metric_table import MetricTable
from api.tracer import Tracer

if __name__ == "__main__":
    dataset_name = "EasyClinic"
    direct_technique = "(. (LSI NT) (0 2))"
    transitive_technique = "(x (PCA GLOBAL) ((. (LSI NT) (0 1)) (. (LSI NT) (1 2))))"
    hybrid_technique = f"(o (MAX) ({direct_technique} {transitive_technique}))"

    technique_definitions = [
        ("direct", direct_technique),
        ("transitive", transitive_technique),
        ("hybrid", hybrid_technique),
    ]

    metric_table = MetricTable()
    tracer = Tracer()

    for t_name, t_def in technique_definitions:
        t_metrics = tracer.get_metrics(dataset_name, t_def)
        metric_table.add(t_metrics, {"name": t_name})

    print(metric_table.table)
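
    # A hedged follow-on sketch: the post-processing chain used elsewhere in
    # this module could export these metrics for graphing. The output path is
    # an illustrative assumption, not part of the original script.
    metric_table.create_lag_norm_inverted(drop_old=True).melt_metrics().save(
        "easyclinic_best_techniques.csv")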
Example #17
    def run(self) -> Table:
        """
        Returns a metric table containing all of the metrics calculated for each technique in df
        :return: metric table with single query metrics for each technique applied to specified dataset in row
        """
        dataset_name = prompt_for_dataset()

        """
        Find best techniques
        """
        direct_best_definition = get_best_direct_technique(dataset_name)
        transitive_best_definition = get_best_transitive_technique(dataset_name)
        combined_best_definition = get_best_hybrid_technique(dataset_name)

        """
        Calculate metrics for individual queries on dataset
        """
        tracer = Tracer()
        metric_table = MetricTable()

        direct_metrics: List[Metrics] = tracer.get_metrics(
            dataset_name, direct_best_definition, summary_metrics=False
        )
        metric_table.add(
            direct_metrics, other={TECHNIQUE_TYPE_COLNAME: DIRECT_ID}, create_index=True
        )

        transitive_metrics: List[Metrics] = tracer.get_metrics(
            dataset_name, transitive_best_definition, summary_metrics=False
        )
        metric_table.add(
            transitive_metrics,
            other={TECHNIQUE_TYPE_COLNAME: TRANSITIVE_ID},
            create_index=True,
        )

        combined_metrics: List[Metrics] = tracer.get_metrics(
            dataset_name, combined_best_definition, summary_metrics=False
        )
        metric_table.add(
            combined_metrics,
            other={TECHNIQUE_TYPE_COLNAME: HYBRID_ID},
            create_index=True,
        )

        """
        Export individual run
        """
        export_path = os.path.join(PATH_TO_INDIVIDUAL_QUERIES, dataset_name + ".csv")
        metric_table.sort(DATASET_COLUMN_ORDER).save(export_path)
        self.export_paths.append(export_path)

        """
        Update aggregate
        """

        individual_queries_aggregate = (
            MetricTable(
                Table.aggregate_intermediate_files(PATH_TO_INDIVIDUAL_QUERIES).table
            )
            .create_lag_norm_inverted(drop_old=True)
            .melt_metrics(metric_value_col_name=METRIC_SCORE_COLNAME)
            .sort(DATASET_COLUMN_ORDER)
            .col_values_to_upper(METRIC_COLNAME)
            .to_title_case(exclude=METRIC_COLNAME)
            .save(PATH_TO_INDIVIDUAL_QUERIES_AGG)
        )

        individual_queries_unmelted = (
            MetricTable(
                Table.aggregate_intermediate_files(PATH_TO_INDIVIDUAL_QUERIES).table
            )
            .create_lag_norm_inverted(drop_old=True)
            .sort(DATASET_COLUMN_ORDER)
            .save(PATH_TO_INDIVIDUAL_QUERIES_UNMELTED)
        )

        # register the aggregate table for export
        self.export_paths.append(PATH_TO_INDIVIDUAL_QUERIES_AGG)

        return individual_queries_unmelted