Example 1
    def run(self) -> Table:
        """
        Calculates the metric table for all techniques and applies the post-processing steps defined in this module.
        :return: metric table containing metrics for every technique
        """
        Cache.CACHE_ON = True
        dataset_name = prompt_for_dataset()
        metric_table = calculate_technique_metric_table(dataset_name)
        metric_table.sort(DATASET_COLUMN_ORDER).save(
            create_export_path(dataset_name))

        # aggregate intermediate metric tables and export
        aggregate_metric_table = MetricTable(
            Table.aggregate_intermediate_files(PATH_TO_METRIC_TABLES).sort(
                DATASET_COLUMN_ORDER).table).save(
                    PATH_TO_METRIC_TABLE_AGGREGATE)

        # create graphable metrics and export table
        aggregate_metric_table.create_lag_norm_inverted(
            drop_old=True).melt_metrics().col_values_to_upper(
                METRIC_COLNAME).save(PATH_TO_GRAPH_METRIC_TABLE_AGGREGATE)

        self.export_paths.append(create_export_path(dataset_name))
        self.export_paths.append(PATH_TO_METRIC_TABLE_AGGREGATE)
        Cache.cleanup(dataset_name)
        return metric_table
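
For context, a minimal sketch of what aggregating intermediate files likely amounts to, assuming each dataset run saves its own CSV into a shared folder (the helper below is hypothetical, not the project's actual API):

    import glob
    import os

    import pandas as pd

    def aggregate_intermediate_files(folder: str) -> pd.DataFrame:
        """Concatenate every per-dataset CSV in folder into one frame."""
        paths = sorted(glob.glob(os.path.join(folder, "*.csv")))
        return pd.concat([pd.read_csv(p) for p in paths], ignore_index=True)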
Example 2
    def run(self) -> Table:

        print(WELCOME_MESSAGE, end="\n\n")

        while True:
            experiment_name = click.prompt(
                "What experiment would you like to run?",
                type=click.Choice(REGISTERED_EXPERIMENT_NAMES + [EXIT_COMMAND],
                                  case_sensitive=False),
            )
            if experiment_name == EXIT_COMMAND:
                print("\n\nGoodbye!")
                break
            print(EXPERIMENT_RUN_DELIMITER)
            print("Running Experiment: %s" % experiment_name)
            experiment = EXPERIMENT_NAME_MAP[experiment_name]()
            result = experiment.run()
            for e_path in experiment.export_paths:
                print(
                    "Exported: ",
                    os.path.normpath(
                        os.path.relpath(e_path,
                                        start=os.path.join(PATH_TO_ROOT,
                                                           ".."))),
                )
            print(EXPERIMENT_RUN_DELIMITER)
        return Table()
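
The loop above leans on click's Choice type to validate input against the registered experiment names. A self-contained sketch of the same pattern (the experiment names here are made up):

    import click

    EXPERIMENTS = ["metric_table", "correlation"]  # hypothetical names
    EXIT_COMMAND = "exit"

    def main() -> None:
        while True:
            choice = click.prompt(
                "What experiment would you like to run?",
                type=click.Choice(EXPERIMENTS + [EXIT_COMMAND],
                                  case_sensitive=False),
            )
            if choice == EXIT_COMMAND:
                print("Goodbye!")
                break
            print(f"Running Experiment: {choice}")

    if __name__ == "__main__":
        main()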
Example 3
    def create_correlation_table(self) -> "Table":
        """
        :return: Table containing columns describing the correlation and p-value for each dataset-metric combination.
        """
        data = self.table.copy()
        correlation_records = []
        metrics = data[METRIC_COLNAME].unique()
        datasets = data[DATASET_COLNAME].unique()

        queryable = data.set_index([DATASET_COLNAME, METRIC_COLNAME])
        for dataset_name in datasets:
            for metric_name in metrics:
                query = queryable.loc[dataset_name, metric_name]

                metric_values: List[float] = list(query["value"])
                percent_values: List[float] = list(query["percent"])

                correlation, p_value = spearmanr(metric_values, percent_values)
                # flip sign for metrics where lower values are better
                correlation = (-1 * correlation
                               if metric_name in INVERTED_METRICS else
                               correlation)
                correlation_records.append({
                    DATASET_COLNAME: dataset_name,
                    METRIC_COLNAME: metric_name.lower(),
                    CORRELATION_COLNAME: round(correlation, N_SIG_FIGS),
                    P_VALUE_COLNAME: ("<0.001" if p_value < 0.001 else
                                      str(round(p_value, N_SIG_FIGS))),
                })
        return Table(pd.DataFrame(correlation_records))
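
scipy's spearmanr returns the correlation coefficient and its p-value as a pair, which is what drives the sign flip above: metrics in INVERTED_METRICS are presumably ones where lower values are better, so inverting makes all correlations read the same way. A standalone sketch with made-up values:

    from scipy.stats import spearmanr

    metric_values = [0.9, 0.7, 0.5, 0.2]    # hypothetical metric scores
    percent_values = [0.25, 0.5, 0.75, 1.0]

    correlation, p_value = spearmanr(metric_values, percent_values)
    # for a "lower is better" metric, invert so positive means improvement
    print(-1 * correlation, p_value)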
Example 4
    def run(self) -> Table:
        with create_loading_bar(
                EXPERIMENT_NAME, DATASET_COLUMN_ORDER,
                len(DATASET_COLUMN_ORDER)) as d_iterable:
            for dataset_name in d_iterable:
                builder = DatasetBuilder(dataset_name)
                builder.build()
                builder.export()
                print(f"{dataset_name} exported.")
        return Table()
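
create_loading_bar appears to be a project helper that wraps an iterable in a progress-bar context manager. A rough equivalent using tqdm as a stand-in (tqdm is not necessarily what the project uses; this only illustrates the pattern):

    from tqdm import tqdm

    datasets = ["dataset_a", "dataset_b"]  # hypothetical dataset names

    with tqdm(datasets, desc="Dataset export", total=len(datasets)) as d_iterable:
        for dataset_name in d_iterable:
            print(f"{dataset_name} exported.")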
Example 5
    def calculate_percent_best(self) -> Table:
        """
        For each transitive trace type and variation point, calculates the percent of times it had a rank of 1 across
        all datasets. Missing group columns are ignored.
        :return: Table containing the percent-best score for each variation point value
        """
        data = self.create_ranks().table.copy()

        # 1. extract variation points (e.g. AlgebraicModel, TraceType, etc.)
        non_vp_columns = (ALL_METRIC_NAMES + META_COLS + [
            RANK_COLNAME, TECHNIQUE_TYPE_COLNAME, TRANSITIVE_TRACE_TYPE_COLNAME
        ])
        vp_cols = [col for col in data.columns if col not in non_vp_columns]

        percent_best_records = []
        n_datasets = (len(data[DATASET_COLNAME].unique())
                      if DATASET_COLNAME in data.columns else 1)

        group_by_cols_in_dataset = [
            col for col in [
                TRANSITIVE_TRACE_TYPE_COLNAME,
                TECHNIQUE_TYPE_COLNAME,
            ] if col in data.columns
        ]

        for variation_point in vp_cols:
            for group_id, group_data in data.groupby([variation_point] +
                                                     group_by_cols_in_dataset):
                best_rank_query = group_data[group_data[RANK_COLNAME] == 1]
                n_datasets_in_query = (
                    1 if DATASET_COLNAME not in best_rank_query.columns else
                    len(best_rank_query[DATASET_COLNAME].unique()))
                vp_freq = n_datasets_in_query / n_datasets
                new_record = {
                    VARIATION_POINT_COLNAME: variation_point,
                    TECHNIQUE_COLNAME: group_id[0],
                    PERCENT_BEST_COLNAME: vp_freq,
                }

                if len(group_id) >= 2:
                    new_record.update(
                        {TRANSITIVE_TRACE_TYPE_COLNAME: group_id[1]})

                if len(group_id) >= 3:
                    new_record.update({TECHNIQUE_TYPE_COLNAME: group_id[2]})

                percent_best_records.append(new_record)

        return Table(pd.DataFrame(percent_best_records))
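
The percent-best computation reduces to: for each value of a variation point, count in how many datasets it achieved rank 1, then divide by the total number of datasets. A toy version with hypothetical column names:

    import pandas as pd

    df = pd.DataFrame({
        "dataset": ["d1", "d1", "d2", "d2"],
        "algebraic_model": ["VSM", "LSI", "VSM", "LSI"],
        "rank": [1, 2, 1, 2],
    })

    n_datasets = df["dataset"].nunique()
    best = df[df["rank"] == 1]
    percent_best = best.groupby("algebraic_model")["dataset"].nunique() / n_datasets
    print(percent_best)  # VSM -> 1.0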
Example 6
    def run(self) -> Table:
        """
        Iterates through all datasets and records the number of candidate paths and traced links in the direct,
        upper, and lower trace matrices.
        :return: empty Table; the computed statistics are exported to EXPORT_PATH
        """
        columns = [
            DATASET_NAME,
            DIRECT_PATHS,
            DIRECT_TRACES,
            UPPER_PATHS,
            UPPER_TRACES,
            LOWER_PATHS,
            LOWER_TRACES,
        ]
        records = []
        for dataset_name in DATASET_COLUMN_ORDER:
            dataset = Dataset(dataset_name)
            n_top = len(dataset.artifacts[0])
            n_middle = len(dataset.artifacts[1])
            n_bottom = len(dataset.artifacts[2])

            def stat_matrix(matrix):
                n_traces = matrix.sum(axis=1).sum()
                n_paths = matrix.shape[0] * matrix.shape[1]
                return n_paths, n_traces

            d_paths, n_direct_traces = stat_matrix(
                dataset.traced_matrices["0-2"])
            u_paths, n_upper_traces = stat_matrix(
                dataset.traced_matrices["0-1"])
            l_paths, n_lower_traces = stat_matrix(
                dataset.traced_matrices["1-2"])

            entry = {
                DATASET_NAME: dataset_name,
                DIRECT_PATHS: d_paths,
                DIRECT_TRACES: n_direct_traces,
                UPPER_PATHS: u_paths,
                UPPER_TRACES: n_upper_traces,
                LOWER_PATHS: l_paths,
                LOWER_TRACES: n_lower_traces,
            }
            records.append(entry)
        post_df = pd.DataFrame(records, columns=columns).sort_values(
            by=DIRECT_TRACES)

        post_df = post_df.round(N_SIG_FIGS)
        post_df.to_csv(EXPORT_PATH, index=False)
        self.export_paths.append(EXPORT_PATH)
        return Table()
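
stat_matrix treats a trace matrix as a 2-D array of 0/1 trace indicators: the number of candidate paths is rows times columns, and the number of traces is the sum of all entries (the original's matrix.sum(axis=1).sum() is equivalent to a plain sum). A numpy sketch with a made-up matrix:

    import numpy as np

    matrix = np.array([[1, 0, 0],
                       [0, 1, 1]])

    n_paths = matrix.shape[0] * matrix.shape[1]  # 6 candidate links
    n_traces = matrix.sum()                      # 3 actual traces
    print(n_paths, n_traces)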
Example 7
    def melt_metrics(self,
                     metric_col_name=METRIC_COLNAME,
                     metric_value_col_name="value") -> Table:
        """
        Converts each metric column in the table into a row entry containing all identifying information
        (taken to be all non-metric columns) and the metric score.
        :return: Table containing metric row entries alongside the identifying information
        """
        metrics_found = [
            metric for metric in ALL_METRIC_NAMES
            if metric in self.table.columns
        ]
        other_columns = [
            col for col in self.table.columns if col not in metrics_found
        ]
        melted_df = pd.melt(
            self.table,
            id_vars=other_columns,
            value_vars=metrics_found,
            var_name=metric_col_name,
            value_name=metric_value_col_name,
        )
        return Table(melted_df)
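
A small demonstration of what melt_metrics produces, using a toy wide-format frame with hypothetical columns: each metric column becomes (metric, value) rows while the identifying columns repeat.

    import pandas as pd

    wide = pd.DataFrame({
        "dataset": ["d1", "d2"],
        "ap": [0.5, 0.6],
        "auc": [0.8, 0.9],
    })

    long = pd.melt(
        wide,
        id_vars=["dataset"],
        value_vars=["ap", "auc"],
        var_name="metric",
        value_name="value",
    )
    print(long)
    #   dataset metric  value
    # 0      d1     ap    0.5
    # 1      d2     ap    0.6
    # 2      d1    auc    0.8
    # 3      d2    auc    0.9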
Example 8
    def test_metric_table(self):
        scoring_table = ScoringTable(self.values[:, 0], self.values[:, 1])
        metrics = calculate_metrics_for_scoring_table(scoring_table,
                                                      self.n_queries, False)

        test_file_name = "test.csv"
        export_path = os.path.join(self.export_path, test_file_name)
        if os.path.exists(export_path):
            os.remove(export_path)

        table = Table(None)
        table.add(metrics)

        # test export
        self.assertFalse(os.path.exists(export_path))
        table.save(export_path)
        self.assertTrue(os.path.exists(export_path))
        df = pd.read_csv(export_path)
        self.assertEqual(1, len(df))
        self.assertEqual(self.expected_lag, df.iloc[0]["lag"])

        os.remove(export_path)

    def run(self) -> Table:
        """
        Returns a metric table containing all metrics calculated for each technique in the dataset.
        :return: metric table with single-query metrics for each technique applied to the specified dataset
        """
        dataset_name = prompt_for_dataset()

        """
        Find best techniques
        """
        direct_best_definition = get_best_direct_technique(dataset_name)
        transitive_best_definition = get_best_transitive_technique(dataset_name)
        combined_best_definition = get_best_hybrid_technique(dataset_name)

        """
        Calculate metrics for individual queries on dataset
        """
        tracer = Tracer()
        metric_table = MetricTable()

        direct_metrics: List[Metrics] = tracer.get_metrics(
            dataset_name, direct_best_definition, summary_metrics=False
        )
        metric_table.add(
            direct_metrics, other={TECHNIQUE_TYPE_COLNAME: DIRECT_ID}, create_index=True
        )

        transitive_metrics: List[Metrics] = tracer.get_metrics(
            dataset_name, transitive_best_definition, summary_metrics=False
        )
        metric_table.add(
            transitive_metrics,
            other={TECHNIQUE_TYPE_COLNAME: TRANSITIVE_ID},
            create_index=True,
        )

        combined_metrics: List[Metrics] = tracer.get_metrics(
            dataset_name, combined_best_definition, summary_metrics=False
        )
        metric_table.add(
            combined_metrics,
            other={TECHNIQUE_TYPE_COLNAME: HYBRID_ID},
            create_index=True,
        )

        """
        Export individual run
        """
        export_path = os.path.join(PATH_TO_INDIVIDUAL_QUERIES, dataset_name + ".csv")
        metric_table.sort(DATASET_COLUMN_ORDER).save(export_path)
        self.export_paths.append(export_path)

        """
        Update aggregate
        """

        individual_queries_aggregate = (
            MetricTable(
                Table.aggregate_intermediate_files(PATH_TO_INDIVIDUAL_QUERIES).table
            )
            .create_lag_norm_inverted(drop_old=True)
            .melt_metrics(metric_value_col_name=METRIC_SCORE_COLNAME)
            .sort(DATASET_COLUMN_ORDER)
            .col_values_to_upper(METRIC_COLNAME)
            .to_title_case(exclude=METRIC_COLNAME)
            .save(PATH_TO_INDIVIDUAL_QUERIES_AGG)
        )

        individual_queries_unmelted = (
            MetricTable(
                Table.aggregate_intermediate_files(PATH_TO_INDIVIDUAL_QUERIES).table
            )
            .create_lag_norm_inverted(drop_old=True)
            .sort(DATASET_COLUMN_ORDER)
            .save(PATH_TO_INDIVIDUAL_QUERIES_UNMELTED)
        )

        # register the aggregate table as an export
        self.export_paths.append(PATH_TO_INDIVIDUAL_QUERIES_AGG)

        return individual_queries_aggregate
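
create_lag_norm_inverted is not shown here; from its use alongside drop_old=True it presumably rescales the lag metric so that higher is better and removes the original column. A hedged sketch of that assumed transformation (the formula below is an assumption, not the project's definition):

    import pandas as pd

    df = pd.DataFrame({"lag": [0.0, 5.0, 10.0]})
    # assumed normalization: 1 - lag / max(lag), so zero lag maps to 1.0
    df["lag_norm_inverted"] = 1 - df["lag"] / df["lag"].max()
    df = df.drop(columns=["lag"])  # analogue of drop_old=True
    print(df)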