def turn_aggregated_values_into_matrix(dataset: Dataset, values: np.ndarray):
    """
    Reshapes a flat vector of aggregated similarity values into the 2D
    similarity matrix between the dataset's first and last artifact levels.

    :param dataset: dataset supplying the artifact count of each level
    :param values: flat array of length n_first * n_last, ordered row-major
    :return: array of shape (n_artifacts(level 0), n_artifacts(level -1))
    """
    # NOTE(review): level -1 assumes queries always run between the first and
    # last levels -- confirm this holds for datasets with more than 3 levels.
    # Shape is passed positionally: the `newshape=` keyword is deprecated in
    # numpy >= 2.1 (renamed to `shape`).
    return np.reshape(
        values,
        (
            dataset.get_n_artifacts(0),
            dataset.get_n_artifacts(-1),
        ),
    )
def test_cleanup_deletes_on_dataset(self):
    """
    Verifies that Cache.cleanup removes cached entries only for the named
    dataset, leaving the other dataset's cache intact, and that cleaning
    both datasets empties both caches.
    """
    original_cache_value = Cache.CACHE_ON
    Cache.CACHE_ON = True
    try:
        dataset_other_name = "SAMPLE_EasyClinic"
        dataset_other = Dataset(dataset_other_name)
        tracer = Tracer()
        tracer.get_metrics(dataset_other_name, self.direct_technique_name)
        tracer.get_metrics(self.dataset.name, self.direct_technique_name)

        # both datasets start out cached
        self.assertTrue(
            Cache.is_cached(dataset_other, self.get_direct_definition()))
        self.assertTrue(
            Cache.is_cached(self.dataset, self.get_direct_definition()))

        # cleaning one dataset must not touch the other
        Cache.cleanup(self.dataset.name)
        self.assertTrue(
            Cache.is_cached(dataset_other, self.get_direct_definition()))
        self.assertFalse(
            Cache.is_cached(self.dataset, self.get_direct_definition()))

        Cache.cleanup(dataset_other_name)
        self.assertFalse(
            Cache.is_cached(dataset_other, self.get_direct_definition()))
        self.assertFalse(
            Cache.is_cached(self.dataset, self.get_direct_definition()))
    finally:
        # BUG FIX: restore the global flag even if an assertion above fails,
        # otherwise a failure here leaks CACHE_ON=True into later tests
        Cache.CACHE_ON = original_cache_value
def test_apply_transitive_aggregation_arithmetic(self):
    """MAX aggregation over the mock dataset's matrices yields a 1x3 result."""
    mock_dataset = Dataset("MockDataset")
    matrices = SimilarityMatrices(
        mock_dataset.traced_matrices["0-1"],
        mock_dataset.traced_matrices["1-2"],
    )
    aggregated = apply_transitive_aggregation(matrices, AggregationMethod.MAX)
    self.assertEqual((1, 3), aggregated.shape)
def test_dot_product_with_aggregation_with_fake_dataset(self):
    """Dot product with max aggregation reproduces the known mock scores."""
    mock_dataset = Dataset("MockDataset")
    matrices = SimilarityMatrices(
        mock_dataset.traced_matrices["0-1"],
        mock_dataset.traced_matrices["1-2"],
    )
    result = dot_product_with_aggregation(matrices, max)
    for col_index, expected_score in enumerate([1, 0, 1]):
        self.assertEqual(expected_score, result[0][col_index])
def get_dataset(self, name: str):
    """
    Returns the dataset with the given name, loading and caching it on
    first request.

    :param name: name of the dataset to retrieve
    :return: the previously cached Dataset if one exists in self.datasets,
             otherwise a newly constructed one (appended to the cache)
    """
    # scan the cache directly instead of materializing a filtered list
    for dataset in self.datasets:
        if dataset.name == name:
            return dataset
    dataset = Dataset(name)
    self.datasets.append(dataset)
    return dataset
def test_aggregate_similarity_matrices_with_arithmetic_aggregato_with_fake_dataset(
    self,
):
    """PCA is not an arithmetic aggregator, so aggregation must raise."""
    mock_dataset = Dataset("MockDataset")
    matrices = SimilarityMatrices(
        mock_dataset.traced_matrices["0-1"],
        mock_dataset.traced_matrices["1-2"],
    )
    with self.assertRaises(Exception):
        aggregate_similarity_matrices_with_arithmetic_aggregator(
            matrices, AggregationMethod.PCA
        )
def run(self) -> Table:
    """
    Builds a per-dataset summary of path and trace counts for the direct
    (0-2), upper (0-1), and lower (1-2) trace matrices, exports it as a CSV
    sorted by the number of direct traces, and records the export path.

    :return: an empty Table (the exported CSV is the real artifact)
    """
    columns = [
        DATASET_NAME,
        DIRECT_PATHS,
        DIRECT_TRACES,
        UPPER_PATHS,
        UPPER_TRACES,
        LOWER_PATHS,
        LOWER_TRACES,
    ]

    def stat_matrix(matrix):
        # a "path" is any source-target pair; a "trace" is a traced pair
        n_traces = matrix.sum(axis=1).sum()
        n_paths = matrix.shape[0] * matrix.shape[1]
        return n_paths, n_traces

    # collect rows in a list: DataFrame.append was removed in pandas 2.0,
    # and building once at the end avoids quadratic re-copying anyway
    entries = []
    for dataset_name in DATASET_COLUMN_ORDER:
        dataset = Dataset(dataset_name)
        d_paths, n_direct_traces = stat_matrix(dataset.traced_matrices["0-2"])
        u_paths, n_upper_traces = stat_matrix(dataset.traced_matrices["0-1"])
        l_paths, n_lower_traces = stat_matrix(dataset.traced_matrices["1-2"])
        entries.append({
            DATASET_NAME: dataset_name,
            DIRECT_PATHS: d_paths,
            DIRECT_TRACES: n_direct_traces,
            UPPER_PATHS: u_paths,
            UPPER_TRACES: n_upper_traces,
            LOWER_PATHS: l_paths,
            LOWER_TRACES: n_lower_traces,
        })

    data = pd.DataFrame(entries, columns=columns)
    post_df = data.sort_values(by=DIRECT_TRACES)
    post_df = post_df.round(N_SIG_FIGS)
    post_df.to_csv(EXPORT_PATH, index=False)
    self.export_paths.append(EXPORT_PATH)
    return Table()
def print_highest_ranking_link_in_query(
    dataset_name: str,
    technique_data: TransitiveTechniqueData,
    query_index: int,
    label_value: int,
    n_artifact: int,
):
    """
    Prints the highest-ranking (source, target) link with the given oracle
    label in a query, plus the intermediate artifacts contributing most to
    its transitive score.

    :param dataset_name: name of the dataset providing the 0-2 oracle matrix
    :param technique_data: transitive technique results (similarity matrix
        and the two component transitive matrices)
    :param query_index: row index of the query (source artifact) to inspect
    :param label_value: oracle label of links to consider
    :param n_artifact: how many top intermediate artifacts to display
    """
    dataset = Dataset(dataset_name)
    oracle_matrix = dataset.traced_matrices["0-2"]
    (
        link_source_index,
        link_target_index,
        link_rank,
        link_score,
    ) = get_highest_ranking_artifact_pair_indices(
        oracle_matrix,
        technique_data.similarity_matrix,
        query_index,
        label_value,
    )
    source_id = dataset.artifacts[0].iloc[link_source_index]["id"]
    target_id = dataset.artifacts[2].iloc[link_target_index]["id"]
    print(f"Link: {(source_id, target_id)}")
    print("Type:", label_value)
    # BUG FIX: original referenced the undefined name
    # `transitive_technique_data`; the parameter is `technique_data`
    print("Technique: ", technique_data.technique.get_name())
    print(f"Rank: {link_rank}")
    print(f"Score: {link_score}")

    # each intermediate artifact's contribution = upper score * lower score
    upper_intermediate_values = technique_data.transitive_matrices[0][
        link_source_index, :
    ].flatten()
    lower_intermediate_values = technique_data.transitive_matrices[1][
        :, link_target_index
    ].flatten()
    intermediate_values = upper_intermediate_values * lower_intermediate_values
    sorted_values = np.sort(intermediate_values)[::-1]
    best_intermediate_artifact_indices = np.argsort(intermediate_values)[::-1][
        :n_artifact
    ]
    best_intermediate_artifact_ids = list(
        dataset.artifacts[1].iloc[best_intermediate_artifact_indices]["id"]
    )
    print("Most influential intermediate artifacts:",
          best_intermediate_artifact_ids)
    print("Intermediate scores:", sorted_values[:n_artifact])
    print("Intermediate Sum", sum(intermediate_values))
    print("Intermediate Max:", max(intermediate_values))
def print_trace_link_ranks_per_technique(
    dataset_name: str,
    similarity_matrices: List[SimilarityMatrix],
    labels: List[str],
    query_index: int,
):
    """
    Prints the rank each true trace link of a query attains under every
    labeled similarity matrix.

    :param dataset_name: name of the dataset supplying the 0-2 oracle matrix
    :param similarity_matrices: predicted score matrices, one per technique
    :param labels: display label for each similarity matrix (parallel list)
    :param query_index: row index of the query being inspected
    """
    oracle_matrix = Dataset(dataset_name).traced_matrices["0-2"]
    n_trace_links = sum(oracle_matrix[query_index, :] == 1)
    print("Rankings of trace links in worst performing query")
    print(f"Trace links in query: {n_trace_links}")
    for matrix, matrix_label in zip(similarity_matrices, labels):
        ranks = get_ranks_of_trace_links(oracle_matrix, matrix, query_index)
        print(f"{matrix_label}: {ranks}")
def create_similarity_scoring_table_from_matrix(
    dataset: Dataset,
    source_level: int,
    target_level: int,
    similarity_matrix: SimilarityMatrix,
) -> ScoringTable:
    """
    Builds a ScoringTable pairing flattened predictions with flattened
    oracle labels for queries between two artifact levels.

    :param dataset: dataset containing the oracle values
    :param source_level: index of the level the queries originate from
    :param target_level: index of the level being queried against
    :param similarity_matrix: predicted scores between the two levels
    :return: two-column table of predicted and actual values
    """
    oracle_values = dataset.get_oracle_matrix(source_level,
                                              target_level).flatten()
    predicted_values = similarity_matrix.flatten()
    assert len(oracle_values) == len(
        predicted_values
    ), "oracle values does not match predicted values"
    return ScoringTable(predicted_values, oracle_values)
from api.datasets.dataset import Dataset

if __name__ == "__main__":
    # Prints the oracle link value for one EBT source/target pair, both
    # directly and through each candidate intermediate artifact.
    dataset_name = "EBT"
    source_artifact = 151
    target_artifact = 68
    intermediate_artifacts = [132, 135, 127, 113, 134]
    dataset = Dataset(dataset_name)

    for intermediate_artifact in intermediate_artifacts:
        # direct link plus the two transitive hops through the intermediate
        artifact_pairs = [
            (source_artifact, target_artifact, "direct"),
            (source_artifact, intermediate_artifact, "top"),
            (intermediate_artifact, target_artifact, "bottom"),
        ]
        for pair_source, pair_target, pair_label in artifact_pairs:
            source_level_index, source_index = \
                dataset.get_artifact_level_index(pair_source)
            target_level_index, target_index = \
                dataset.get_artifact_level_index(pair_target)
            trace_id = f"{source_level_index}-{target_level_index}"
            link_id = f"{pair_source}-{pair_target}"
            link_value = dataset.traced_matrices[trace_id][source_index,
                                                           target_index]
            print(f"{pair_label}:{link_id}:{link_value}")
        print("")
class TestMetricTable(SmartTest):
    """Checks metric computation and CSV export for a small scoring table."""

    t_name = "(x (SUM GLOBAL) ((. (VSM NT) (0 1)) (. (VSM NT) (1 2))))"
    component_a = [".", ["VSM", "NT"], [0, 1]]
    component_b = [".", ["VSM", "NT"], [1, 2]]
    technique = TransitiveTechniqueDefinition(["SUM", "GLOBAL"],
                                              [component_a, component_b])
    d_name = "MockDataset"
    dataset = Dataset(d_name)
    export_path = ".."

    # column 0 holds predictions, column 1 holds oracle labels
    values = np.array([
        [0.0, 0.0],
        [1.0, 1.0],
        [0.0, 1.0],
    ])
    n_queries = 1

    # reference metric values computed directly from the fixture columns
    expected_map = average_precision_score(values[:, 1], values[:, 0])
    expected_auc = calculate_auc(values[:, 1], values[:, 0])
    expected_lag = calculate_lag(values[:, 1], values[:, 0])

    def test_metric_table(self):
        """Metrics survive a round trip through a saved CSV file."""
        scoring_table = ScoringTable(self.values[:, 0], self.values[:, 1])
        metrics = calculate_metrics_for_scoring_table(
            scoring_table, self.n_queries, False)
        export_file = os.path.join(self.export_path, "test.csv")
        if os.path.exists(export_file):
            os.remove(export_file)
        table = Table(None)
        table.add(metrics)
        # file must only exist after the explicit save
        self.assertFalse(os.path.exists(export_file))
        table.save(export_file)
        self.assertTrue(os.path.exists(export_file))
        df = pd.read_csv(export_file)
        self.assertEqual(1, len(df))
        self.assertEqual(self.expected_lag, df.iloc[0]["lag"])
        os.remove(export_file)

    def test_metrics(self):
        """Computed lag/map/auc match the reference values."""
        scoring_table = ScoringTable(self.values[:, 0], self.values[:, 1])
        query_metrics = calculate_metrics_for_scoring_table(
            scoring_table, self.n_queries, False)
        first_metric = query_metrics[0]
        self.assertEqual(self.expected_lag, first_metric.lag, "lag")
        self.assertEqual(self.expected_map, first_metric.ap, "map")
        self.assertEqual(self.expected_auc, first_metric.auc, "auc")
# Collects, for the best transitive technique's top-5 queries, the delta
# between the (min-max scaled) predicted score and the oracle value of every
# candidate link, labeled by whether the link is traced.
good_transitive_technique = get_best_transitive_technique(good_dataset_name)
tracer = Tracer()
technique_data = tracer.get_technique_data(good_dataset_name,
                                           good_transitive_technique)
metrics = tracer.get_metrics(good_dataset_name,
                             good_transitive_technique,
                             summary_metrics=False)
sorted_metrics = sorted(metrics, key=lambda m: m.ap)
N_QUERIES = 5
bad_queries = [m.query_id for m in sorted_metrics[:N_QUERIES]]
good_queries = [m.query_id for m in sorted_metrics[-N_QUERIES:]]
similarity_matrix = minmax_scale(technique_data.similarity_matrix)
oracle_matrix = Dataset(good_dataset_name).traced_matrices["0-2"]
data = pd.DataFrame()
for g_query in good_queries:
    for col_index in range(similarity_matrix.shape[1]):
        score_value = similarity_matrix[g_query][col_index]
        oracle_value = oracle_matrix[g_query][col_index]
        delta_value = score_value - oracle_value
        entry = {
            "query_performance": "top_5",
            "value": delta_value,
            "type": "traced" if oracle_value == 1 else "not traced",
        }
        # BUG FIX: DataFrame.append was removed in pandas 2.0;
        # pd.concat is the supported equivalent and keeps `data` a DataFrame
        data = pd.concat([data, pd.DataFrame([entry])], ignore_index=True)
class TestTechniqueHelper(SmartTest):
    """
    Shared fixture base for technique tests: builds/exports the mock dataset
    once at class creation and defines the definition lists and expected
    technique-name strings for every technique flavor (direct, transitive,
    traced, sampled, combined).
    """
    d_name = "MockDataset"
    # class-creation side effect: build and export the mock dataset so every
    # test in subclasses can load it by name
    d_builder = DatasetBuilder(d_name)
    d_builder.build()
    d_builder.export()
    dataset = Dataset(d_name)
    """
    Direct
    """
    direct_algebraic_model = AlgebraicModel.VSM
    direct_trace_type = TraceType.NOT_TRACED
    direct_parameters = [direct_algebraic_model.value, direct_trace_type.value]
    direct_components = ["0", "2"]
    direct_definition = [
        DIRECT_COMMAND_SYMBOL, direct_parameters, direct_components
    ]
    """
    Intermediate
    """
    transitive_algebraic_model = AlgebraicModel.VSM
    transitive_aggregation_type = AggregationMethod.SUM
    transitive_component_scaling_type = ScalingMethod.GLOBAL
    transitive_component_trace_type = TraceType.NOT_TRACED
    # upper component: direct technique between levels 0 and 1
    transitive_component_a = [
        DIRECT_COMMAND_SYMBOL,
        [
            transitive_algebraic_model.value,
            transitive_component_trace_type.value
        ],
        ["0", "1"],
    ]
    transitive_upper_comp = "(%s (%s %s) (%s %s))" % (
        DIRECT_COMMAND_SYMBOL,
        transitive_algebraic_model.value,
        transitive_component_trace_type.value,
        "0",
        "1",
    )
    # lower component: direct technique between levels 1 and 2
    transitive_component_b = [
        DIRECT_COMMAND_SYMBOL,
        [
            transitive_algebraic_model.value,
            transitive_component_trace_type.value
        ],
        ["1", "2"],
    ]
    transitive_component_b_name = "(%s (%s %s) (%s %s))" % (
        DIRECT_COMMAND_SYMBOL,
        transitive_algebraic_model.value,
        transitive_component_trace_type.value,
        "1",
        "2",
    )
    transitive_parameters = [
        transitive_aggregation_type.value,
        transitive_component_scaling_type.value,
    ]
    transitive_components = [transitive_component_a, transitive_component_b]
    transitive_technique_definition = [
        TRANSITIVE_COMMAND_SYMBOL,
        transitive_parameters,
        transitive_components,
    ]
    """
    Traced Components
    """
    traced_component_type = TraceType.TRACED
    traced_aggregation_value = AggregationMethod.MAX
    traced_direct_component_a = [
        DIRECT_COMMAND_SYMBOL,
        [transitive_algebraic_model.value, traced_component_type.value],
        ["0", "1"],
    ]
    traced_direct_component_b = [
        DIRECT_COMMAND_SYMBOL,
        [transitive_algebraic_model.value, traced_component_type.value],
        ["1", "2"],
    ]
    traced_components = [traced_direct_component_a, traced_direct_component_b]
    traced_parameters = [
        traced_aggregation_value.value,
        transitive_component_scaling_type.value,
    ]
    """
    Sampled Artifacts
    """
    sample_percentage = 0.5
    # sampled techniques reuse the transitive parameters plus the percentage
    sampled_parameters: list = transitive_parameters + [repr(sample_percentage)]
    sampled_components = transitive_components
    sampled_artifacts_definition = [
        SAMPLED_COMMAND_SYMBOL,
        sampled_parameters,
        sampled_components,
    ]
    sampled_traces_definition = [
        SAMPLED_TRACED_COMMAND_SYMBOL,
        sampled_parameters,
        sampled_components,
    ]
    """
    Combined
    """
    combined_aggregation_type = AggregationMethod.SUM
    combined_parameters = ["SUM"]
    combined_components = [direct_definition, transitive_technique_definition]
    """
    Combined (with sampled transitive)
    """
    combined_sampled_artifacts_components = [
        direct_definition,
        sampled_artifacts_definition,
    ]
    combined_sampled_traces_components = [
        direct_definition, sampled_traces_definition
    ]
    # expected string names for each technique definition above
    direct_technique_name = "(. (VSM NT) (0 2))"
    transitive_technique_name = (
        "(x (SUM GLOBAL) ((. (VSM NT) (0 1)) (. (VSM NT) (1 2))))")
    transitive_sampled_artifacts_technique_name = (
        "(~ (SUM GLOBAL %f) ((. (VSM NT) (0 1)) (. (VSM NT) (1 2))))" %
        sample_percentage)
    transitive_sampled_traces_technique_name = (
        "($ (SUM GLOBAL %f) ((. (VSM NT) (0 1)) (. "
        "(VSM NT) (1 2))))" % sample_percentage)
    combined_technique_name = "(o (%s) (%s %s))" % (
        "SUM",
        direct_technique_name,
        transitive_technique_name,
    )
    combined_sampled_artifacts_technique_name = "(o (%s) (%s %s))" % (
        "SUM",
        direct_technique_name,
        transitive_sampled_artifacts_technique_name,
    )
    combined_sampled_traces_technique_name = "(o (%s) (%s %s))" % (
        "SUM",
        direct_technique_name,
        transitive_sampled_traces_technique_name,
    )

    def get_direct_definition(self) -> DirectTechniqueDefinition:
        """Returns the direct (0-2) technique definition."""
        return DirectTechniqueDefinition(self.direct_parameters,
                                         self.direct_components)

    def get_transitive_definition(self) -> TransitiveTechniqueDefinition:
        """Returns the untraced transitive technique definition."""
        return TransitiveTechniqueDefinition(self.transitive_parameters,
                                             self.transitive_components)

    def get_traced_transitive_definition(
            self) -> TransitiveTechniqueDefinition:
        """Returns the transitive definition with traced components."""
        return TransitiveTechniqueDefinition(self.traced_parameters,
                                             self.traced_components)

    def get_combined_definition(self) -> HybridTechniqueDefinition:
        """Returns the hybrid (direct + transitive) technique definition."""
        return HybridTechniqueDefinition(self.combined_parameters,
                                         self.combined_components)

    def get_sampled_technique_definition(self) -> SampledTechniqueDefinition:
        """Returns the sampled-artifacts technique definition."""
        return SampledTechniqueDefinition(self.sampled_parameters,
                                          self.sampled_components)

    def get_combined_sampled_artifacts_definition(
            self) -> HybridTechniqueDefinition:
        """Returns the hybrid definition using the sampled transitive part."""
        return HybridTechniqueDefinition(
            self.combined_parameters, self.combined_sampled_artifacts_components)

    def assert_valid_fake_dataset_similarity_matrix(
            self, similarity_matrix: SimilarityMatrix):
        """Asserts the matrix has the mock dataset's expected 1x3 shape."""
        self.assertEqual((1, 3), similarity_matrix.shape)

    def create_counter_func(self, t_name: str):
        """
        Creates a callback that verifies the TechniqueData it receives and
        counts invocations; returns (callback, counter_dict) where
        counter_dict["value"] holds the number of calls.
        """
        n_function_calls = {"value": 0}

        def counter_func(data: TechniqueData):
            self.assertEqual(self.d_name, data.dataset.name)
            self.assertEqual(t_name, data.technique.get_name())
            n_function_calls["value"] = n_function_calls["value"] + 1

        return counter_func, n_function_calls
from api.tracer import Tracer

# Compares the direct similarity score with the transitive score
# (top * bottom) for the single artifact pair of the illustrative example.
DATASET_NAME = "IllustrativeExample"
TOP_TECHNIQUE_NAME = "(. (VSM NT) (0 1))"
BOTTOM_TECHNIQUE_NAME = "(. (VSM NT) (1 2))"
DIRECT_TECHNIQUE_NAME = "(. (VSM NT) (0 2))"
TECHNIQUE_NAME = "(x (MAX INDEPENDENT) ((. (VSM NT) (0 1)) (. (VSM NT) (1 2))))"
REBUILD = False  # set True to rebuild and re-export the dataset first

if __name__ == "__main__":
    if REBUILD:
        dataset_builder = DatasetBuilder(DATASET_NAME)
        dataset_builder.build()
        dataset_builder.export()
    dataset = Dataset(DATASET_NAME)
    tracer = Tracer()
    top_technique_data = tracer.get_technique_data(DATASET_NAME,
                                                   TOP_TECHNIQUE_NAME)
    bottom_technique_data = tracer.get_technique_data(DATASET_NAME,
                                                      BOTTOM_TECHNIQUE_NAME)
    direct_technique_data = tracer.get_technique_data(DATASET_NAME,
                                                      DIRECT_TECHNIQUE_NAME)
    # scores read for artifact pair (0, 0) of each similarity matrix;
    # transitive/direct scores are presumably printed further below
    top_score = top_technique_data.similarity_matrix[0][0]
    bottom_score = bottom_technique_data.similarity_matrix[0][0]
    transitive_score = top_score * bottom_score
    direct_score = direct_technique_data.similarity_matrix[0][0]
    print("TOP:", top_score)
def test_sampled_transitive_technique_calculator(self):
    """Sampled-artifact calculation yields a non-empty similarity matrix."""
    definition = self.get_sampled_technique_definition()
    calculator = SampledArtifactsTechniqueCalculator(definition)
    technique_data = calculator.calculate_technique_data(
        Dataset("SAMPLE_EasyClinic"))
    total_similarity = technique_data.similarity_matrix.sum(axis=1).sum()
    self.assertGreater(total_similarity, 0)
""" How much was the maximum help the transitive technique provided? """ traced_df = df[df["traced?"] == 1] example_item = traced_df.iloc[traced_df["delta"].argmax()] example_item_idx = example_item["index"] example_score_delta = example_item["delta"] example_technique_scores = ( ("direct", example_item["direct"]), ("transitive", example_item["transitive"]), ("combined", example_item["combined"]), ) """ What artifact pair benefited the most from the transitive technique? """ dataset = Dataset(dataset_name) top_artifacts = dataset.artifacts.artifact_levels[0] intermediate_artifacts = dataset.artifacts.artifact_levels[1] bottom_artifacts = dataset.artifacts.artifact_levels[2] top_artifact_idx = int(example_item_idx // len(bottom_artifacts)) bottom_artifact_idx = int(example_item_idx % len(bottom_artifacts)) top_artifact = top_artifacts.iloc[top_artifact_idx] bottom_artifact = bottom_artifacts.iloc[bottom_artifact_idx] """ What where the top n most beneficial intermediate artifacts? """ upper = transitive_technique_data.transitive_matrices[0] lower = transitive_technique_data.transitive_matrices[1]
matrices = [
    tracer.get_technique_data(dataset_name, t).similarity_matrix
    for t in techniques
]
# scale every technique's scores to [0, 1] so rank groups are comparable
matrices = list(map(minmax_scale, matrices))


def get_group(percentile):
    # buckets a percentile into equal thirds
    if percentile < 1 / 3:
        return "low"
    elif percentile < 2 / 3:
        return "medium"
    else:
        return "high"


trace_matrix = Dataset(dataset_name).traced_matrices["%s-%s" %
                                                     (new_path[0], new_path[2])]
entries = []
for row_index in range(matrices[0].shape[0]):
    original_groups = []
    # NOTE(review): labels assume `techniques` is ordered
    # direct, transitive, hybrid -- confirm at the call site
    for family, matrix in zip(["direct", "transitive", "hybrid"], matrices):
        query_ranks = pd.Series(matrix[row_index, :]).rank()
        # converts ranks to a 0..1 percentile within the query row
        query_percentiles = 1 - (query_ranks / max(query_ranks))
        # groups are captured from the first (direct) matrix only
        if len(original_groups) == 0:
            original_groups = list(map(get_group, query_percentiles))
    for col_index in range(matrices[0].shape[1]):
        trace_value = trace_matrix[row_index, col_index]
        entries.append({
        # NOTE(review): indentation reconstructed -- this is the tail of a
        # function whose definition starts before this view; ranks are
        # normalized by the number of possible ranks before being returned
        base_ranks.append(base_rank / n_possible_ranks)
        target_ranks.append(target_rank / n_possible_ranks)
    return rank_gains, base_ranks, target_ranks


if __name__ == "__main__":
    # TODO: Move this to be an experiment
    datasets = ["WARC", "EBT", "EasyClinic", "Drone", "TrainController"]
    WORD_INTERSECTION_EXPORT_PATH = os.path.join(
        PATH_TO_PRESENTATION, "word_intersection.csv"
    )
    MAX_N_WORDS = 10
    df = pd.DataFrame()
    for d_name in datasets:
        d = Dataset(d_name)
        # best techniques per dataset for the comparison below
        direct_technique = get_best_direct_technique(d_name)
        transitive_technique = get_best_transitive_technique(d_name)
        tracer = Tracer()
        direct_similarity_matrix = tracer.get_technique_data(
            d_name, direct_technique
        ).similarity_matrix
        transitive_similarity_matrix = tracer.get_technique_data(
            d_name, transitive_technique
        ).similarity_matrix
        for n_intersection_words in range(MAX_N_WORDS):
            non_intersection_artifact_indices = (
                get_artifact_indices_with_word_intersection(d, n_intersection_words)
            )
            # body continues beyond this view
            if len(non_intersection_artifact_indices) == 0: