Example 1
    def test_load_simulated_data_graph(self):
        simulated_data = TargetData.simulated_data_1()
        self.assertTrue(simulated_data is not None,
                        "No simulated data loaded.")

        dot_str = TargetData.simulated_data_1_graph()
        self.assertTrue(dot_str is not None, "No simulated graph loaded.")
Example 2
    def test_scm_generation(self):
        scm1 = TargetData.scm1()
        scm2 = TargetData.scm2()
        scm3 = TargetData.scm3()
        self.assertTrue(scm1 is not None)
        self.assertTrue(scm2 is not None)
        self.assertTrue(scm3 is not None)
Example 3
    def test_shd(self):
        # get the simulated data
        simulated_data = TargetData.simulated_data_1()
        dot_str = self.pc_util.algo_pc(simulated_data)
        pred_graph = GraphUtil.get_digraph_from_dot(dot_str)

        # get the known data
        dot_str = TargetData.simulated_data_1_graph()
        target_graph = GraphUtil.get_digraph_from_dot(dot_str)
        metrics = GraphMetrics()
        shd = metrics.SHD(target_graph, pred_graph)

        self.assertEqual(shd, 10)
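The SHD value asserted above is the Structural Hamming Distance: the number of edge edits (additions, deletions, reversals) needed to turn the predicted graph into the target graph. A minimal sketch of that idea on networkx digraphs, assuming both graphs share the same node set and unweighted 0/1 adjacency matrices (illustrative only, not necessarily how GraphMetrics.SHD is implemented):

import networkx as nx
import numpy as np

def shd_sketch(target_graph, pred_graph):
    # compare adjacency matrices over a shared node ordering
    nodes = sorted(target_graph.nodes())
    t = nx.to_numpy_array(target_graph, nodelist=nodes, weight=None)
    p = nx.to_numpy_array(pred_graph, nodelist=nodes, weight=None)
    diff = np.abs(t - p)
    # a reversed edge differs in two cells; fold it into one mistake
    diff = diff + diff.T
    diff[diff > 1] = 1
    return int(np.triu(diff).sum())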
Example 4
    def test_precision_recall(self):
        # get the simulated data
        simulated_data = TargetData.simulated_data_1()
        dot_str = self.pc_util.algo_pc(simulated_data)
        pred_graph = GraphUtil.get_digraph_from_dot(dot_str)

        # get the known data
        dot_str = TargetData.simulated_data_1_graph()
        target_graph = GraphUtil.get_digraph_from_dot(dot_str)
        metrics = GraphMetrics()
        prec_recall = metrics.precision_recall(target_graph, pred_graph)

        self.assertAlmostEqual(prec_recall[0], 0.4125)
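The first tuple element checked here is the area under the precision-recall curve (stored as AUPRC on each result in the App class below). As a simpler reference point, plain precision and recall over the edge sets of two digraphs can be sketched as follows (an illustration of the idea, not the project's exact computation):

def edge_precision_recall(target_graph, pred_graph):
    # treat discovered edges as predictions against the target edges
    target_edges = set(target_graph.edges())
    pred_edges = set(pred_graph.edges())
    true_positives = len(target_edges & pred_edges)
    precision = true_positives / len(pred_edges) if pred_edges else 0.0
    recall = true_positives / len(target_edges) if target_edges else 0.0
    return precision, recall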
Example 5
    def test_create_known_graph(self):
        dot_str = TargetData.simulated_data_1_graph()
        graph = GraphUtil.get_digraph_from_dot(dot_str)
        self.assertTrue(graph.edges() is not None,
                        "No known simulated graph created.")
        self.assertTrue(graph.nodes() is not None,
                        "No known simulated graph created.")
Example 6
    def test_randomforest_feature_reduction(self):
        hepart_data = TargetData.hepar2_100_data()
        self.assertTrue(hepart_data is not None, "No data loaded.")
        feature_list = FeatureSelectionRunner.random_forest_feature_reduction(
            hepart_data, 10)
        df_reduced = self.bgmm.get_reduced_dataframe(hepart_data, feature_list)
        self.assertTrue(df_reduced is not None)
Example 7
    def test_rfe(self):
        hepart_data = TargetData.hepar2_100_data()
        self.assertTrue(hepart_data is not None, "No data loaded.")
        feature_indices = FeatureSelectionRunner.rfe_feature_reduction(
            hepart_data, 10)
        df_reduced = self.bgmm.get_reduced_dataframe(hepart_data,
                                                     feature_indices)
        self.assertTrue(df_reduced is not None)
Example 8
    def test_all_returned_features(self):
        feature_set = set()
        hepart_data = TargetData.hepar2_100_data()
        feature_set.update(FeatureSelectionRunner.random_forest_feature_reduction(hepart_data, 10))
        feature_set.update(FeatureSelectionRunner.pfa_feature_reduction(hepart_data, 10))
        feature_set.update(FeatureSelectionRunner.linear_regression_feature_reduction(hepart_data, 10))
        feature_set.update(FeatureSelectionRunner.xgboost_feature_reduction(hepart_data, 10))
        feature_set.update(FeatureSelectionRunner.rfe_feature_reduction(hepart_data, 10))
        df_reduced = self.bgmm.get_reduced_dataframe(hepart_data, list(feature_set))
        self.assertTrue(df_reduced is not None)
Example 9
    def test_hepar2_graph(self):
        hepar2_dot = TargetData.hepar2_graph()
        self.assertTrue(hepar2_dot is not None)
Example 10
    def test_hepar2_100_data(self):
        hepar2_100_data = TargetData.hepar2_100_data()
        self.assertTrue(hepar2_100_data is not None)
Example 11
    def test_random_scm(self):
        random_scm = TargetData.random_scm1()
        self.assertTrue(random_scm is not None)
Example 12
class App:
    """
    The main AitiaExplorer app entry point.
    """

    algo_runner = AlgorithmRunner()
    feature_selection = FeatureSelectionRunner()
    graph_metrics = GraphMetrics()
    graph_util = GraphUtil()
    data = TargetData()
    bgmm = BayesianGaussianMixtureWrapper()

    def __init__(self):
        self.vm_running = False

    def run_analysis_with_high_low(self,
                                   incoming_df,
                                   target_graph_str=None,
                                   feature_high=None,
                                   feature_low=None,
                                   feature_selection_list=None,
                                   algorithm_list=None,
                                   pc=None,
                                   verbose=True
                                   ):
        """
        Runs the entire analysis with feature selection and causal discovery for each
        feature count from feature_high down to feature_low. Returns the best results in a separate dataframe.

        :param incoming_df: dataframe
        :param target_graph_str: string in dot format
        :param feature_high: upper bound of the feature count range
        :param feature_low: lower bound of the feature count range
        :param feature_selection_list: list of feature selection algorithms
        :param algorithm_list: list of causal discovery algorithms
        :param pc: py-causal object for java vm communication
        :param verbose: verbose boolean
        :return: tuple:
            (AnalysisResults obj,
            best result dataframe,
            target graph (approximated or otherwise),
            all results dataframe)
        """
        if feature_high is None:
            # just default to number of features in dataframe
            feature_high = len(list(incoming_df))

        if feature_low is None:
            # just default to 1
            feature_low = 1

        if target_graph_str is None:
            # no target graph has been supplied, so let's create an approximation
            # using the hill climbing algorithm
            if verbose:
                print("No target graph has been supplied.")
                print("The system will generate an approximate target graph using the greedy hill climbing algorithm.")
            target_graph_str = self.algo_runner.algo_hill_climber(incoming_df)

        # to store all the results
        results_dict = dict()
        all_results_df = pd.DataFrame()

        # this is just so we can pass it back
        returned_target_graph = None

        for i in range(feature_high, feature_low, -1):

            if verbose:
                print("-----------------------------------------------")
                print("Starting analysis with {0} features...".format(i))

            # get current run results
            result_obj, results_df, returned_target_graph = self.run_analysis(
                incoming_df,
                target_graph_str=target_graph_str,
                n_features=i,
                feature_selection_list=feature_selection_list,
                algorithm_list=algorithm_list,
                pc=pc,
                verbose=verbose)

            results_dict[i] = (result_obj, results_df)
            all_results_df = pd.concat([all_results_df, results_df], ignore_index=True)

            if verbose:
                print("Completed analysis with {0} features...".format(i))

        # now we need to figure out the lowest SHD
        shd_results_dict = dict()

        # get the minimum shd for each run
        for k, v in results_dict.items():
            # dict holds a tuple, second value is df
            results_df = v[1]
            minimum_shd = results_df['SHD'].min()
            shd_results_dict[k] = minimum_shd

        # sort the shd
        shd_tuple_list = sorted(shd_results_dict.items(), key=lambda x: x[1])

        # first results are the best, first value in tuple is feature no / index
        i = shd_tuple_list[0][0]

        if verbose:
            print("All done!")
            print("The results with the lowest SHD have been returned.")

        # return the results from the results dict
        # --> results_obj, result_df, target_graph, all_results_df
        return results_dict[i][0], results_dict[i][1], returned_target_graph, all_results_df

    def run_analysis(self,
                     incoming_df,
                     target_graph_str=None,
                     n_features=None,
                     feature_selection_list=None,
                     algorithm_list=None,
                     pc=None,
                     verbose=True):
        """
        Runs the entire analysis with feature selection and causal discovery.
        Takes a specific number of features to return.

        :param incoming_df: dataframe
        :param target_graph_str: string in dot format
        :param n_features: number of features to select (defaults to all if None supplied)
        :param feature_selection_list: list of feature selection algorithms
        :param algorithm_list: list of causal discovery algorithms
        :param pc: py-causal object for java vm communication
        :param verbose: verbose boolean
        :return: tuple:
            (AnalysisResults obj,
            all results dataframe,
            target graph (approximated or otherwise))
        """

        if n_features is None:
            # just default to number of features in dataframe
            n_features = len(list(incoming_df))

        if target_graph_str is None:
            # no target graph has been supplied, so let's create an approximation
            # using the hill climbing algorithm
            if verbose:
                print("No target graph has been supplied.")
                print("The system will generate an approximate target graph using the greedy hill climbing algorithm.")
            target_graph_str = self.algo_runner.algo_hill_climber(incoming_df)

        feature_selection_list = self._get_feature_selection_algorithms(feature_selection_list)

        amalgamated_analysis_results = []

        for feature_selection in feature_selection_list:
            # get the actual function
            feature_func = feature_selection[1]

            # get the feature list from the function
            features = feature_func(incoming_df, n_features)

            if verbose:
                print("Running causal discovery on features selected by {0}".format(feature_selection[0]))

            # get the reduced dataframe
            df_reduced, requested_features = self.get_reduced_dataframe(incoming_df, features)

            # check to see if this reduced dataframe has introduced unobserved latent edges
            latent_edges = []
            latent_edges.extend(self.algo_runner.algo_miic(df_reduced))

            if verbose:
                print("There are {0} latent edges in the reduced dataset".format(len(latent_edges)))

            analysis_results = self._run_causal_algorithms(df_reduced,
                                                           feature_selection_method=feature_selection[0],
                                                           requested_features=requested_features,
                                                           n_features=n_features,
                                                           target_graph_str=target_graph_str,
                                                           algorithm_list=algorithm_list,
                                                           latent_edges=latent_edges,
                                                           pc=pc,
                                                           verbose=verbose)

            if verbose:
                print("Completed causal discovery on features selected by {0}".format(feature_selection[0]))

            amalgamated_analysis_results.append(analysis_results)

        if verbose:
            print("Completed analysis.")

        # we need to flatten all the results
        amalgamated_list_of_dicts = []
        final_results = []
        for results in amalgamated_analysis_results:
            for result in results.results:
                # append as dict for the dataframe output
                amalgamated_list_of_dicts.append(result.asdict())
                # flatten the results
                final_results.append(result)

        # generate the target graph for the user
        target_graph = self.graph_util.get_causal_graph_from_dot(target_graph_str)

        return final_results, pd.DataFrame(amalgamated_list_of_dicts), target_graph

    def run_causal_discovery(self, df, target_graph_str, algorithm_list, pc):
        """
        Runs the causal discovery.
        :param df: dataframe
        :param target_graph_str: string in dot format
        :param algorithm_list: list of causal discovery algorithms
        :param pc: py-causal object for java vm communication
        :return: tuple:
            (AnalysisResults obj,
            all results dataframe)
        """
        analysis_results = self._run_causal_algorithms(df,
                                                       target_graph_str=target_graph_str,
                                                       algorithm_list=algorithm_list,
                                                       pc=pc)
        return analysis_results, analysis_results.to_dataframe()

    def get_reduced_dataframe(self, incoming_df, feature_indices, sample_with_gmm=False):
        """
        A wrapper call for the BayesianGaussianMixtureWrapper :)
        """
        bgmm = BayesianGaussianMixtureWrapper()
        return bgmm.get_reduced_dataframe(incoming_df, feature_indices, sample_with_gmm)

    def _run_causal_algorithms(self,
                               incoming_df,
                               requested_features=None,
                               feature_selection_method=None,
                               n_features=None,
                               algorithm_list=None,
                               target_graph_str=None,
                               latent_edges=None,
                               pc=None,
                               verbose=True):
        """
        Internal. Runs an analysis on the supplied dataframe.
        This can take a PyCausalWrapper if multiple runs are being done.
        """
        if latent_edges is None:
            latent_edges = []

        analysis_results = AnalysisResults()
        pc_supplied = True

        # get py-causal if needed
        if pc is None:
            pc_supplied = False
            pc = pycausal()
            pc.start_vm()

        algo_list = self._get_causal_algorithms(algorithm_list)

        for algo in algo_list:
            # dict to store run result
            analysis_result = SingleAnalysisResult()
            analysis_result.feature_selection_method = feature_selection_method
            analysis_result.feature_list = requested_features
            analysis_result.num_features_requested = n_features
            analysis_result.causal_algorithm = algo[0]
            analysis_result.latent_edges = latent_edges

            if verbose:
                print("Running causal discovery using {0}".format(algo[0]))

            # get the graph from the algo
            algo_fn = algo[1]
            dot_str = self._discover_graph(algo_fn, incoming_df, pc)

            # store the dot graph
            analysis_result.dot_format_string = dot_str

            # convert the causal graph
            if dot_str is not None:
                causal_graph = self.graph_util.get_causal_graph_from_dot(dot_str)
                analysis_result.causal_graph = causal_graph
                nx_graph = self.graph_util.get_digraph_from_dot(dot_str)
                analysis_result.causal_graph_with_latent_edges = \
                    self.graph_util.get_causal_graph_with_latent_edges(nx_graph, latent_edges)

            analysis_results.results.append(analysis_result)

        # shutdown the java vm if needed
        if not pc_supplied:
            pc.stop_vm()

        # filter the results
        analysis_results_filtered = self._filter_empty_results(analysis_results)

        # add the causal metrics
        updated_analysis_results = self._add_causal_metrics(analysis_results_filtered, target_graph_str)

        return updated_analysis_results

    def _discover_graph(self, algo_fn, df, pc):
        """
        Discover the graph using the supplied algorithm function.
        """
        dot_str = algo_fn(df, pc)
        return dot_str

    def _filter_empty_results(self, incoming_results):
        filtered_results = AnalysisResults()
        for result in incoming_results.results:
            if result.causal_graph is not None:
                filtered_results.results.append(result)
        return filtered_results

    def _get_feature_selection_algorithms(self, feature_selection_list):
        """
        Gets the list of feature selection algorithms to run.
        """
        algo_list = feature_selection_list
        if algo_list is None:
            algo_list = self.feature_selection.get_all_feature_selection_algorithms()
        return algo_list

    def _get_causal_algorithms(self, algorithm_list):
        """
        Gets the list of causal algorithms to run.
        """
        algo_list = algorithm_list
        if algo_list is None:
            algo_list = self.algo_runner.get_all_causal_algorithms()
        return algo_list

    def _add_causal_metrics(self, incoming_analysis_results, target_graph_str):
        """
        Provides the causal analysis results.
        """
        return_analysis_results = AnalysisResults()
        target_nxgraph = None
        if target_graph_str is not None:
            target_nxgraph = self.graph_util.get_nxgraph_from_dot(target_graph_str)

        for result in incoming_analysis_results.results:
            if result.dot_format_string is not None \
                    and result.causal_graph is not None:
                pred_graph = self.graph_util.get_nxgraph_from_dot(result.dot_format_string)
                if target_nxgraph is not None:
                    prec_recall = self.graph_metrics.precision_recall(target_nxgraph, pred_graph)[0]
                    shd = self.graph_metrics.SHD(target_nxgraph, pred_graph)
                else:
                    prec_recall = 0
                    shd = 0
                result.AUPRC = prec_recall
                result.SHD = shd
            return_analysis_results.results.append(result)
        return return_analysis_results
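For orientation, a typical call into this entry point might look like the sketch below. The CSV file name is hypothetical, and passing a pre-started py-causal instance is optional: if pc is omitted, _run_causal_algorithms starts and stops its own Java VM for each internal run.

import pandas as pd
from pycausal.pycausal import pycausal

app = App()
df = pd.read_csv("observations.csv")  # hypothetical input dataframe

# reuse a single Java VM across all runs rather than one per run
pc = pycausal()
pc.start_vm()

results, best_df, target_graph, all_results_df = app.run_analysis_with_high_low(
    df,
    feature_high=10,  # scan from 10 features...
    feature_low=5,    # ...down toward 5
    pc=pc)

pc.stop_vm()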
Example 13
    def test_retrieve_adjacency_matrix(self):
        dot_str = TargetData.simulated_data_1_graph()
        graph = GraphUtil.get_digraph_from_dot(dot_str)
        metrics = GraphMetrics()
        adj_matrix = metrics.retrieve_adjacency_matrix(graph)
        self.assertTrue(adj_matrix is not None, "No adjacency matrix created.")
Example 14
    def test_algo_miic(self):
        df = TargetData.hepar2_100_data()
        df = df.drop(['fat', 'surgery', 'gallstones'], axis=1)
        latent_edges = AlgorithmRunner.algo_miic(df)
        self.assertTrue(len(latent_edges) == 7, "Expected 7 latent edges.")