Example 1
def load_total_cpds():
    """Fit CPDs for every node of the global ``total_G`` network.

    Trains on 100 random binary samples (one column per graph node) and
    demonstrates two equivalent fitting strategies: a single
    ``get_parameters`` call, and node-by-node ``estimate_cpd`` calls.
    """
    # All the nodes in the graph (157 nodes)
    graph_nodes = total_G.nodes

    samples = pd.DataFrame(np.random.randint(low=0,
                                             high=2,
                                             size=(100, len(graph_nodes))),
                           columns=graph_nodes)

    # Option 1 of fitting cpds: estimate every CPD in one shot.
    estimator = BayesianEstimator(total_G, samples)
    for cpd in estimator.get_parameters(prior_type='BDeu',
                                        equivalent_sample_size=5):
        total_G.add_cpds(cpd)

    # Option 2 of fitting cpds: estimate node-by-node.
    for idx in range(1, num_sub_symptoms + 1):
        total_G.add_cpds(estimator.estimate_cpd('sub_sympt_' + str(idx),
                                                prior_type="BDeu"))
        if idx <= num_symptoms:
            total_G.add_cpds(estimator.estimate_cpd('sympt_' + str(idx),
                                                    prior_type="BDeu"))

    # this is the time cruncher.
    for idx in range(1, num_conditions + 1):
        total_G.add_cpds(estimator.estimate_cpd('cond_' + str(idx),
                                                prior_type="BDeu"))
Example 2
    def estimate_parameters(self):
        """Re-estimate the CPD values of LAN/MOTOR/WORLD nodes.

        Builds a BayesianEstimator over ``self.learning_data`` and overwrites
        the ``values`` of each LAN/MOTOR/WORLD node's CPD in-place with a
        BDeu estimate whose equivalent sample size equals the number of
        collected samples.
        """
        data = pd.DataFrame(data=self.learning_data)
        sample_size = len(self.learning_data)
        estimator = BayesianEstimator(self.pgmpy, data)

        # NOTE(review): a per-node dict of dirichlet pseudo-counts used to be
        # built here but was immediately overwritten and never read, so it has
        # been removed.  `pseudocount` itself is only consumed by the
        # commented-out dirichlet estimation below.
        pseudocount = [0.9, 0.9]
        if not 'BENS_1' in self.pgmpy.nodes(
        ) or not 'BENS_2' in self.pgmpy.nodes(
        ) or not 'BENS_3' in self.pgmpy.nodes():
            # presumably the remaining nodes have cardinality 3 — TODO confirm
            pseudocount = [0.9, 0.9, 0.9]
        for node in self.nodes:
            if 'LAN' in node[0] or 'MOTOR' in node[0] or 'WORLD' in node[0]:
                #self.pgmpy.get_cpds(node[0]).values = estimator.estimate_cpd(node[0], prior_type='dirichlet', pseudo_counts=pseudocount).values
                self.pgmpy.get_cpds(node[0]).values = estimator.estimate_cpd(
                    node[0],
                    prior_type='BDeu',
                    equivalent_sample_size=sample_size).values
Example 3
def distribution(excel_rows, item_name, items, file_name):
    """Estimate per-item Category CPDs and write probabilities plus 95%
    confidence intervals to an Excel workbook.

    Arguments:
        excel_rows: list whose first element is the header row; data rows are
            appended to it in place.
        item_name: grouping column, either "Monkey" or "gender".
        items: iterable of values of that column to process.
        file_name: output workbook path, without the ".xlsx" extension.

    Returns:
        pd.DataFrame built from the collected rows.

    Raises:
        ValueError: if ``item_name`` is not "Monkey" or "gender".
    """
    # Using 95% confidence interval: Z-score for (1-0.95)/2 in each tail.
    Z_score = abs(st.norm.ppf(0.025))
    alpha = 1 - 0.95

    for item in items:
        # Select the sub-sample belonging to this item.
        if item_name == "Monkey":
            df = monkey_df[monkey_df.Monkey == item]
        elif item_name == "gender":
            df = gender_df[gender_df.gender == item]
        else:
            # Previously fell through with `df` unbound (UnboundLocalError);
            # fail fast with a clear message instead.
            raise ValueError(f"Unsupported item_name: {item_name!r}")
        z = BayesianEstimator(model, df)
        cat_cpd = z.estimate_cpd('Category',
                                 prior_type="bdeu",
                                 equivalent_sample_size=0)  # .to_factor()
        for condition in conditions:
            for category in categories:
                try:
                    count = list(
                        z.state_counts('Category')
                        [condition].to_dict().values())[0][category]
                    prob = cat_cpd.get_value(**{
                        'Condition': condition,
                        'Category': category
                    })
                    # p_hat and q_hat set to a conservative 0.5 each since we
                    # have no previous data.  Since it is a probability, the
                    # lower bound is clipped to 0.
                    half_width = Z_score * math.sqrt(
                        (0.5 * 0.5) / df.shape[0])
                    lower_ci = max(prob - half_width, 0)
                    upper_ci = prob + half_width
                    if not isNaN(prob) and prob > 0:
                        excel_rows.append([
                            item, condition, category, count, prob, lower_ci,
                            upper_ci, alpha
                        ])
                except KeyError:
                    # Combination absent from this sub-sample: skip the row.
                    pass

    prob_df = pd.DataFrame.from_records(excel_rows[1:], columns=excel_rows[0])
    writer = pd.ExcelWriter(file_name + ".xlsx")
    prob_df.to_excel(writer, sheet_name='Distribution')
    # NOTE: the sheet name "prefference" (typo) is kept so existing consumers
    # of the workbook keep working.
    prob_df.sort_values('Probability', ascending=True).drop_duplicates(
        [item_name]).to_excel(writer, sheet_name='prefference')
    # ExcelWriter.save() was removed in pandas 2.0; close() saves the file
    # and also works on older pandas versions.
    writer.close()
    return prob_df
Example 4
    def estimate_parameters(self, log=True):
        ''' (5)
        Estimates the parameters of the found network
        '''
        estimator = BayesianEstimator(self.best_model, self.data)
        self.file_writer.write_txt("Number of nodes: " +
                                   str(len(self.variables_names)))
        self.file_writer.write_txt("Complete list: " +
                                   str(self.variables_names))

        # One CPD per node, estimated with a K2 prior and attached to the model.
        for variable in self.best_model.nodes():
            table = estimator.estimate_cpd(variable, prior_type='K2')
            self.best_model.add_cpds(table)
            self.log(table, log)
            self.file_writer.write_txt(str(table))
Example 5
    def learn(self, file1, file2):
        """Build a Bayesian network merging a user-supplied edge list with a
        structure learned from data, then print every node's CPT.

        Arguments:
            file1: text file whose first line encodes an edge list consumed
                by ``self.getegdes`` (method name is a typo for "get edges").
            file2: CSV file of training samples, one column per node.
        """
        # NOTE(review): f1 is never closed — consider a `with` block.
        f1 = open(file1, encoding="utf8")
        lines = f1.readlines()
        edges = self.getegdes(lines[0])
        data = pd.read_csv(file2)

        # Seed graph from the user-supplied flat pair sequence [a, b, c, d, ...].
        G = nx.DiGraph()
        for i in range(int(len(edges) / 2)):
            G.add_edge(edges[2 * i], edges[2 * i + 1])

        # Learn a second structure from the data (hill climbing + BIC score).
        est = HillClimbSearch(data, scoring_method=BicScore(data))
        model = est.estimate()
        G_ = nx.DiGraph()
        G_.add_edges_from(model.edges())

        # Merge learned edges into G, skipping any edge that would close a
        # cycle (i.e. when a path j -> i already exists in G).
        for i, j in G_.edges():
            if i not in G.nodes() or j not in G.nodes():
                G.add_edge(i, j)
            elif not nx.has_path(G, j, i):
                G.add_edge(i, j)

        new_model = BayesianModel()
        new_model.add_edges_from(G.edges)
        G = new_model.copy()

        # N = G.number_of_nodes()
        # B = np.zeros((N*(N-1)//2, N))
        # i = 0
        # y = []
        # k = 0
        # nodes = list(G.nodes._nodes.keys())
        # for i in range(len(nodes)):
        #     for j in range(i+1, len(nodes)):
        #         if nx.has_path(G, nodes[i], nodes[j]):
        #             y.append(1)
        #             B[k, i] = 1
        #             B[k, j] = -1
        #         elif nx.has_path(G, nodes[j], nodes[i]):
        #             y.append(-1)
        #             B[k, i] = 1
        #             B[k, j] = -1
        #         else:
        #             y.append(0)
        #         k += 1
        #
        # W = np.eye(N, N)
        # est = HillClimbSearch(data, scoring_method=BicScore(data))
        # model = est.estimate()
        # G_ = nx.DiGraph()
        # G_.add_edges_from(model.edges())
        # queue = []
        # for node in G_.nodes():
        #     if G_.in_degree(node) == 0:
        #         queue.append(node)
        #         G.node[node]['s'] = N
        #     else:
        #         G.node[node]['s'] = N//2
        # while len(queue)>0:
        #     now = queue[0]
        #     l = list(G_._succ[now].keys())
        #     for i in l:
        #         G.node[i]['s'] = G.node[now]['s'] - 1
        #     queue += l
        #     queue.pop(0)
        #
        # phai = []
        # for node in G.nodes():
        #     phai.append(G.node[node]['s'])
        # miu1 = np.dot(np.transpose(B), B)
        # miu1 = np.linalg.pinv(miu1)
        # miu2 = np.dot(np.transpose(B), y)
        # miu2 = miu2 + phai
        # miu = np.dot(miu1, miu2)
        #
        # seq = miu.tolist()
        # seq = list(zip(seq, nodes))
        # seq = sorted(seq, key=lambda s: s[0])
        # seq = [x[1] for x in seq]

        # nx.draw(G)
        # plt.show()
        estimator = BayesianEstimator(G, data)

        edges = []
        for i in G.edges:
            edges.append(str(i))
        print(edges)
        # For each node print: name, number of values, transposed CPT,
        # conditioning variables and their cardinalities (tab-separated).
        for i in G.nodes:
            cpd = estimator.estimate_cpd(i, prior_type="K2")
            nodeName = i
            values = dict(data[i].value_counts())
            valueNum = len(values)
            CPT = np.transpose(cpd.values)
            # CPT = cpd.values
            sequence = cpd.variables[1::]
            card = []
            for x in sequence:
                s = len(dict(data[x].value_counts()))
                card.append(s)
            output = nodeName + '\t' + str(valueNum) + '\t' + str(
                CPT.tolist()) + '\t' + str(sequence) + '\t' + str(card)
            print(output)
class Bayes_Net(Core):
    """
    Methods to read_in data and learn the structure and conditional probability tables for
    a Bayesian Network, as well as assessing the strength of the causal influence of endogenous
    variables on the target variable of interest.


    Parameters
    ----------
    target_variable: str, name of the column containing the outcome variable.

    verbose: bool, optional (default = False). Determines whether the user will get verbose status updates.

    random_seed: int, optional.


    Attributes
    ----------
    verbose: boolean
        Whether verbose mode is activated

    target_variable: string
        Name of the target variable in the dataset

    df: pd.DataFrame
        pandas dataframe of input dataset

    structure_algorithm: string
        Name of the learning structure algo that was chosen

    structure_model: pgmpy.base.DAG.DAG
        Learned DAG but without conditional probability tables estimated

    bn_model: pgmpy.models.BayesianModel
        Proper, learned Bayesian Network with conditional probability tables estimated

    odds_ratios: pd.DataFrame
        DataFrame containing odds ratios for all interventions and levels


    Methods
    ----------
    read_data: (self, file_path, **kwargs)
        Reads in a dataset. Essentially a wrapper for pandas' `read_csv` function.

    learn_structure: (self, file_path, algorithm = 'hc')
        Learns the structure of a DAG from data. Saves structure as a CSV to disk.
        Note: this is technically not a bayesian network yet, as we don't have the
        conditional probability tables estimated yet.

    plot_network: (self, file_path, **kwargs)
        Plots the Bayesian Network (highlighting target variable) and saves PNG to disk.

    plot_causal_influence: (self, file_path)
        Uses belief propagation to perform inference and calculates odds ratios for how
        changes in intervention evidence will impact the target variable. A forest plot is
        produced from this.
    """
    def __init__(self, target_variable, random_seed=0, verbose=False):
        self.verbose = verbose
        self.target_variable = target_variable
        self.random_seed = random_seed

        # Validate the params
        self._validate_init_params()

        if self.verbose:
            print("Using the following params for Bayesian Network model:")
            pprint(self.get_params(), indent=4)

    def _validate_init_params(self):
        """
        Very basic checks that the params used when instantiating Bayes_Net look okay
        """
        # Checks for target_variable
        if not isinstance(self.target_variable, str):
            raise TypeError(
                f"target_variable parameter must be a string type, but found type {type(self.target_variable)}"
            )

        # Checks for verbose
        if not isinstance(self.verbose, bool):
            raise TypeError(
                f"verbose parameter must be a boolean type, but found type {type(self.verbose)}"
            )

        # Checks for random_seed
        if not isinstance(self.random_seed, (int, type(None))):
            raise TypeError(
                f"random_seed parameter must be an int, but found type {type(self.random_seed)}"
            )

        # Zero is a valid seed; only negative values are rejected.
        # (Message fixed: it used to claim "> 0" while the check allows 0.)
        if (isinstance(self.random_seed, int)) and self.random_seed < 0:
            raise ValueError("random_seed parameter must be >= 0")

    def read_data(self, file_path, **kwargs):
        """
        Wrapper for pandas `read_csv` function. Assumes file is CSV with a header row.

        Arguments:
            file_path: str, the absolute file path to the CSV file
            **kwargs: any additional keywords for pandas' `read_csv` function

        Returns:
            None
        """
        self.df = pd.read_csv(filepath_or_buffer=file_path, **kwargs)

        # Check that target variable is in the dataset
        if self.target_variable not in self.df:
            raise ValueError(
                "The target variable you specified isn't in the dataset!")

        if self.verbose:
            print("Successfully read in CSV")

        return None

    def _cramers_v(self, x, y):
        """
        Calculates the bias-corrected Cramers V correlation between two
        categorical variables (Bergsma-Wicher correction).
        """
        confusion_matrix = pd.crosstab(x, y)
        chi2 = ss.chi2_contingency(confusion_matrix)[0]

        n = confusion_matrix.sum().sum()
        phi2 = chi2 / n
        r, k = confusion_matrix.shape
        # Bias correction; clipped at 0 so the sqrt below stays real.
        phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))

        rcorr = r - ((r - 1)**2) / (n - 1)
        kcorr = k - ((k - 1)**2) / (n - 1)

        return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))

    def _initial_filter(self):
        """
        Filters out nodes with zero correlation with target variable
        """

        relevant_vars = []

        for node in self.df.columns:
            if self._cramers_v(self.df[self.target_variable],
                               self.df[node]) > 0:
                relevant_vars.append(node)

        return self.df[relevant_vars]

    def learn_structure(self,
                        file_path,
                        algorithm="hc",
                        significance_level=0.05):
        """
        Employs `pgmpy` package's Bayesian Network structure learning algorithms to learn
        structure from a dataset. Saves a tabular version of the result as a CSV file.

        Arguments:
            algorithm: str, optional (default = 'hc')
                Determines whether the hill-climbing or Peter-Clark are employed.
                Two possible values include: 'hc', 'pc'. Note, I found a bug in pgmpy implementation
                halfway through this project. Don't use the 'pc' method.
            file_path: str, the absolute path to save the file to (e.g. "~/Desktop/BN_structure.csv")
            significance_level: float, option (default = 0.05)
                Statistical significance cutoff for use in pruning the network when using the PC
                algorithm. Lower values produce sparser networks.

        Returns:
            None
        """
        self.structure_algorithm = algorithm

        if self.verbose:
            print(
                "Depending on the number of variables in your dataset, this might take some time..."
            )

        # Learn structure, using one of the algorithms
        np.random.seed(self.random_seed)

        if algorithm == "hc":

            # Filter out columns with zero correlation with target variable
            self.filtered_df = self._initial_filter()

            # Run HC algorithm
            self.structure_model = HillClimbSearch(
                self.filtered_df,
                scoring_method=BicScore(self.filtered_df)).estimate()

            if self.verbose:
                print(
                    f"Structure learned! Saving structure to the following CSV: {file_path}"
                )

            # Eliminate isolated subgraphs: keep only the component that
            # contains the target variable.
            G = self.structure_model.to_undirected()

            connected_nodes = list(
                nx.algorithms.components.node_connected_component(
                    G, self.target_variable))

            disconnected_nodes = list(
                set(list(self.structure_model.nodes)) - set(connected_nodes))

            for node in disconnected_nodes:
                self.structure_model.remove_node(node)
                self.filtered_df.drop([node], axis=1, inplace=True)

            pd.DataFrame(
                list(self.structure_model.edges),
                columns=["from_variable", "to_variable"],
            ).to_csv(file_path, index=False)

        elif algorithm == "pc":
            self.filtered_df = self.df
            self.structure_model = ConstraintBasedEstimator(
                self.filtered_df).estimate(
                    significance_level=significance_level)

            if self.verbose:
                print(
                    f"Structure learned! Saving structure to the following CSV: {file_path}"
                )

            pd.DataFrame(
                list(self.structure_model.edges),
                columns=["from_variable", "to_variable"],
            ).to_csv(file_path, index=False)

    def plot_network(self, file_path, **kwargs):
        """
        Plots the learned structure, highlighting the target variable.

        Arguments:
            file_path: str, the absolute path to save the file to (e.g. "~/Desktop/plot.png")
            **kwargs: additional keyword arguments for networkx's draw function

        Returns:
            None
        """
        if self.verbose:
            print(
                f"Saving Bayesian Network plot to the following PNG file: {file_path}"
            )

        # Identify target variable so we can highlight it in the plot
        target_index = list(self.structure_model).index(self.target_variable)
        node_size_list = [300] * len(list(self.structure_model.nodes))
        node_color_list = ["#95ABDF"] * len(list(self.structure_model.nodes))
        node_size_list[target_index] = 1500
        node_color_list[target_index] = "#F09A9A"

        # Clear any existing pyplot fig, create plot, and save to disk
        plt.clf()
        nx.draw(
            self.structure_model,
            node_size=node_size_list,
            node_color=node_color_list,
            with_labels=True,
            **kwargs,
        )
        plt.savefig(expanduser(file_path), format="PNG", dpi=300)

    def _estimate_CPT(self):
        """
        Estimates the conditional probability tables associated with each node in the
        Bayesian Network.
        """

        self.bn_model = BayesianModel(list(self.structure_model.edges))
        self.cpt_model = BayesianEstimator(self.bn_model, self.filtered_df)

        for node in list(self.bn_model.nodes):
            self.bn_model.add_cpds(self.cpt_model.estimate_cpd(node))

    def plot_causal_influence(self, file_path):
        """
        Computes the odds of the target variable being value 1 over value 0 (i.e. the odds ratio)
        by iterating through all other network variables/nodes, changing their values,
        and observing how the probability of the target variable changes. Belief propagation
        is used for inference. A forest plot is produced from this and saved to disk.

        Arguments:
            file_path: str, the absolute path to save the file to (e.g. "~/Desktop/forest_plot.png")

        Returns:
            None
        """

        # Estimate CPTs
        self._estimate_CPT()

        if self.verbose:
            print("Calculating influence of all nodes on target node")

        if not self.bn_model.check_model():
            print("""
                There is a problem with your network structure. You have disconnected nodes
                or separated sub-networks. Please examine your network plot and re-learn your
                network structure with tweaked settings.
                """)
            return None

        if self.target_variable not in self.bn_model.nodes:
            print("""
                Your target variable has no parent nodes! Can't perform inference! Please examine
                your network plot and re-learn your network structure with tweaked settings.
                """)
            return None

        # Prep for belief propagation
        belief_propagation = BeliefPropagation(self.bn_model)
        belief_propagation.calibrate()

        # Iterate over all intervention nodes and values, calculating odds ratios w.r.t target variable
        overall_dict = {}

        # Bug fix: `set(list(self.target_variable))` split the target name
        # into individual characters; we must subtract the name itself.
        variables_to_test = list(
            set(list(self.bn_model.nodes)) - {self.target_variable})

        for node in variables_to_test:
            results = []
            for value in self.filtered_df[node].unique():
                prob = belief_propagation.query(
                    variables=[self.target_variable],
                    evidence={
                        node: value
                    },
                    show_progress=False,
                ).values
                results.append([node, value, prob[0], prob[1]])

            results_df = pd.DataFrame(
                results,
                columns=["node", "value", "probability_0", "probability_1"])
            results_df["odds_1"] = (results_df["probability_1"] /
                                    results_df["probability_0"])
            results_df = results_df.sort_values(
                "value", ascending=True, inplace=False).reset_index(drop=True)

            overall_dict[node] = results_df

        final_df_list = []

        # The first (lowest) value of each node is the reference level.
        for node, temp_df in overall_dict.items():
            first_value = temp_df["odds_1"].iloc[0]
            temp_df["odds_ratio"] = (temp_df["odds_1"] / first_value).round(3)
            final_df_list.append(temp_df)

        final_df = pd.concat(final_df_list)[["node", "value", "odds_ratio"]]
        self.odds_ratios = final_df

        if self.verbose:
            print(f"Saving forest plot to the following PNG file: {file_path}")

        # Clean up the dataframe of odds ratios so plot can have nice labels.
        # (Chained assignments replaced with .loc / reassignment so the writes
        # are guaranteed to hit final_df2/final_df3, not a copy.)
        final_df2 = (pd.concat([
            final_df,
            final_df.groupby("node")["value"].apply(
                lambda x: x.shift(-1).iloc[-1]).reset_index(),
        ]).sort_values(by=["node", "value"],
                       ascending=True).reset_index(drop=True))
        final_df2.loc[final_df2["value"].isnull(), "node"] = np.nan
        final_df2["value"] = final_df2["value"].astype("Int32").astype(str)
        final_df2["value"] = final_df2["value"].replace({np.nan: ""})
        final_df3 = final_df2.reset_index(drop=True).reset_index()
        final_df3.rename(columns={"index": "vertical_index"}, inplace=True)
        final_df3["y_label"] = final_df3["node"] + " = " + final_df3["value"]
        final_df3.loc[final_df3["odds_ratio"] == 1.0, "y_label"] = (
            final_df3["y_label"] + " (ref)")
        final_df3["y_label"] = final_df3["y_label"].fillna("")

        # Produce large plot
        plt.clf()
        plt.title(
            "Strength of Associations Between Interventions and Target Variable"
        )
        plt.scatter(
            x=final_df3["odds_ratio"],
            y=final_df3["vertical_index"],
            s=70,
            color="b",
            alpha=0.5,
        )
        plt.xlabel("Odds Ratio")
        plt.axvline(x=1.0, color="red", linewidth="1.5", linestyle="--")
        plt.yticks(final_df3["vertical_index"], final_df3["y_label"])

        for _, row in final_df3.iterrows():
            if not np.isnan(row["odds_ratio"]):
                plt.plot(
                    [0, row["odds_ratio"]],
                    [row["vertical_index"], row["vertical_index"]],
                    color="black",
                    linewidth="0.4",
                )

        plt.xlim([0, final_df3["odds_ratio"].max() + 1])

        figure = plt.gcf()
        figure.set_size_inches(12, 7)

        plt.savefig(expanduser(file_path),
                    bbox_inches="tight",
                    format="PNG",
                    dpi=300)
class TestBayesianEstimator(unittest.TestCase):
    """Unit tests for pgmpy's BayesianEstimator on a small A -> C <- B model."""

    def setUp(self):
        self.m1 = BayesianModel([('A', 'C'), ('B', 'C')])
        self.d1 = pd.DataFrame(data={'A': [0, 0, 1], 'B': [0, 1, 0], 'C': [1, 1, 0]})
        self.d2 = pd.DataFrame(data={'A': [0, 0, 1, 0, 2, 0, 2, 1, 0, 2],
                                     'B': ['X', 'Y', 'X', 'Y', 'X', 'Y', 'X', 'Y', 'X', 'Y'],
                                     'C': [1, 1, 1, 0, 0, 0, 0, 0, 0, 0]})
        self.est1 = BayesianEstimator(self.m1, self.d1)
        self.est2 = BayesianEstimator(self.m1, self.d1, state_names={'A': [0, 1, 2],
                                                                     'B': [0, 1],
                                                                     'C': [0, 1, 23]})
        self.est3 = BayesianEstimator(self.m1, self.d2)

    def test_estimate_cpd_dirichlet(self):
        cpd_A = self.est1.estimate_cpd('A',  prior_type="dirichlet", pseudo_counts=[0, 1])
        self.assertEqual(cpd_A, TabularCPD('A', 2, [[0.5], [0.5]]))

        cpd_B = self.est1.estimate_cpd('B',  prior_type="dirichlet", pseudo_counts=[9, 3])
        self.assertEqual(cpd_B, TabularCPD('B', 2, [[11.0/15], [4.0/15]]))

        cpd_C = self.est1.estimate_cpd('C',  prior_type="dirichlet", pseudo_counts=[0.4, 0.6])
        self.assertEqual(cpd_C, TabularCPD('C', 2, [[0.2, 0.2, 0.7, 0.4],
                                                    [0.8, 0.8, 0.3, 0.6]],
                                           evidence=['A', 'B'], evidence_card=[2, 2]))

    def test_estimate_cpd_improper_prior(self):
        # np.nan is used instead of the np.NaN alias removed in NumPy 2.0.
        cpd_C = self.est1.estimate_cpd('C',  prior_type="dirichlet", pseudo_counts=[0, 0])
        cpd_C_correct = (TabularCPD('C', 2, [[0.0, 0.0, 1.0, np.nan],
                                             [1.0, 1.0, 0.0, np.nan]],
                                    evidence=['A', 'B'], evidence_card=[2, 2],
                                    state_names={'A': [0, 1], 'B': [0, 1], 'C': [0, 1]}))
        # manual comparison because np.nan != np.nan
        self.assertTrue(((cpd_C.values == cpd_C_correct.values) |
                         np.isnan(cpd_C.values) & np.isnan(cpd_C_correct.values)).all())

    def test_estimate_cpd_shortcuts(self):
        cpd_C1 = self.est2.estimate_cpd('C',  prior_type='BDeu', equivalent_sample_size=9)
        cpd_C1_correct = TabularCPD('C', 3, [[0.2, 0.2, 0.6, 1./3, 1./3, 1./3],
                                             [0.6, 0.6, 0.2, 1./3, 1./3, 1./3],
                                             [0.2, 0.2, 0.2, 1./3, 1./3, 1./3]],
                                    evidence=['A', 'B'], evidence_card=[3, 2])
        self.assertEqual(cpd_C1, cpd_C1_correct)

        cpd_C2 = self.est3.estimate_cpd('C',  prior_type='K2')
        cpd_C2_correct = TabularCPD('C', 2, [[0.5, 0.6, 1./3, 2./3, 0.75, 2./3],
                                             [0.5, 0.4, 2./3, 1./3, 0.25, 1./3]],
                                    evidence=['A', 'B'], evidence_card=[3, 2])
        self.assertEqual(cpd_C2, cpd_C2_correct)

    def test_get_parameters(self):
        cpds = set([self.est3.estimate_cpd('A'),
                    self.est3.estimate_cpd('B'),
                    self.est3.estimate_cpd('C')])
        self.assertSetEqual(set(self.est3.get_parameters()), cpds)

    def test_get_parameters2(self):
        pseudo_counts = {'A': [1, 2, 3], 'B': [4, 5], 'C': [6, 7]}
        cpds = set([self.est3.estimate_cpd('A', prior_type="dirichlet", pseudo_counts=pseudo_counts['A']),
                    self.est3.estimate_cpd('B', prior_type="dirichlet", pseudo_counts=pseudo_counts['B']),
                    self.est3.estimate_cpd('C', prior_type="dirichlet", pseudo_counts=pseudo_counts['C'])])
        self.assertSetEqual(set(self.est3.get_parameters(prior_type="dirichlet",
                                                         pseudo_counts=pseudo_counts)), cpds)

    def tearDown(self):
        del self.m1
        del self.d1
        del self.d2
        del self.est1
        del self.est2
        # Was missing: est3 is created in setUp too.
        del self.est3
# Example: parameter estimation with pgmpy on a fruit/tasty toy model.
# NOTE(review): assumes `model` and `data` are already defined above this
# snippet — they are redefined with a fresh example further down.
from pgmpy.estimators import MaximumLikelihoodEstimator

# Maximum Likelihood Estimation of individual CPDs.
mle = MaximumLikelihoodEstimator(model, data)
print(mle.estimate_cpd("fruit"))  # unconditional
print(" −−−−−−−−−−−−−−−−−−−−−− ")
print(mle.estimate_cpd("tasty"))  # conditional
print(" −−−−−−−−−−−−−−−−−−−−−− ")

# Calibrate all CPDs of ‘model' using MLE:
model.fit(data, estimator=MaximumLikelihoodEstimator)
# Bayesian Parameter Estimation
print("−−−−− Bayesian Parameter Estimation −−−−−−−−−−−−−−")
from pgmpy.estimators import BayesianEstimator

est = BayesianEstimator(model, data)
print(est.estimate_cpd("tasty", prior_type="BDeu", equivalent_sample_size=10))
print(" −−−−−−−−−−−−−−−−−−−−−− ")

# BayesianEstimator , too , can be used via the fit()−method . Full example :
import numpy as np
import pandas as pd
from pgmpy.models import BayesianModel
from pgmpy.estimators import BayesianEstimator
# generate data
data = pd.DataFrame(np.random.randint(low=0, high=2, size=(5000, 4)),
                    columns=["A", "B", "C", "D"])
model = BayesianModel([("A", "B"), ("A", "C"), ("D", "C"), ("B", "D")])
model.fit(data, estimator=BayesianEstimator, prior_type="BDeu"
          )  # default equivalent_sample_size=5
for cpd in model.get_cpds():
    print(cpd)
class TestBayesianEstimator(unittest.TestCase):
    """Unit tests for pgmpy's BayesianEstimator parameter learning.

    The expected CPD values below are exact hand-computed results for the
    tiny fixtures built in setUp; do not change them casually.
    """

    def setUp(self):
        # m1: collider structure A -> C <- B.
        self.m1 = BayesianModel([('A', 'C'), ('B', 'C')])
        # d1: three fully-observed binary samples.
        self.d1 = pd.DataFrame(data={
            'A': [0, 0, 1],
            'B': [0, 1, 0],
            'C': [1, 1, 0]
        })
        # d2: ten samples; A takes three values, B is categorical ('X'/'Y').
        self.d2 = pd.DataFrame(
            data={
                'A': [0, 0, 1, 0, 2, 0, 2, 1, 0, 2],
                'B': ['X', 'Y', 'X', 'Y', 'X', 'Y', 'X', 'Y', 'X', 'Y'],
                'C': [1, 1, 1, 0, 0, 0, 0, 0, 0, 0]
            })
        self.est1 = BayesianEstimator(self.m1, self.d1)
        # est2: same data as est1 but with extra states declared that never
        # occur in d1 (A=2, C=23), to exercise state_names handling.
        self.est2 = BayesianEstimator(self.m1,
                                      self.d1,
                                      state_names={
                                          'A': [0, 1, 2],
                                          'B': [0, 1],
                                          'C': [0, 1, 23]
                                      })
        self.est3 = BayesianEstimator(self.m1, self.d2)

    def test_estimate_cpd_dirichlet(self):
        """Dirichlet pseudo-counts are added to observed counts, then normalised."""
        cpd_A = self.est1.estimate_cpd('A',
                                       prior_type="dirichlet",
                                       pseudo_counts=[[0], [1]])
        self.assertEqual(cpd_A, TabularCPD('A', 2, [[0.5], [0.5]]))

        cpd_B = self.est1.estimate_cpd('B',
                                       prior_type="dirichlet",
                                       pseudo_counts=[[9], [3]])
        # (2 observed + 9) / (3 observed + 12 pseudo) and (1 + 3) / 15.
        self.assertEqual(cpd_B, TabularCPD('B', 2, [[11.0 / 15], [4.0 / 15]]))

        cpd_C = self.est1.estimate_cpd('C',
                                       prior_type="dirichlet",
                                       pseudo_counts=[[0.4, 0.4, 0.4, 0.4],
                                                      [0.6, 0.6, 0.6, 0.6]])
        self.assertEqual(
            cpd_C,
            TabularCPD('C',
                       2, [[0.2, 0.2, 0.7, 0.4], [0.8, 0.8, 0.3, 0.6]],
                       evidence=['A', 'B'],
                       evidence_card=[2, 2]))

    def test_estimate_cpd_improper_prior(self):
        """All-zero pseudo-counts leave unseen parent configurations as NaN."""
        cpd_C = self.est1.estimate_cpd('C',
                                       prior_type="dirichlet",
                                       pseudo_counts=[[0, 0, 0, 0],
                                                      [0, 0, 0, 0]])
        cpd_C_correct = (TabularCPD(
            'C',
            2, [[0.0, 0.0, 1.0, np.NaN], [1.0, 1.0, 0.0, np.NaN]],
            evidence=['A', 'B'],
            evidence_card=[2, 2],
            state_names={
                'A': [0, 1],
                'B': [0, 1],
                'C': [0, 1]
            }))
        # manual comparison because np.NaN != np.NaN
        self.assertTrue(
            ((cpd_C.values == cpd_C_correct.values)
             | np.isnan(cpd_C.values) & np.isnan(cpd_C_correct.values)).all())

    def test_estimate_cpd_shortcuts(self):
        """'BDeu' and 'K2' prior shortcuts must expand to the right dirichlet."""
        cpd_C1 = self.est2.estimate_cpd('C',
                                        prior_type='BDeu',
                                        equivalent_sample_size=9)
        cpd_C1_correct = TabularCPD('C',
                                    3,
                                    [[0.2, 0.2, 0.6, 1. / 3, 1. / 3, 1. / 3],
                                     [0.6, 0.6, 0.2, 1. / 3, 1. / 3, 1. / 3],
                                     [0.2, 0.2, 0.2, 1. / 3, 1. / 3, 1. / 3]],
                                    evidence=['A', 'B'],
                                    evidence_card=[3, 2])
        self.assertEqual(cpd_C1, cpd_C1_correct)

        cpd_C2 = self.est3.estimate_cpd('C', prior_type='K2')
        cpd_C2_correct = TabularCPD('C',
                                    2,
                                    [[0.5, 0.6, 1. / 3, 2. / 3, 0.75, 2. / 3],
                                     [0.5, 0.4, 2. / 3, 1. / 3, 0.25, 1. / 3]],
                                    evidence=['A', 'B'],
                                    evidence_card=[3, 2])
        self.assertEqual(cpd_C2, cpd_C2_correct)

    def test_get_parameters(self):
        """Default get_parameters must equal per-node estimate_cpd results."""
        cpds = set([
            self.est3.estimate_cpd('A'),
            self.est3.estimate_cpd('B'),
            self.est3.estimate_cpd('C')
        ])
        self.assertSetEqual(set(self.est3.get_parameters()), cpds)

    def test_get_parameters2(self):
        """get_parameters must forward per-node dirichlet pseudo-counts."""
        pseudo_counts = {
            'A': [[1], [2], [3]],
            'B': [[4], [5]],
            'C': [[6, 6, 6, 6, 6, 6], [7, 7, 7, 7, 7, 7]]
        }
        cpds = set([
            self.est3.estimate_cpd('A',
                                   prior_type="dirichlet",
                                   pseudo_counts=pseudo_counts['A']),
            self.est3.estimate_cpd('B',
                                   prior_type="dirichlet",
                                   pseudo_counts=pseudo_counts['B']),
            self.est3.estimate_cpd('C',
                                   prior_type="dirichlet",
                                   pseudo_counts=pseudo_counts['C'])
        ])
        self.assertSetEqual(
            set(
                self.est3.get_parameters(prior_type="dirichlet",
                                         pseudo_counts=pseudo_counts)), cpds)

    def tearDown(self):
        # NOTE(review): self.est3 is created in setUp but not deleted here —
        # confirm whether that is intentional.
        del self.m1
        del self.d1
        del self.d2
        del self.est1
        del self.est2
    return links

# Build the network structure from the data columns; CreateLinks,
# data_columns and data are defined earlier in the file.
links = CreateLinks(data_columns)
model = BayesianModel(links)

# Raw state counts (no smoothing) for inspection.
pe = ParameterEstimator(model, data)

# Print ParameterEstimator unconditional
pe_symptom1 = pe.state_counts('Symptom_1')
print(pe_symptom1)

# Print ParameterEstimator conditional disease
pe_disease = pe.state_counts('Disease')
print(pe_disease)

mle = MaximumLikelihoodEstimator(model, data)

# Print MaximumLikelihoodEstimator unconditional
mle_symptom1 = mle.estimate_cpd('Symptom_1')
print(mle_symptom1)

# Print MaximumLikelihoodEstimator conditional
#mle_disease = mle.estimate_cpd('Disease')
#print(mle_disease)

# Calibrate all CPDs of `model` using MLE:
model.fit(data, estimator=MaximumLikelihoodEstimator)

# Re-estimate 'Disease' with a BDeu prior for comparison against the MLE CPD.
est = BayesianEstimator(model, data)
est_disease = est.estimate_cpd('Disease', prior_type='BDeu', equivalent_sample_size=10)
print(est_disease)
Esempio n. 11
0
def task4():
	"""Fit five candidate Bayesian networks over features f1..f9 and store
	the one with the highest K2 score in the global ``task4_best_bm``.

	Model 1's structure is learned with HillClimbSearch; models 2-5 are
	hand-specified variations.  For every model the CPDs are fitted with a
	K2 (uniform Dirichlet) prior, the model is appended to the global
	``task4_bms`` list, and its K2 score is computed on 1000 samples drawn
	from the model itself.  The original code repeated this sequence five
	times verbatim; it is factored into ``fit_and_score`` below, with all
	printed output preserved byte-for-byte.
	"""
	global andRawData, task4_best_bm
	features = ['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9']
	andRawData_temp = pd.DataFrame(andRawData.values, columns=features)
	k2Scores = []

	def fit_and_score(model_temp, num, desc):
		# Fit one CPD per feature with a K2 prior, register the model in the
		# global list and return its self-sampled K2 score.
		estimator = BayesianEstimator(model_temp, andRawData_temp)
		for fx in features:
			model_temp.add_cpds(estimator.estimate_cpd(fx, prior_type="K2"))
		task4_bms.append(model_temp)
		print("	Model " + str(num) + ": " + desc + " is : " + str(model_temp.edges()))
		k2Score = K2Score((BayesianModelSampling(model_temp)).forward_sample(size=1000))
		score = k2Score.score(model_temp)
		print("	Model " + str(num) + ": K2 Accuracy Score is " + str(score))
		return score

	# Model 1: structure learned automatically via HillClimbSearch.
	est = HillClimbSearch(andRawData_temp, scoring_method=K2Score(andRawData_temp))
	k2Scores.append(fit_and_score(est.estimate(), 1, "Model through HillClimbSearch"))
	# Models 2-4: manual variations of the HillClimbSearch structure.
	manual_edge_sets = [
		[('f3', 'f4'), ('f4', 'f9'), ('f3', 'f8'), ('f1', 'f7'), ('f5', 'f3'), ('f9', 'f8'), ('f1', 'f6'), ('f9', 'f1'), ('f9', 'f6'), ('f9', 'f2')],
		[('f3', 'f4'), ('f4', 'f9'), ('f3', 'f8'), ('f5', 'f7'), ('f5', 'f3'), ('f9', 'f8'), ('f1', 'f2'), ('f9', 'f1'), ('f9', 'f6'), ('f9', 'f2')],
		[('f3', 'f4'), ('f4', 'f9'), ('f5', 'f7'), ('f5', 'f3'), ('f1', 'f2'), ('f9', 'f1'), ('f9', 'f6'), ('f9', 'f8')],
	]
	for offset, edges in enumerate(manual_edge_sets):
		k2Scores.append(fit_and_score(BayesianModel(edges), 2 + offset,
		                              "Manual Model based on HillClimbSearch"))
	# Model 5: manual structure based on intuition (original printed message
	# said "HillClimbSearch"; kept for output compatibility).
	intuition_edges = [('f3', 'f4'), ('f4', 'f9'), ('f4', 'f7'), ('f1', 'f2'), ('f8', 'f5'), ('f9', 'f6'), ('f9', 'f8')]
	k2Scores.append(fit_and_score(BayesianModel(intuition_edges), 5,
	                              "Manual Model based on HillClimbSearch"))
	# NOTE(review): as in the original, this indexes the *global* task4_bms by
	# the local score index — correct only if task4_bms was empty on entry.
	best = k2Scores.index(max(k2Scores))
	task4_best_bm = task4_bms[best]
	print("	Best Bayesian Model with the highest accuracy score is thus Model " + str(1 + best))
print(best_model.edges())
# Reviewing the learned structure shows that the search recovers the links
# but not their directions.
# The model with the correct edge directions would therefore be:
bon_model = BayesianModel([('Cancer', 'TbOuCa'), ('TbOuCa', 'Dyspnea'),
                           ('TbOuCa', 'Bronchite'), ('TbOuCa', 'Radiographie'),
                           ('Fumeur', 'Bronchite'),
                           ('Radiographie', 'Dyspnea'),
                           ('Tuberculose', 'TbOuCa'),
                           ('Bronchite', 'Dyspnea')])

# Parameter learning.
# NOTE(review): parameters are fitted on `best_model`, not on the corrected
# `bon_model` defined above — confirm this is intentional.
#print("estimation des cpds :")
from pgmpy.estimators import BayesianEstimator

est = BayesianEstimator(best_model, data)
print(est.estimate_cpd('Cancer', prior_type='BDeu', equivalent_sample_size=10))

best_model.fit(data, estimator=BayesianEstimator, prior_type='BDeu')
#for cpd in best_model.get_cpds():
#	print(cpd)

# Profile of the people who have cancer: query every other variable
# conditioned on Cancer being true.
model_infer = VariableElimination(best_model)
q = model_infer.query(variables=[
    'Age', 'Fumeur', 'Tuberculose', 'VisiteAsie', 'Radiographie', 'Bronchite',
    'Dyspnea', 'Geographie', 'TbOuCa'
],
                      evidence={'Cancer': 2})  # 0 = ? , 1=False, 2=True
print("Caratéristiques des personnes ayant le cancer :")
#print(q['Age'])
print(q['Fumeur'])
Esempio n. 13
0
def main():
	"""Train and evaluate a handwriting-pair Bayesian network on the
	seen / shuffled / unseen data splits.

	For each split: the left ('f'-prefixed) and right ('g'-prefixed) image
	feature vectors are joined with the pair label, a copy of the common
	base structure is fitted on the training rows, and MAP inference over
	'label' is scored on both training and validation rows.

	Bug fixes versus the original:
	  * the result of ``DataFrame.replace([inf, -inf], nan)`` was discarded,
	    so infinite values survived the subsequent ``dropna``;
	  * g-feature evidence read row ``i + 15`` instead of column
	    ``index + 15`` (columns 15-29 hold the right-image features);
	  * in the shuffled/unseen validation loops the prediction statements
	    had fallen out of the row loop, so only one row was ever predicted.
	"""
	featureNamesList = ["pen_pressure", "letter_spacing", "size", "dimension", "is_lowercase", "is_continuous", "slantness", "tilt", "entry_stroke_a", "staff_of_a", "formation_n", "staff_of_d", "exit_stroke_d", "word_formation", "constancy"]
	# Fetching features data
	features_data = pd.read_csv(fileloc_features)
	features_data_f = features_data.add_prefix('f')
	features_data_g = features_data.add_prefix('g')

	def load_split(fileloc):
		# Join the (left, right, label) pair list with both images' feature
		# vectors; columns 0-14 are f-features, 15-29 g-features, 30 label.
		# Returns the cleaned int DataFrame and its ndarray view.
		pairs = pd.read_csv(fileloc, usecols=['left', 'right', 'label'])
		merged_f = pairs.merge(features_data_f, left_on='left', right_on='fimagename')
		merged_g = pairs.merge(features_data_g, left_on='right', right_on='gimagename')
		merged_f = merged_f.drop(['left', 'right', 'fimagename', 'label'], axis=1)
		merged_g = merged_g.drop(['left', 'right', 'gimagename', 'label'], axis=1)
		final = pd.concat([merged_f, merged_g, pairs.loc[:, 'label']], axis=1)
		# BUG FIX: keep the result of replace() so dropna removes inf rows.
		final = final.replace([np.inf, -np.inf], np.nan)
		final.dropna(inplace=True)
		final = final.astype(int)
		return final, final.values

	# Learn a structure over the raw features once (result only inspected).
	initial_hcs = HillClimbSearch(features_data[featureNamesList])
	initial_model = initial_hcs.estimate()
	#print(initial_model.edges())
	print("Hill Climb Done")
	# Hand-tuned base structure: identical sub-networks over the f- and
	# g-features, both feeding the shared 'label' node.
	f_edges = [('fpen_pressure', 'fis_lowercase'), ('fpen_pressure', 'fletter_spacing'), ('fsize', 'fslantness'), ('fsize', 'fpen_pressure'),
				('fsize', 'fstaff_of_d'), ('fsize', 'fletter_spacing'), ('fsize', 'fexit_stroke_d'), ('fsize', 'fentry_stroke_a'),
				('fdimension', 'fsize'), ('fdimension', 'fis_continuous'), ('fdimension', 'fslantness'), ('fdimension', 'fpen_pressure'),
				('fis_lowercase', 'fstaff_of_a'), ('fis_lowercase', 'fexit_stroke_d'), ('fis_continuous', 'fexit_stroke_d'), ('fis_continuous', 'fletter_spacing'),
				('fis_continuous', 'fentry_stroke_a'), ('fis_continuous', 'fstaff_of_a'), ('fis_continuous', 'fis_lowercase'), ('fslantness', 'fis_continuous'),
				('fslantness', 'ftilt'), ('fentry_stroke_a', 'fpen_pressure'), ('fformation_n', 'fconstancy'), ('fformation_n', 'fword_formation'), ('fformation_n', 'fdimension'),
				('fformation_n', 'fstaff_of_d'), ('fformation_n', 'fis_continuous'), ('fformation_n', 'fsize'), ('fformation_n', 'fstaff_of_a'), ('fstaff_of_d', 'fis_continuous'),
				('fstaff_of_d', 'fexit_stroke_d'), ('fstaff_of_d', 'fis_lowercase'), ('fstaff_of_d', 'fslantness'), ('fstaff_of_d', 'fentry_stroke_a'),
				('fword_formation', 'fdimension'), ('fword_formation', 'fstaff_of_a'), ('fword_formation', 'fsize'), ('fword_formation', 'fstaff_of_d'),
				('fword_formation', 'fconstancy'), ('fconstancy', 'fstaff_of_a'), ('fconstancy', 'fletter_spacing'), ('fconstancy', 'fdimension')]
	# The g-side sub-network mirrors the f-side exactly; derive it instead
	# of repeating 42 tuples.
	g_edges = [('g' + a[1:], 'g' + b[1:]) for a, b in f_edges]
	label_edges = [('fis_continuous', 'label'), ('fword_formation', 'label'),
					('gis_continuous', 'label'), ('gword_formation', 'label')]
	basemodel = BayesianModel(f_edges + g_edges + label_edges)

	def train(model, traindata):
		# Fit the structure, then attach BayesianEstimator CPDs for every
		# node (mirrors the original fit + add_cpds pipeline).
		model.fit(traindata)
		estimator = BayesianEstimator(model, traindata)
		cpds = []
		for featureName in featureNamesList:
			cpds.append(estimator.estimate_cpd('f' + featureName))
			cpds.append(estimator.estimate_cpd('g' + featureName))
		cpds.append(estimator.estimate_cpd('label'))
		model.add_cpds(*cpds)
		print("CPDs Calculated")
		return model

	def accuracy(model, ndarray):
		# Percentage of rows whose MAP-inferred label matches column 30.
		# Feature values are shifted by -1 to match the CPD state indices,
		# as in the original code.
		ve = VariableElimination(model)
		correct = 0
		for i in range(ndarray.shape[0]):
			evidence = {}
			for index, featureName in enumerate(featureNamesList):
				evidence['f' + featureName] = ndarray[i, index] - 1
				# BUG FIX: g-features are columns 15-29 of the SAME row.
				evidence['g' + featureName] = ndarray[i, index + 15] - 1
			pred = ve.map_query(variables=['label'], evidence=evidence)
			if int(pred['label']) == int(ndarray[i, 30]):
				correct += 1
		return correct / ndarray.shape[0] * 100

	accuracies = {}
	splits = [
		("seen", "Seen", fileloc_seen_training, fileloc_seen_validation),
		("shuffled", "Shuffled", fileloc_shuffled_training, fileloc_shuffled_validation),
		("unseen", "Unseen", fileloc_unseen_training, fileloc_unseen_validation),
	]
	for key, label, train_loc, valid_loc in splits:
		traindata, train_nd = load_split(train_loc)
		_, valid_nd = load_split(valid_loc)
		model = train(basemodel.copy(), traindata)
		accuracies[key + "_train"] = accuracy(model, train_nd)
		print("Bayesian Model Accuracy for " + label + " Training Data = " + str(accuracies[key + "_train"]))
		accuracies[key + "_validation"] = accuracy(model, valid_nd)
		print("Bayesian Model Accuracy for " + label + " Validation Data = " + str(accuracies[key + "_validation"]))
Esempio n. 14
0
class TestBayesianEstimator(unittest.TestCase):
    """Unit tests for pgmpy's BayesianEstimator parameter-learning API."""

    def setUp(self):
        """Build the shared collider structure A -> C <- B and three estimators."""
        self.m1 = BayesianModel([("A", "C"), ("B", "C")])
        # d1: minimal 3-row dataset; d2: larger dataset with an extra A state
        # and string-valued B.
        self.d1 = pd.DataFrame(data={"A": [0, 0, 1], "B": [0, 1, 0], "C": [1, 1, 0]})
        self.d2 = pd.DataFrame(
            data={
                "A": [0, 0, 1, 0, 2, 0, 2, 1, 0, 2],
                "B": ["X", "Y", "X", "Y", "X", "Y", "X", "Y", "X", "Y"],
                "C": [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
            }
        )
        self.est1 = BayesianEstimator(self.m1, self.d1)
        # est2 declares extra, never-observed states via explicit state_names.
        self.est2 = BayesianEstimator(
            self.m1, self.d1, state_names={"A": [0, 1, 2], "B": [0, 1], "C": [0, 1, 23]}
        )
        self.est3 = BayesianEstimator(self.m1, self.d2)

    def test_estimate_cpd_dirichlet(self):
        """Dirichlet priors: pseudo counts are added to the observed counts."""
        cpd_A = self.est1.estimate_cpd(
            "A", prior_type="dirichlet", pseudo_counts=[[0], [1]]
        )
        self.assertEqual(cpd_A, TabularCPD("A", 2, [[0.5], [0.5]]))

        cpd_B = self.est1.estimate_cpd(
            "B", prior_type="dirichlet", pseudo_counts=[[9], [3]]
        )
        self.assertEqual(cpd_B, TabularCPD("B", 2, [[11.0 / 15], [4.0 / 15]]))

        cpd_C = self.est1.estimate_cpd(
            "C",
            prior_type="dirichlet",
            pseudo_counts=[[0.4, 0.4, 0.4, 0.4], [0.6, 0.6, 0.6, 0.6]],
        )
        self.assertEqual(
            cpd_C,
            TabularCPD(
                "C",
                2,
                [[0.2, 0.2, 0.7, 0.4], [0.8, 0.8, 0.3, 0.6]],
                evidence=["A", "B"],
                evidence_card=[2, 2],
            ),
        )

    def test_estimate_cpd_improper_prior(self):
        """All-zero pseudo counts yield NaN columns for unobserved parent states."""
        cpd_C = self.est1.estimate_cpd(
            "C", prior_type="dirichlet", pseudo_counts=[[0, 0, 0, 0], [0, 0, 0, 0]]
        )
        # np.nan (the np.NaN alias was removed in NumPy 2.0).
        cpd_C_correct = TabularCPD(
            "C",
            2,
            [[0.0, 0.0, 1.0, np.nan], [1.0, 1.0, 0.0, np.nan]],
            evidence=["A", "B"],
            evidence_card=[2, 2],
            state_names={"A": [0, 1], "B": [0, 1], "C": [0, 1]},
        )
        # manual comparison because np.nan != np.nan
        self.assertTrue(
            (
                (cpd_C.values == cpd_C_correct.values)
                | np.isnan(cpd_C.values) & np.isnan(cpd_C_correct.values)
            ).all()
        )

    def test_estimate_cpd_shortcuts(self):
        """BDeu and K2 shortcut priors produce the expected smoothed tables."""
        cpd_C1 = self.est2.estimate_cpd(
            "C", prior_type="BDeu", equivalent_sample_size=9
        )
        cpd_C1_correct = TabularCPD(
            "C",
            3,
            [
                [0.2, 0.2, 0.6, 1.0 / 3, 1.0 / 3, 1.0 / 3],
                [0.6, 0.6, 0.2, 1.0 / 3, 1.0 / 3, 1.0 / 3],
                [0.2, 0.2, 0.2, 1.0 / 3, 1.0 / 3, 1.0 / 3],
            ],
            evidence=["A", "B"],
            evidence_card=[3, 2],
        )
        self.assertEqual(cpd_C1, cpd_C1_correct)

        cpd_C2 = self.est3.estimate_cpd("C", prior_type="K2")
        cpd_C2_correct = TabularCPD(
            "C",
            2,
            [
                [0.5, 0.6, 1.0 / 3, 2.0 / 3, 0.75, 2.0 / 3],
                [0.5, 0.4, 2.0 / 3, 1.0 / 3, 0.25, 1.0 / 3],
            ],
            evidence=["A", "B"],
            evidence_card=[3, 2],
        )
        self.assertEqual(cpd_C2, cpd_C2_correct)

    def test_get_parameters(self):
        """get_parameters() with defaults equals per-node estimate_cpd() calls."""
        cpds = set(
            [
                self.est3.estimate_cpd("A"),
                self.est3.estimate_cpd("B"),
                self.est3.estimate_cpd("C"),
            ]
        )
        self.assertSetEqual(set(self.est3.get_parameters()), cpds)

    def test_get_parameters2(self):
        """get_parameters() honors a per-node dict of dirichlet pseudo counts."""
        pseudo_counts = {
            "A": [[1], [2], [3]],
            "B": [[4], [5]],
            "C": [[6, 6, 6, 6, 6, 6], [7, 7, 7, 7, 7, 7]],
        }
        cpds = set(
            [
                self.est3.estimate_cpd(
                    "A", prior_type="dirichlet", pseudo_counts=pseudo_counts["A"]
                ),
                self.est3.estimate_cpd(
                    "B", prior_type="dirichlet", pseudo_counts=pseudo_counts["B"]
                ),
                self.est3.estimate_cpd(
                    "C", prior_type="dirichlet", pseudo_counts=pseudo_counts["C"]
                ),
            ]
        )
        self.assertSetEqual(
            set(
                self.est3.get_parameters(
                    prior_type="dirichlet", pseudo_counts=pseudo_counts
                )
            ),
            cpds,
        )

    def tearDown(self):
        """Release every fixture built in setUp."""
        del self.m1
        del self.d1
        del self.d2
        del self.est1
        del self.est2
        # BUG FIX: est3 was created in setUp but never released here.
        del self.est3
# NOTE: `mle`, `alarm_model`, `samples` and `model_struct` are defined earlier
# in this script.  Inspect two individual MLE-estimated CPDs first.
print(mle.estimate_cpd(node='FIO2'))
print(mle.estimate_cpd(node='CVP'))

# Estimating CPDs for all the nodes in the model
print(mle.get_parameters()[:10]) # Show just the first 10 CPDs in the output

# Verifying that the learned parameters are almost equal.
import numpy as np
print(np.allclose(alarm_model.get_cpds('FIO2').values, mle.estimate_cpd('FIO2').values, atol=0.01))

# Fitting the using Bayesian Estimator
from pgmpy.estimators import BayesianEstimator

best = BayesianEstimator(model=model_struct, data=samples)

# BDeu prior: strength of smoothing is set by equivalent_sample_size.
print(best.estimate_cpd(node='FIO2', prior_type="BDeu", equivalent_sample_size=1000))
# Uniform pseudo count for each state. Can also accept an array of the size of CPD.
print(best.estimate_cpd(node='CVP', prior_type="dirichlet", pseudo_counts=100))

# Learning CPDs for all the nodes in the model. For learning all parameters with BDeU prior, a dict of
# pseudo_counts need to be provided
print(best.get_parameters(prior_type="BDeu", equivalent_sample_size=1000)[:10])

# Shortcut for learning all the parameters and adding the CPDs to the model.

# model.fit() wraps estimator construction + get_parameters + add_cpds in one call.
model_struct = BayesianModel(ebunch=alarm_model.edges())
model_struct.fit(data=samples, estimator=MaximumLikelihoodEstimator)
print(model_struct.get_cpds('FIO2'))

model_struct = BayesianModel(ebunch=alarm_model.edges())
model_struct.fit(data=samples, estimator=BayesianEstimator, prior_type='BDeu', equivalent_sample_size=1000)
Esempio n. 16
0
    def opt(self, file1, file2):
        """Score a user-supplied network, suggest one edge addition and one
        deletion via hill-climbing + mutual information, and print fitted CPTs.

        file1: text file whose first line lists nodes and whose second line
               lists edges (both parsed by self.getegdes).
        file2: CSV of observations for the listed nodes.
        """
        # Read the graph description; the context manager closes the handle
        # deterministically (previously the file was never closed).
        with open(file1, encoding="utf8") as f1:
            lines = f1.readlines()
        nodes = self.getegdes(lines[0])
        edges = self.getegdes(lines[1])
        data = pd.read_csv(file2)

        G = BayesianModel()
        G.add_nodes_from(nodes)
        # `edges` is a flat list [u0, v0, u1, v1, ...]; pair consecutive entries.
        for i in range(int(len(edges) / 2)):
            G.add_edge(edges[2 * i], edges[2 * i + 1])
        # nx.draw(G)
        # plt.show()
        # Score the user's structure under three standard criteria.
        k2 = K2Score(data).score(G)
        bic = BicScore(data).score(G)
        bdeu = BDeuScore(data).score(G)
        print(k2, ",", bic, ",", bdeu)

        # Learn a reference structure by hill-climbing on the K2 score.
        est = HillClimbSearch(data, scoring_method=K2Score(data))
        model = est.estimate()
        model_edges = model.edges()
        G_ = nx.DiGraph()
        G_.add_edges_from(model_edges)
        G_copy = nx.DiGraph()
        G_copy.add_edges_from(G.edges)
        add = []
        add_mut = []
        delete = []
        delete_mut = []
        # Candidate additions: learned edges absent from G that would not
        # create a directed cycle, weighted by mutual information.
        for edge in model_edges:
            node1 = edge[0]
            node2 = edge[1]
            if not nx.has_path(G, node2, node1):
                if not G.has_edge(node1, node2):
                    this = (node1, node2)
                    # this = '('+node1+','+node2+')'
                    add.append(this)
                    mut = mr.mutual_info_score(data[node1], data[node2])
                    add_mut.append(mut)
        seq = list(zip(add_mut, add))
        seq = sorted(seq, key=lambda s: s[0], reverse=True)
        # alpha is only used by the commented-out thresholding below.
        alpha = 0.015
        # if seq[0][0] > alpha:
        #     add = seq[0:1]

        # Keep only the single strongest candidate addition.
        add = seq[0:1]

        # Candidate deletions: all existing edges, ranked by ascending MI so
        # the weakest edge comes first.
        data_edges = []
        for edge in G.edges:
            node1 = edge[0]
            node2 = edge[1]
            mut = mr.mutual_info_score(data[node1], data[node2])
            delete_mut.append(mut)
            data_edges.append(edge)
            # if not (nx.has_path(G_, node1, node2) or nx.has_path(G_, node2, node1)):
            #     this = '('+node1+','+node2+')'
            #     delete.append(this)
        seq = list(zip(delete_mut, data_edges))
        seq = sorted(seq, key=lambda s: s[0])

        # if seq[0][0] < alpha:
        #     delete = seq[0:1]
        if len(edges) > 2:
            delete = seq[0:1]
            if len(add) > 0:
                # Never delete an edge stronger (higher MI) than the one added.
                if delete[0][0] > add[0][0]:
                    delete = []

        print('add')
        for i in add:
            print(str(i[1]) + "," + str(i[0]))

        print('delete')
        for j in delete:
            print(str(j[1]) + "," + str(j[0]))
            # print(j[0])

        print('cpt')
        # Fit CPTs for the user's structure with a K2 prior and print each as a
        # tab-separated line: name, cardinality, table, parents, parent cards.
        estimator = BayesianEstimator(G, data)
        for i in G.nodes:
            cpd = estimator.estimate_cpd(i, prior_type="K2")
            nodeName = i
            values = dict(data[i].value_counts())
            valueNum = len(values)
            CPT = np.transpose(cpd.values)
            # CPT = cpd.values
            sequence = cpd.variables[1::]
            card = []
            for x in sequence:
                s = len(dict(data[x].value_counts()))
                card.append(s)
            output = nodeName + '\t' + str(valueNum) + '\t' + str(
                CPT.tolist()) + '\t' + str(sequence) + '\t' + str(card)
            print(output)

        print('mutual')
        # Pairwise MI: first for the user's edge list, then for every ordered
        # pair of distinct nodes.
        output1 = []
        for i in range(int(len(edges) / 2)):
            mut = mr.mutual_info_score(data[edges[2 * i]],
                                       data[edges[2 * i + 1]])
            output1.append(mut)
        output2 = {}
        for node1 in G.nodes():
            d = {}
            for node2 in G.nodes():
                if node1 == node2:
                    continue
                mut = mr.mutual_info_score(data[node1], data[node2])

                d[node2] = mut
            output2[node1] = d
        print(output1)
        print(output2)
Esempio n. 17
0
# Structure learning: hill-climb search over `df`, scored by K2.
hill_climber = HillClimbSearch(df, scoring_method=K2Score(df))
learned_structure = hill_climber.estimate()

print(learned_structure.edges())

# In[163]:

# Hand-specified Bayesian network over features f1..f9.
feature_net = BayesianModel([('f3', 'f4'), ('f3', 'f9'), ('f3', 'f8'), ('f5', 'f9'),
                             ('f5', 'f3'), ('f9', 'f8'), ('f9', 'f7'), ('f9', 'f1'),
                             ('f9', 'f6'), ('f9', 'f2'), ('f9', 'f4')])

# Bayesian parameter estimation: one CPD per feature node with a K2 prior.
param_estimator = BayesianEstimator(feature_net, df)
feature_cpds = [
    param_estimator.estimate_cpd('f%d' % k, prior_type='K2',
                                 equivalent_sample_size=50)
    for k in range(1, 10)
]

# Associate the learned CPDs with the network.
feature_net.add_cpds(*feature_cpds)

# check_model checks for the network structure and CPDs and verifies that the CPDs are correctly
# defined and sum to 1.
Esempio n. 18
0
 def estimate_parameters(self):
     """Re-fit CPD values for LAN/MOTOR/WORLD nodes from self.learning_data."""
     data = pd.DataFrame(data=self.learning_data)
     estimator = BayesianEstimator(self.pgmpy, data)
     for i, node in enumerate(self.nodes):
         if 'LAN' in node[0] or 'MOTOR' in node[0] or 'WORLD' in node[0]:
             # NOTE(review): the CPD is always estimated for 'WORLD_0' no matter
             # which node's values are being overwritten — looks like a
             # copy/paste bug; confirm whether node[0] was intended here.
             self.pgmpy.get_cpds(node[0]).values = estimator.estimate_cpd('WORLD_0', prior_type='dirichlet', pseudo_counts=[2, 3]).values
Esempio n. 19
0
from pgmpy.models import BayesianModel
from pgmpy.estimators import BayesianEstimator

# Test 1: dirichlet prior on a tiny collider A -> C <- B.
data = pd.DataFrame(data={
    'A': [0.0, 0.0, 1.0],
    'B': [0.0, 1.0, 0.0],
    'C': [1.0, 1.0, 0.0]
})
#data = pd.DataFrame(data={'A': [0, 0, 1], 'B': [0, 1, 0], 'C': [1, 1, 0]})

print(data)
collider = BayesianModel([('A', 'C'), ('B', 'C')])
dirichlet_est = BayesianEstimator(collider, data)
child_cpd = dirichlet_est.estimate_cpd('C',
                                       prior_type="dirichlet",
                                       pseudo_counts=[1, 2])
print(child_cpd)

# Test 2: BDeu prior on 1000 random binary samples over four variables.
random_samples = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 4)),
                              columns=list('ABCD'))
chain_net = BayesianModel([('A', 'B'), ('C', 'B'), ('C', 'D')])
bdeu_est = BayesianEstimator(chain_net, random_samples)
# Print every estimated CPD in turn.
for param_cpd in bdeu_est.get_parameters(prior_type='BDeu', equivalent_sample_size=5):
    print(param_cpd)
Esempio n. 20
0
# Build 24 "passive" labels as one comma-joined string (split below).
passive_users = "passive, "*24

# Split the comma-joined label strings into lists, dropping the empty tail
# element left by the trailing separator.
# NOTE(review): every element after the first keeps a leading space
# (e.g. " passive") because split(",") does not strip — harmless if applied
# consistently to both lists, but verify downstream comparisons.
active_users = [elem for elem in active_users.strip().split(",") if elem != '']
passive_users = [elem for elem in passive_users.strip().split(",") if elem != '']

# Assemble the training frame; high/medium/low, dhigh/dmedium/dlow and
# pvhigh/pvmedium/pvlow are defined earlier in the script and are assumed to
# align row-wise with the concatenated user labels.
data = pd.DataFrame(data = {'last_activity' : high + medium + low, 'duration': dhigh + dmedium + dlow, 'pages_viewed': pvhigh + pvmedium + pvlow, 'user_type' : active_users + passive_users })

# Simple chain: last_activity -> duration -> pages_viewed -> user_type.
model = BayesianModel([ 
	('last_activity', 'duration'),
	('duration', 'pages_viewed'), 
	('pages_viewed', 'user_type')])

pe = ParameterEstimator(model, data)

#print("\n", pe.state_counts('last_activity'))  # unconditional
#print("\n", pe.state_counts('user_type'))  # conditional on fruit and size

mle = MaximumLikelihoodEstimator(model, data)
#print(mle.estimate_cpd('last_activity'))  # unconditional
#print(mle.estimate_cpd('user_type'))  # conditional


# Calibrate all CPDs of `model` using MLE:
model.fit(data)

# Re-estimate the user_type CPD with a BDeu prior for comparison with MLE.
est = BayesianEstimator(model, data)

result = est.estimate_cpd('user_type', prior_type='BDeu', equivalent_sample_size=10)
# Drop into an interactive shell so results can be inspected by hand.
import code
code.interact(local=locals())
Esempio n. 21
0
def distribution(excel_rows, item_name, items, file_name, df_cols,
                 groupby_cols, bp_group):
    """Estimate per-item Orientation probabilities with a BDeu prior, attach
    95% normal-approximation confidence intervals, write the table to Excel,
    and draw a grouped box plot.

    excel_rows:  list whose first element is the header row; result rows are
                 appended to it in place.
    item_name:   grouping column — "Monkey" or "gender" (anything else leaves
                 `df` unset; see note below).
    items:       values of `item_name` to iterate over.
    file_name:   basename for the ".xlsx" and ".png" outputs.
    df_cols / groupby_cols / bp_group: column selections for the grouped
                 box-plot summary.
    """
    # Using 95% confidence interval
    # (1-0.95)/2
    Z_score = abs(st.norm.ppf(0.025))
    alpha = 1 - 0.95
    data_files = {}
    Orientations = ["left", "right"]

    # create dataframe
    for item in items:
        # NOTE(review): relies on module-level monkey_df / gender_df / model /
        # categories; if item_name matches neither branch, `df` is undefined
        # and the next line raises NameError.
        if item_name == "Monkey":
            df = (monkey_df[(monkey_df.Monkey == item)])
        elif item_name == "gender":
            df = (gender_df[(gender_df.gender == item)])
        z = BayesianEstimator(model, df)
        cat_cpd = z.estimate_cpd('Orientation',
                                 prior_type="bdeu",
                                 equivalent_sample_size=6)  # .to_factor()
        # Enumerate every (left category, right category, orientation) cell.
        for left in categories:
            for right in categories:
                for cat in Orientations:
                    try:
                        count = z.state_counts('Orientation')[left][right][cat]
                        prob = cat_cpd.get_value(
                            **{
                                'Left_categ': left,
                                'Right_categ': right,
                                'Orientation': cat
                            })

                        # p_hat and q_hat set to conservative since we have no previous data #0.5 for each
                        # Since its probability I clip to 0
                        lower_ci = max(
                            prob - Z_score * math.sqrt(
                                (0.5 * 0.5) / df.shape[0]), 0)
                        upper_ci = prob + Z_score * math.sqrt(
                            (0.5 * 0.5) / df.shape[0])
                        if not isNaN(prob) and prob > 0:
                            excel_rows.append([
                                item, left, right, cat, count, prob, lower_ci,
                                upper_ci, alpha
                            ])
                        else:
                            pass
                            # excel_rows.append([item, left, right, cat, count, prob, 0, 0, 0])
                    except KeyError:
                        # Unobserved state combination — skip it silently.
                        pass
                        # excel_rows.append([item, left, right, cat, count, 0, 0 , 0, 0])

    # First excel_rows entry is the header; the rest are data rows.
    prob_df = pd.DataFrame.from_records(excel_rows[1:], columns=excel_rows[0])
    gen_df = prob_df[df_cols].groupby(groupby_cols)['Count'].agg(
        ['sum'])  # .reset_index()

    ax, bp = gen_df.boxplot(rot=90,
                            fontsize=12,
                            figsize=(16, 10),
                            column=['sum'],
                            by=bp_group,
                            return_type="both")[0]
    plt.title(item_name.capitalize() + " Box plot grouped by : " +
              str(bp_group))
    plt.suptitle('')
    plt.ylabel("sum")

    # group = ['Left-Category', 'Category']
    # ax, bp = gen_df.boxplot(rot=90, fontsize=12, figsize=(24, 12), column=['sum'], by=group, return_type="both")[0]
    # plt.title("Box plot grouped by : " + str(group))
    # plt.suptitle('')
    # plt.ylabel("sum")
    #
    #
    # group = ['Right-Category', 'Category']
    # ax, bp = gen_df.boxplot(rot=90, fontsize=12, figsize=(24, 12), column=['sum'], by=group, return_type="both")[0]
    # plt.title("Box plot grouped by : " + str(group))
    # plt.suptitle('')
    # plt.ylabel("sum")

    # Full distribution plus a per-item "preference" sheet keeping only the
    # highest-probability row for each item.
    writer = pd.ExcelWriter(file_name + ".xlsx")
    prob_df.to_excel(writer, sheet_name='Distribution')
    prob_df.sort_values('Probability', ascending=False).drop_duplicates(
        [item_name]).to_excel(writer, sheet_name='prefference')
    writer.save()

    plt.savefig(file_name + ".png", dpi=100)
    plt.show()
    plt.clf()