Esempio n. 1
0
    def create(self, network_factory):
        """Build a network where a cluster node is linked to every data column.

        A network is obtained from ``network_factory.create()``. The node
        named "Cluster" is reused when ``builder.try_get_node`` finds it;
        otherwise a cluster variable is created from ``self._latent_states``
        and ``self._latent_variable_name``. One continuous variable is then
        added per column of ``self._continuous`` and one discrete variable per
        column of ``self._discrete``, each linked from the cluster node.

        Discrete states come from ``self._discrete_states`` when an entry
        exists for the column, otherwise from the distinct non-null values of
        the column.

        :param network_factory: object whose ``create()`` returns the network
            to populate.
        :return: the populated network.
        """
        network = network_factory.create()
        cluster = builder.try_get_node(network, "Cluster")
        if cluster is None:
            cluster = builder.create_cluster_variable(
                network,
                self._latent_states,
                variable_name=self._latent_variable_name)

        if not dk.empty(self._continuous):
            for c_name in self._continuous.columns:
                self._logger.info("Pre-processing {} column".format(c_name))
                c = builder.create_continuous_variable(network, c_name)
                try:
                    builder.create_link(network, cluster, c)
                except ValueError as e:
                    # Best effort: a link that cannot be created is reported
                    # and the remaining columns are still processed.
                    self._logger.warning(e)

        if not dk.empty(self._discrete):
            for d_name in self._discrete.columns:
                # Use the string form of the column name for both the
                # membership test and the lookup so they cannot disagree.
                name = str(d_name)
                if name in self._discrete_states:
                    states = self._discrete_states[name]
                else:
                    states = dk.compute(
                        self._discrete[name].dropna().unique()).tolist()

                try:
                    c = builder.create_discrete_variable(
                        network, self._discrete, name, states)

                    builder.create_link(network, cluster, c)
                except Exception as e:
                    # Exception rather than BaseException, so that
                    # KeyboardInterrupt / SystemExit are not swallowed.
                    self._logger.warning(e)

        return network
Esempio n. 2
0
    def create(self, network_factory):
        """Build a network where ``self._parent_node`` is linked to all others.

        A network is obtained from ``network_factory.create()``. One
        continuous variable is added per column of ``self._continuous`` and
        one discrete variable per column of ``self._discrete``. Discrete
        states come from ``self._discrete_states`` when the column has an
        entry there, otherwise from the distinct non-null values of the
        column. Finally a link is created from the parent node to every other
        node in the network.

        :param network_factory: object whose ``create()`` returns the network
            to populate.
        :return: the populated network.
        :raises ValueError: if no node named ``self._parent_node`` exists in
            the network once the variables have been added.
        """
        network = network_factory.create()

        if not dk.empty(self._continuous):
            for c_name in self._continuous.columns:
                builder.create_continuous_variable(network, c_name)

        # Discrete variables are added only when there IS discrete data; the
        # previous check was inverted and skipped them in exactly that case.
        if not dk.empty(self._discrete):
            for d_name in self._discrete.columns:
                if d_name in self._discrete_states:
                    states = self._discrete_states[d_name]
                else:
                    states = dk.compute(
                        self._discrete[d_name].dropna().unique()).tolist()

                try:
                    builder.create_discrete_variable(
                        network, self._discrete, d_name, states)
                except Exception as e:
                    # Best effort: report the failure and carry on with the
                    # remaining columns. Exception (not BaseException) so that
                    # KeyboardInterrupt / SystemExit still propagate.
                    self._logger.warning(e)

        parent_node = builder.try_get_node(network, self._parent_node)
        if parent_node is None:
            raise ValueError("Parent node: {} not recognised".format(
                self._parent_node))

        for node in network.getNodes():
            if node == parent_node:
                continue
            builder.create_link(network, parent_node, node)

        return network
Esempio n. 3
0
def main():
    """Train a mixture naive Bayes model on the Titanic data and log accuracy.

    Reads ``data/titanic.csv`` relative to the directory returned by
    ``bayesianpy.utils.get_path_to_parent_dir(__file__)``, builds a network
    from the ``MixtureNaiveBayes`` template, trains it on every row, queries
    the most likely state of "Survived" for the same rows and logs the
    resulting accuracy score.
    """
    # Configure the root logger first, then hand it to the JNI bridge once.
    logger = logging.getLogger()
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)

    bayesianpy.jni.attach(logger)

    db_folder = bayesianpy.utils.get_path_to_parent_dir(__file__)
    titanic_dask = dd.read_csv(os.path.join(db_folder, "data/titanic.csv"))

    auto = bayesianpy.data.AutoType(titanic_dask)
    network_factory = bayesianpy.network.NetworkFactory(logger)

    discrete = auto.get_discrete_variables()
    continuous = auto.get_continuous_variables()

    # write data to the temporary sqlite db
    with bayesianpy.data.DataSet(titanic_dask, db_folder, logger) as dataset:
        # Use a standard template, which generally gives good performance
        mixture_naive_bayes_tpl = bayesianpy.template.MixtureNaiveBayes(
            logger,
            discrete=titanic_dask[discrete],
            continuous=titanic_dask[continuous])

        model = bayesianpy.model.NetworkModel(
            mixture_naive_bayes_tpl.create(network_factory),
            logger)

        # Materialise the index once; it is used for both training and
        # querying, and computing it is not free on a dask dataframe.
        row_ids = dk.compute(titanic_dask.index).tolist()

        # result contains a bunch of metrics regarding the training step
        model.train(dataset.subset(row_ids))

        # note that we've not 'dropped' the target data anywhere, this will be retracted when it's queried,
        # by specifying query_options.setQueryEvidenceMode(bayesServerInference().QueryEvidenceMode.RETRACT_QUERY_EVIDENCE)
        results = model.batch_query(dataset.subset(row_ids), [
            bayesianpy.model.QueryMostLikelyState(
                "Survived", output_dtype=titanic_dask['Survived'].dtype)])

        # Each query just appends a column / columns on to the original dataframe, so results holds the queried rows
        # with (in this case) one additional column called 'Survived_maxlikelihood', joined to the original.
        score = accuracy_score(
            y_pred=dk.compute(results['Survived_maxlikelihood']).tolist(),
            y_true=dk.compute(results['Survived']).tolist())

        logger.info("Score was {}.".format(score))
Esempio n. 4
0
    def could_be_int(col):
        """Return True if the values of *col* are all whole numbers.

        A column whose dtype satisfies ``DataFrame.is_int`` qualifies
        immediately. A column whose dtype satisfies ``DataFrame.is_float``
        qualifies when every distinct non-null value has no fractional part
        (an all-null float column therefore qualifies too). Any other dtype
        does not qualify.

        :param col: column exposing ``dtype``, ``dropna()`` and ``unique()``.
        :return: bool
        """
        if DataFrame.is_int(col.dtype):
            return True

        if DataFrame.is_float(col.dtype):
            # float.is_integer() is simply False for +/-inf, whereas the
            # int(val) != val comparison raised OverflowError on them.
            return all(float(val).is_integer()
                       for val in dk.compute(col.dropna().unique()))

        return False
Esempio n. 5
0
 def get_discrete_variables(self):
     """Yield the names (as ``str``) of columns to be treated as discrete.

     A column of ``self._df`` is skipped when it is listed in
     ``self._continuous``, when its dtype satisfies
     ``DataFrame.is_timestamp``, or when its number of distinct values is
     greater than ``self._max_states`` or at most 1. A remaining column is
     yielded when it is listed in ``self._discrete`` or is not among the
     names returned by ``self.get_continuous_variables()``.
     """
     continuous = set(self.get_continuous_variables())
     for col in self._df.columns.tolist():
         # Cheap exclusions first, so the distinct-value computation below
         # only runs for columns that can still be yielded.
         if col in self._continuous:
             continue

         name = str(col)
         column = self._df[name]
         if DataFrame.is_timestamp(column.dtype):
             continue

         n_unique = len(dk.compute(column.unique()))
         if n_unique > self._max_states or n_unique <= 1:
             continue

         if col in self._discrete or col not in continuous:
             yield name
Esempio n. 6
0
    def create(self, network_factory: bayesianpy.network.NetworkFactory):
        """Build a network linking a cluster variable to every data column.

        The network comes from ``network_factory.create()`` and a cluster
        variable is created from ``self._latent_states``. Each column of
        ``self._continuous`` is added through
        ``builder.create_discretised_variable`` using the configured bin
        count, binning mode and zero-crossing setting; each column of
        ``self._discrete`` is added as a discrete variable whose states are
        the distinct non-null values of that column. Every variable is linked
        from the cluster variable as soon as it is created.

        :param network_factory: factory whose ``create()`` returns the network
            to populate.
        :return: the populated network.
        """
        network = network_factory.create()
        cluster = builder.create_cluster_variable(network, self._latent_states)

        has_continuous = not dk.empty(self._continuous)
        if has_continuous:
            for column in self._continuous.columns:
                binned_node = builder.create_discretised_variable(
                    network,
                    self._continuous,
                    column,
                    bin_count=self._bin_count,
                    mode=self._binning_mode,
                    zero_crossing=self._zero_crossing,
                )
                builder.create_link(network, cluster, binned_node)

        has_discrete = not dk.empty(self._discrete)
        if has_discrete:
            for column in self._discrete.columns:
                distinct_values = self._discrete[column].dropna().unique()
                discrete_node = builder.create_discrete_variable(
                    network,
                    self._discrete,
                    column,
                    dk.compute(distinct_values),
                )
                builder.create_link(network, cluster, discrete_node)

        return network