def create(self, network_factory):
    network = network_factory.create()

    # Mixture naive Bayes: a single latent cluster variable is the parent of
    # every observed variable.
    cluster = builder.try_get_node(network, "Cluster")
    if cluster is None:
        cluster = builder.create_cluster_variable(
            network, self._latent_states,
            variable_name=self._latent_variable_name)

    if not dk.empty(self._continuous):
        for c_name in self._continuous.columns:
            self._logger.info("Pre-processing {} column".format(c_name))
            c = builder.create_continuous_variable(network, c_name)
            try:
                builder.create_link(network, cluster, c)
            except ValueError as e:
                self._logger.warning(e)

    if not dk.empty(self._discrete):
        for d_name in self._discrete.columns:
            # Use explicitly supplied states where available, otherwise infer
            # them from the distinct non-null values in the column.
            if d_name in self._discrete_states:
                states = self._discrete_states[str(d_name)]
            else:
                states = dk.compute(
                    self._discrete[str(d_name)].dropna().unique()).tolist()

            try:
                c = builder.create_discrete_variable(
                    network, self._discrete, str(d_name), states)
                builder.create_link(network, cluster, c)
            except BaseException as e:
                self._logger.warning(e)

    return network
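
# A minimal standalone sketch (plain Python, no bayesianpy dependency) of the
# star topology the template above produces: one latent "Cluster" node with a
# directed edge to every observed column. Illustrative only; the real network
# is built by the Bayes Server builder calls in create().
def _sketch_mixture_topology(observed_columns):
    return [("Cluster", str(col)) for col in observed_columns]

# _sketch_mixture_topology(["Age", "Fare", "Sex"])
# -> [('Cluster', 'Age'), ('Cluster', 'Fare'), ('Cluster', 'Sex')]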
def create(self, network_factory):
    network = network_factory.create()

    if not dk.empty(self._continuous):
        for c_name in self._continuous.columns:
            c = builder.create_continuous_variable(network, c_name)

    if not dk.empty(self._discrete):
        for d_name in self._discrete.columns:
            if d_name in self._discrete_states:
                states = self._discrete_states[d_name]
            else:
                states = dk.compute(
                    self._discrete[d_name].dropna().unique()).tolist()

            try:
                c = builder.create_discrete_variable(
                    network, self._discrete, d_name, states)
            except BaseException as e:
                self._logger.warning(e)

    parent_node = builder.try_get_node(network, self._parent_node)
    if parent_node is None:
        raise ValueError(
            "Parent node: {} not recognised".format(self._parent_node))

    # Link the designated parent to every other node in the network.
    for node in network.getNodes():
        if node == parent_node:
            continue
        builder.create_link(network, parent_node, node)

    return network
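
# Hypothetical usage sketch for the template above. The constructor arguments
# (discrete, continuous, parent_node) are inferred from the attributes that
# create() reads; the template's actual class name and signature should be
# checked against bayesianpy.template before use.
#
# tpl = bayesianpy.template.NaiveBayes(logger,
#                                      discrete=df[discrete_cols],
#                                      continuous=df[continuous_cols],
#                                      parent_node="Survived")
# network = tpl.create(network_factory)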
# Imports needed by this example script. The location of the dk helpers
# (compute()/empty()) is an assumption; check how the surrounding project
# imports them.
import logging
import os

import dask.dataframe as dd
from sklearn.metrics import accuracy_score

import bayesianpy
import bayesianpy.dask as dk


def main():
    logger = logging.getLogger()
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)
    bayesianpy.jni.attach(logger)

    db_folder = bayesianpy.utils.get_path_to_parent_dir(__file__)
    titanic_dask = dd.read_csv(os.path.join(db_folder, "data/titanic.csv"))

    auto = bayesianpy.data.AutoType(titanic_dask)
    network_factory = bayesianpy.network.NetworkFactory(logger)

    discrete = list(auto.get_discrete_variables())
    continuous = list(auto.get_continuous_variables())

    # Write the data to a temporary SQLite db.
    with bayesianpy.data.DataSet(titanic_dask, db_folder, logger) as dataset:
        # Learn the model structure with a built-in algorithm, or use a
        # standard template, which generally gives good performance.
        mixture_naive_bayes_tpl = bayesianpy.template.MixtureNaiveBayes(
            logger,
            discrete=titanic_dask[discrete],
            continuous=titanic_dask[continuous])

        model = bayesianpy.model.NetworkModel(
            mixture_naive_bayes_tpl.create(network_factory), logger)

        # train() returns a set of metrics about the training step.
        model.train(dataset.subset(dk.compute(titanic_dask.index).tolist()))

        # Note that the target column is never 'dropped'; its evidence is
        # retracted at query time by specifying
        # query_options.setQueryEvidenceMode(
        #     bayesServerInference().QueryEvidenceMode.RETRACT_QUERY_EVIDENCE)
        results = model.batch_query(
            dataset.subset(dk.compute(titanic_dask.index).tolist()),
            [bayesianpy.model.QueryMostLikelyState(
                "Survived", output_dtype=titanic_dask['Survived'].dtype)])

        # Each query appends one or more columns to the original dataframe, so
        # results is the original data plus (in this case) one extra column,
        # 'Survived_maxlikelihood'.
        score = accuracy_score(
            y_pred=dk.compute(results['Survived_maxlikelihood']).tolist(),
            y_true=dk.compute(results['Survived']).tolist())

        logger.info("Score was {}.".format(score))
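
# Standard entry point so the example runs directly as a script.
if __name__ == "__main__":
    main()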
def could_be_int(col):
    if DataFrame.is_int(col.dtype):
        return True

    if DataFrame.is_float(col.dtype):
        for val in dk.compute(col.dropna().unique()):
            if int(val) != val:
                return False
        return True

    return False
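
# A minimal standalone illustration of the same heuristic using plain pandas
# and numpy (DataFrame.is_int/is_float above are bayesianpy helpers,
# approximated here with numpy dtype checks).
import numpy as np
import pandas as pd

def could_be_int_demo(col: pd.Series) -> bool:
    if np.issubdtype(col.dtype, np.integer):
        return True
    if np.issubdtype(col.dtype, np.floating):
        # A float column counts as int-like only if every non-null value
        # is a whole number.
        return bool((col.dropna() % 1 == 0).all())
    return False

# could_be_int_demo(pd.Series([1.0, 2.0, None]))  -> True
# could_be_int_demo(pd.Series([1.5, 2.0]))        -> False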
def get_discrete_variables(self):
    continuous = set(self.get_continuous_variables())
    for col in self._df.columns.tolist():
        num_states = len(dk.compute(self._df[str(col)].unique()))
        if col in self._continuous:
            # Explicitly declared continuous.
            continue
        elif num_states > self._max_states or num_states <= 1:
            # Constant, or too many distinct values to treat as discrete.
            continue
        elif DataFrame.is_timestamp(self._df[str(col)].dtype):
            continue
        elif col in self._discrete:
            # Explicitly declared discrete.
            yield str(col)
        elif col not in continuous:
            yield str(col)
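
# A toy illustration of the state-count heuristic above (pandas assumed):
# columns with between 2 and max_states distinct values are discrete
# candidates; constant and high-cardinality columns are skipped. The real
# method additionally excludes known-continuous and timestamp columns.
import pandas as pd

df_demo = pd.DataFrame({
    "Sex": ["male", "female", "male"],   # 2 states -> discrete candidate
    "Fare": [7.25, 71.28, 8.05],         # excluded upstream as continuous
    "Constant": [1, 1, 1],               # 1 state -> skipped
})

max_states = 30
candidates = [c for c in df_demo.columns
              if 1 < df_demo[c].nunique() <= max_states]
# candidates -> ['Sex', 'Fare']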
def create(self, network_factory: bayesianpy.network.NetworkFactory):
    network = network_factory.create()
    cluster = builder.create_cluster_variable(network, self._latent_states)

    if not dk.empty(self._continuous):
        for c_name in self._continuous.columns:
            # Discretise each continuous column into self._bin_count bins
            # before linking it to the latent cluster node.
            c = builder.create_discretised_variable(
                network, self._continuous, c_name,
                bin_count=self._bin_count,
                mode=self._binning_mode,
                zero_crossing=self._zero_crossing)
            builder.create_link(network, cluster, c)

    if not dk.empty(self._discrete):
        for d_name in self._discrete.columns:
            states = dk.compute(
                self._discrete[d_name].dropna().unique()).tolist()
            c = builder.create_discrete_variable(
                network, self._discrete, d_name, states)
            builder.create_link(network, cluster, c)

    return network
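
# The discretised template bins each continuous column before linking it to
# the cluster. A plain-pandas sketch of two common binning modes; whether
# these match the options accepted by self._binning_mode is an assumption.
import pandas as pd

fares = pd.Series([7.25, 8.05, 26.55, 71.28, 512.33])
equal_width = pd.cut(fares, bins=3)   # bins spanning equal value ranges
equal_freq = pd.qcut(fares, q=3)      # bins holding ~equal row counts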