def testLocalSearchWithTabu(self):
    """A BNLearner built on a non-existing CSV file must raise gum.IOError."""
    # assertRaises already verifies both that the exception is raised and
    # its type; the legacy try/except/else duplicate of the very same check
    # has been removed.
    with self.assertRaises(gum.IOError):
        gum.BNLearner("shouldNotExist.csv")
def testPseudoCount(self):
    # Build a tiny 3-column database on the fly.
    filename = self.agrumSrcDir('dataW.csv')
    with open(filename, "w") as src:
        src.write("""X,Y,Z
0,1,2
0,1,0
0,0,2
1,0,2
0,1,1
1,1,1
0,1,1
""")
    learner = gum.BNLearner(filename)
    self.assertEqual(learner.nbRows(), 7)
    self.assertEqual(learner.nbCols(), 3)
    # rawPseudoCount returns the joint counts as a flat tuple.
    self.assertEqual(learner.rawPseudoCount(["X"]), (5, 2))
    self.assertEqual(learner.rawPseudoCount(["X", "Z"]), (1, 0, 2, 1, 2, 1))
    self.assertEqual(learner.rawPseudoCount(["Y", "Z"]), (0, 1, 0, 3, 2, 1))
    # The smoothing prior adds 0.1 to every raw count.
    learner.useSmoothingPrior(0.1)
    self.assertEqual(learner.rawPseudoCount(["X"]), (5.1, 2.1))
    self.assertEqual(learner.rawPseudoCount(["X", "Z"]),
                     (1.1, 0.1, 2.1, 1.1, 2.1, 1.1))
    self.assertEqual(learner.rawPseudoCount(["Y", "Z"]),
                     (0.1, 1.1, 0.1, 3.1, 2.1, 1.1))
    # Fresh learner: pseudoCount returns an array-like whose tolist()
    # yields nested lists (one sub-list per value of the last variable).
    learner = gum.BNLearner(filename)
    self.assertEqual(learner.pseudoCount(["X"]).tolist(), [5, 2])
    self.assertEqual(
        learner.pseudoCount(["X", "Z"]).tolist(), [[1, 0], [2, 1], [2, 1]])
    self.assertEqual(
        learner.pseudoCount(["Y", "Z"]).tolist(), [[0, 1], [0, 3], [2, 1]])
    self.assertEqual(
        learner.pseudoCount(["Z", "Y"]).tolist(), [[0, 0, 2], [1, 3, 1]])
    # Same checks with the 0.1 smoothing prior applied.
    learner.useSmoothingPrior(0.1)
    self.assertEqual(learner.pseudoCount(["X"]).tolist(), [5.1, 2.1])
    self.assertEqual(
        learner.pseudoCount(["X", "Z"]).tolist(),
        [[1.1, 0.1], [2.1, 1.1], [2.1, 1.1]])
    self.assertEqual(
        learner.pseudoCount(["Y", "Z"]).tolist(),
        [[0.1, 1.1], [0.1, 3.1], [2.1, 1.1]])
    self.assertEqual(
        learner.pseudoCount(["Z", "Y"]).tolist(),
        [[0.1, 0.1, 2.1], [1.1, 3.1, 1.1]])
def GSMN(mn, fileName, threshold=0.05):
    """
    Learn the structure of a Markov network, using the GSMN algorithm on a
    given database.

    Examples
    --------
    >>> mn=mnl.GSMN(template,"./samples/sampleMN.csv",0.0001)

    Parameters
    ----------
    mn : pyAgrum.MarkovNet
      the template of the markov network
    fileName : str
      path to the CSV database the structure is learned from
    threshold : float
      default value : 0.05, hyperparameter used for the statistical test

    Returns
    -------
    pyAgrum.MarkovNet
      the learned markov network
    """
    V = mn.names()
    # Keep the variable objects of the template so the learned network
    # reuses the same domains.
    mnVariables = dict()
    for name in V:
        mnVariables[name] = mn.variableFromName(name)
    # One Markov blanket per variable, found by the GS subroutine.
    MB = dict()
    learner = gum.BNLearner(fileName)
    for variable in V:
        MB[variable] = GS(variable, V, learner, threshold)
    # Symmetrize the blankets before building the network.
    correctError(MB)
    mn = dictToMarkovNetwork(MB, mnVariables)
    return mn
def test3off2(self):
    """3off2 + NML correction must honour the arc constraints and yield the expected structure sizes."""
    learner = gum.BNLearner(self.agrumSrcDir('asia.csv'))
    learner.use3off2()
    learner.useNMLCorrection()
    learner.addForbiddenArc(4, 1)
    learner.addMandatoryArc(7, 5)

    # Empty 8-node initial DAG.
    d = gum.DAG()
    for i in range(8):
        d.addNodeWithId(i)
    learner.setInitialDAG(d)
    self.assertNotEqual(len(learner.names()), 0)

    # `except Exception` instead of a bare except so SystemExit and
    # KeyboardInterrupt are not swallowed by the test.
    try:
        bn = learner.learnBN()
    except Exception:
        self.fail("Exception has been raised and should not")
    self.assertEqual(len(bn.arcs()), 9)
    self.assertFalse(bn.dag().existsArc(4, 1))
    self.assertTrue(bn.dag().existsArc(7, 5))

    try:
        mg = learner.learnMixedStructure()
    except Exception:
        self.fail("Exception has been raised and should not")
    self.assertEqual(mg.sizeArcs(), 8)
    self.assertEqual(mg.sizeEdges(), 1)
    self.assertFalse(bn.dag().existsArc(4, 1))
    self.assertTrue(bn.dag().existsArc(7, 5))
    self.assertEqual(len(learner.latentVariables()), 2)
def run_bn_unsup(train_corr, test_corr, structure):
    """
    Learn a BN from train_corr, then propagate each row of test_corr as
    evidence through it, and build a new data set from the posteriors.

    :param train_corr: training data, NOT in one-hot encoding form
    :param test_corr: test data that is being updated, in one-hot encoding form
    :param structure: structure of the data (how many categories each attribute has)
    :return: numpy array, same shape as test_corr, filled with the posteriors
    """
    # Leading 0 so cumulative sums give slice boundaries per attribute.
    structure_0 = [0] + structure

    # Learn the BN based on train_corr.
    learner = gum.BNLearner(train_corr)
    learner.useScoreBDeu()
    bn = learner.learnBN()

    # Placeholder for the new data.
    new_data = np.zeros(test_corr.shape)

    # The inference engine is created ONCE and reused: evidence is erased at
    # the end of every iteration, so rebuilding it per row (as the previous
    # code did) was wasted work.
    ie = gum.LazyPropagation(bn)
    for i in range(test_corr.shape[0]):
        dp = test_corr[i, :]  # fix one observation
        # Convert the observation to the {node: soft-evidence} dict expected
        # by setEvidence; slice k of dp belongs to the k-th node.
        evs = {}
        for k, n in enumerate(bn.nodes()):
            evs[n] = dp[sum(structure_0[:k + 1]):sum(structure_0[:k + 2])]
        ie.setEvidence(evs)
        # Extract the posteriors and store them in new_data.
        pst = [ie.posterior(n).toarray() for n in bn.nodes()]
        new_data[i, :] = list(itertools.chain.from_iterable(pst))
        ie.eraseAllEvidence()
    return new_data
def testHillClimbing(self):
    """Greedy hill climbing on asia.csv yields an 8-node BN; a missing file raises gum.IOError."""
    learner = gum.BNLearner(self.agrumSrcDir('asia.csv'))
    learner.useGreedyHillClimbing()
    bn = learner.learnBN()
    self.assertEqual(bn.size(), 8)

    # assertRaises already verifies the exception and its type; the
    # redundant manual try/except/else duplicate has been removed.
    with self.assertRaises(gum.IOError):
        gum.BNLearner("shouldNotExist.csv")
def independenceListForPairs(bn, filename, target=None, plot=True, alphabetic=False):
    """
    get the p-values of the chi2 test of a (as simple as possible) independence proposition for every non arc.

    Parameters
    ----------
    bn : gum.BayesNet
      the Bayesian network

    filename : str
      the name of the csv database

    alphabetic : bool
      if True, the list is alphabetically sorted else it is sorted by the p-value

    target: (optional) str or int
      the name or id of the target variable

    plot : bool
      if True, plot the result

    Returns
    -------
      the list
    """
    learner = gum.BNLearner(filename, bn)
    vals = {}
    # One chi2 test per independence proposition (X, Y, conditioning set);
    # chi2(...) returns (stat, p-value) — only the p-value is kept.
    for indep in _independenceListForPairs(bn, target):
        vals[indep] = learner.chi2(*indep)[1]
    if plot:
        # Build LaTeX labels of the form "$X \perp Y \mid Z1,Z2$".
        plotvals = dict()
        for indep in vals:
            key = "$" + indep[0] + " \\perp " + indep[1]
            if len(indep[2]) > 0:
                key += " \\mid " + ",".join(indep[2])
            key += "$"
            plotvals[key] = vals[indep]

        # Sorted by p-value by default, alphabetically (insertion order)
        # when requested.
        if not alphabetic:
            sortedkeys = sorted(plotvals, key=plotvals.__getitem__, reverse=False)
        else:
            sortedkeys = list(plotvals.keys())

        fig = pylab.figure(figsize=(10, 1 + 0.25 * len(plotvals)))
        ax = fig.add_subplot(1, 1, 1)
        ax.plot([plotvals[k] for k in sortedkeys], sortedkeys, "o")
        ax.grid(True)
        # Highlight the classical 5% significance threshold.
        ax.vlines(x=0.05, ymin=-0.5, ymax=len(vals) - 0.5, colors='purple')
        ax.add_patch(
            mpl.patches.Rectangle((0, -0.5), 0.05, len(vals), color="yellow"))
    return vals
def testHybridLearning(self):
    """Learn a skeleton with MIIC, then learn a BN constrained to that skeleton."""
    # Step 1: extract the skeleton of the essential graph found by MIIC.
    miic_learner = gum.BNLearner(self.agrumSrcDir('data1.csv'))
    miic_learner.useMIIC()
    skeleton = miic_learner.learnEssentialGraph().skeleton()

    # Step 2: score-based learning restricted to that skeleton.
    constrained = gum.BNLearner(self.agrumSrcDir('data1.csv'))
    constrained.setPossibleSkeleton(skeleton)
    bn = constrained.learnBN()

    node = bn.idFromName
    self.assertEqual(bn.sizeArcs(), 4)
    self.assertEqual(bn.parents(node("V")), {node("A")})
    self.assertEqual(bn.parents(node("Y")), {node("X"), node("V")})
    self.assertEqual(bn.parents(node("Z")), {node("Y")})
def learnBN(file_path: str, algorithm: BN_Algorithm = BN_Algorithm.HillClimbing):
    """
    Learn a Bayesian network from a CSV database.

    :param file_path: path to the CSV database
    :param algorithm: the structure-learning algorithm to use
    :return: the learned gum.BayesNet
    :raises Exception: when the algorithm is not supported
    """
    learner = gum.BNLearner(file_path)
    if algorithm == BN_Algorithm.HillClimbing:
        print_big("Selecting Greedy Hill Climbing Algorithm")
        learner.useGreedyHillClimbing()
    elif algorithm == BN_Algorithm.LocalSearch:
        print_big("Selecting Local Search Algorithm")
        # use*() only configures the learner; the previous code misleadingly
        # assigned its return value to `bn` (overwritten below anyway).
        learner.useLocalSearchWithTabuList()
    elif algorithm == BN_Algorithm.ThreeOffTwo:
        print_big("Selecting 3Off2 Algorithm")
        learner.use3off2()
    elif algorithm == BN_Algorithm.MIIC:
        print_big("Selecting MIIC Algorithm")
        learner.useMIIC()
    else:
        raise Exception('Not supported algorithm')
    return learner.learnBN()
def createBayesianNetwork():
    """Learn a 13-node BN from the whole log file with K2 and display it."""
    learner = gum.BNLearner("logs/Log/WholeLog.csv")
    # K2 needs a total order on the 13 columns; use the natural one.
    learner.useK2(list(range(13)))
    network = learner.learnBN()
    print("Learned in {0}s".format(learner.currentTime()))
    gnb.showBN(network)
    return network
def learn_bn(filepath, learn_algo):
    """
    Build a BNLearner on `filepath`, configure it according to `learn_algo`
    ('hard-coded', 'hillclimbing', 'tabu' or 'k2') and the global sim_param
    score flags, then learn and return the BN.

    :param filepath: path to the CSV database
    :param learn_algo: name of the learning method to use
    :return: the learned BN
    """
    try:
        learner = agrum.BNLearner(filepath)
    except IOError as e:
        print("learn_and_create_bn - I/O error({0}): {1}".format(
            e.errno, e.strerror))
        # BUG FIX: `exit` was a bare name (a no-op expression), so execution
        # fell through with `learner` undefined and raised NameError below.
        sys.exit(1)

    # Select learning method.
    if learn_algo == 'hard-coded':
        learner.addMandatoryArc(0, 3)
        learner.addMandatoryArc(1, 3)
        learner.addMandatoryArc(2, 3)
        if sim_param.inclination_param:
            learner.addMandatoryArc(0, 4)
            learner.addMandatoryArc(1, 4)
            learner.addMandatoryArc(2, 4)
            learner.addMandatoryArc(3, 4)
    elif learn_algo == 'hillclimbing':
        learner.useGreedyHillClimbing()
    elif learn_algo == 'tabu':
        learner.useLocalSearchWithTabuList()
    elif learn_algo == 'k2':
        # K2 ordering depends on whether the inclination node (4) exists.
        if sim_param.inclination_param:
            learner.useK2([4, 3, 2, 1, 0])
        else:
            learner.useK2([3, 2, 1, 0])
    else:
        print(
            'ERROR - learn_bn : there was a problem while selecting the learner'
        )
        sys.exit()

    # Select score (BDEU by default).
    if sim_param.score_likelihood:
        learner.useScoreLog2Likelihood()
        learner.useAprioriSmoothing()
    if sim_param.score_bic:
        learner.useScoreBIC()
        learner.useAprioriSmoothing()
    if sim_param.score_aic:
        learner.useScoreAIC()
        learner.useAprioriSmoothing()
    if sim_param.score_k2:
        learner.useScoreK2()

    bn = learner.learnBN()
    if sim_param.debug:
        print("BN learned.\n", bn)
    return bn
def testLocalSearchWithTabuAccurate(self):
    """Tabu-list local search must land close (in KL) to the reference asia2 net."""
    learner = gum.BNLearner(self.agrumSrcDir('asia.csv'))
    learner.useLocalSearchWithTabuList()
    learned = learner.learnBN()

    reference = gum.loadBN(self.agrumSrcDir('asia2.bif'), verbose=False)
    distances = gum.ExactBNdistance(learned, reference).compute()
    self.assertAlmostEqual(distances['klPQ'], 0.5, delta=0.5)
def test_setSliceOrder_with_names(self):
    """setSliceOrder accepts names, ids or a mix; duplicates and unknown columns raise."""
    csv = self.agrumSrcDir('asia3.csv')

    # All slices given by column names.
    by_names = gum.BNLearner(csv)
    by_names.setSliceOrder([["smoking?", "lung_cancer?"],
                            ["bronchitis?", "visit_to_Asia?"],
                            ["tuberculosis?"]])

    # Names and column ids can be mixed.
    mixed = gum.BNLearner(csv)
    mixed.setSliceOrder([[0, "lung_cancer?"],
                         [2, "visit_to_Asia?"],
                         ["tuberculosis?"]])

    failing = gum.BNLearner(csv)
    # Column 0 is "smoking?", already present in the first slice.
    with self.assertRaises(gum.DuplicateElement):
        failing.setSliceOrder([["smoking?", "lung_cancer?"],
                               [0, "visit_to_Asia?"],
                               ["tuberculosis?"]])
    # "CRUCRU?" is not a column of the database.
    with self.assertRaises(gum.MissingVariableInDatabase):
        failing.setSliceOrder([["smoking?", "lung_cancer?"],
                               ["bronchitis?", "CRUCRU?"],
                               ["tuberculosis?"]])
def test_chi2(self):
    """chi2 statistics and p-values against precomputed reference values."""
    learner = gum.BNLearner(self.agrumSrcDir('asia3.csv'))
    # (x, y, conditioning set or None, expected stat, expected p-value)
    asia_cases = [
        ("smoking?", "lung_cancer?", None, 36.2256, 0.0),
        ("smoking?", "visit_to_Asia?", None, 1.1257, 0.2886),
        ("lung_cancer?", "tuberculosis?", None, 0.6297, 0.4274),
        ("lung_cancer?", "tuberculosis?", ["tuberculos_or_cancer?"], 58.0, 0.0),
    ]
    for x, y, knowing, expected_stat, expected_p in asia_cases:
        if knowing is None:
            stat, pvalue = learner.chi2(x, y)
        else:
            stat, pvalue = learner.chi2(x, y, knowing)
        self.assertAlmostEqual(stat, expected_stat, delta=1e-4)
        self.assertAlmostEqual(pvalue, expected_p, delta=1e-4)

    learner2 = gum.BNLearner(self.agrumSrcDir('chi2.csv'))
    # (x, y, conditioning, stat, stat delta, p-value, p-value delta)
    chi2_cases = [
        ("A", "C", None, 0.0007, 1e-3, 0.978, 1e-3),
        ("A", "B", None, 21.4348, 1e-3, 3.6e-6, 1e-5),
        ("B", "A", None, 21.4348, 1e-3, 3.6e-6, 1e-5),
        ("B", "D", None, 0.903, 1e-3, 0.341, 1e-3),
        ("A", "C", ["B"], 15.2205, 1e-3, 0.0005, 1e-4),
    ]
    for x, y, knowing, expected_stat, d_stat, expected_p, d_p in chi2_cases:
        if knowing is None:
            stat, pvalue = learner2.chi2(x, y)
        else:
            stat, pvalue = learner2.chi2(x, y, knowing)
        self.assertAlmostEqual(stat, expected_stat, delta=d_stat)
        self.assertAlmostEqual(pvalue, expected_p, delta=d_p)
def test_EM(self):
    """EM parameter learning on a database with missing values."""
    # With "#" as the missing-value symbol the database is complete...
    complete = gum.BNLearner(self.agrumSrcDir('EM.csv'), ["#"])
    self.assertFalse(complete.hasMissingValues())

    # ...with "?" it is not.
    learner = gum.BNLearner(self.agrumSrcDir('EM.csv'), ["?"])
    self.assertTrue(learner.hasMissingValues())

    # Chain DAG 3 -> 2 -> 1 -> 0 over the database columns.
    dag = gum.DAG()
    for node_id in range(len(learner.names())):
        dag.addNodeWithId(node_id)
    for tail, head in [(1, 0), (2, 1), (3, 2)]:
        dag.addArc(tail, head)

    # Without EM, missing values make parameter learning fail.
    with self.assertRaises(gum.MissingValueInDatabase):
        learner.learnParameters(dag)

    # With EM (and a smoothing prior) it succeeds.
    learner.useEM(1e-3)
    learner.useSmoothingPrior()
    learner.learnParameters(dag, False)
def testParameterLearning(self):
    """Parameter learning must keep the variable domains of the template BN."""
    template = gum.loadBN(self.agrumSrcDir('asia_bool.bif'), verbose=False)
    learner = gum.BNLearner(self.agrumSrcDir('asia3.csv'), template)
    learner.setInitialDAG(template.dag())
    learner.useScoreLog2Likelihood()
    learner.useSmoothingPrior(1.0)

    learned = learner.learnParameters()
    # Every learned variable must keep the label set of its template twin.
    for i in range(template.size()):
        name = learned.variable(i).name()
        self.assertEqual(
            set(learned.variable(i).labels()),
            set(template.variable(template.idFromName(name)).labels()))

    template = gum.loadBN(self.agrumSrcDir('asia_bool.bif'), verbose=False)
    # asia3-faulty.csv contains a "beurk" modality unknown to the template.
    with self.assertRaises(gum.UnknownLabelInDatabase):
        gum.BNLearner(self.agrumSrcDir('asia3-faulty.csv'), template)
def testDBNTonda(self):
    # Build the 2-TBN template: 5 quantities, each with a _0 (t=0) and a
    # _t (t+1) copy.
    dbn = gum.BayesNet()
    l = [
        dbn.add(gum.LabelizedVariable(name, name, nbr))
        for (name, nbr) in [("bf_0", 4), ("bf_t", 4), ("c_0", 5), (
            "c_t", 5), ("h_0", 5), ("h_t", 5), ("tf_0", 5), ("tf_t", 5),
                            ("wl_0", 4), ("wl_t", 4)]
    ]
    # tf_0 and bf_0 influence every time-slice-t node; each of c, h, wl
    # also depends on its own previous value.
    for node in ["c_t", "h_t", "wl_t"]:
        dbn.addArc(dbn.idFromName("tf_0"), dbn.idFromName(node))
        dbn.addArc(dbn.idFromName("bf_0"), dbn.idFromName(node))
    dbn.addArc(dbn.idFromName("c_0"), dbn.idFromName("c_t"))
    dbn.addArc(dbn.idFromName("h_0"), dbn.idFromName("h_t"))
    dbn.addArc(dbn.idFromName("wl_0"), dbn.idFromName("wl_t"))

    csvfile = self.agrumSrcDir('DBN_Tonda.csv')
    # Learner built WITHOUT the template BN...
    l1 = gum.BNLearner(csvfile)
    l1.setInitialDAG(dbn.dag())
    l1.useScoreLog2Likelihood()
    l1.useSmoothingPrior()
    bn1 = l1.learnParameters()
    # ...and WITH it: both must yield identical parameters.
    l2 = gum.BNLearner(csvfile, dbn)
    l2.setInitialDAG(dbn.dag())
    l2.useScoreLog2Likelihood()
    l2.useSmoothingPrior()
    bn2 = l2.learnParameters()

    # Compare the two CPTs of c_0 cell by cell via parallel instantiations.
    p1 = bn1.cpt(bn1.idFromName("c_0"))
    I1 = gum.Instantiation(p1)
    p2 = bn2.cpt(bn2.idFromName("c_0"))
    I2 = gum.Instantiation(p2)
    I1.setFirst()
    I2.setFirst()
    while not I1.end():
        self.assertEqual(p1.get(I1), p2.get(I2))
        I1.inc()
        I2.inc()
def learn_parameters(bn_struct, ficname):
    """Learn the CPT parameters of the structure `bn_struct` from the database `ficname`."""
    # Build the DAG described by bn_struct (row i lists the parents of node i).
    graph = gum.DAG()
    nodes = [graph.addNode() for _ in range(bn_struct.shape[0])]
    for child in range(bn_struct.shape[0]):
        for parent in bn_struct[child]:
            graph.addArc(nodes[parent], nodes[child])

    # Let BNLearner estimate the parameters on that fixed structure.
    learner = gum.BNLearner(ficname)
    learner.useScoreLog2Likelihood()
    learner.useAprioriSmoothing()
    return learner.learnParameters(graph)
def predict(self, dataset: DatasetInterface) -> List[Relation]:
    """Learn a BN from the dataset's file and convert its arcs into relations."""
    # BNLearner cannot consume a DataFrame directly: load from the CSV file.
    learner = gum.BNLearner(str(dataset.get_filepath()))
    if self.algorithm == self.LEARNER_GES:
        # Greedy search.
        learner.useGreedyHillClimbing()
    else:
        # Tabu search.
        learner.useLocalSearchWithTabuList()
    return self.__build_relations(learner.learnBN())
def run(df, pc=None):
    """
    Run greedy hill climbing against the dataframe and return a dot string,
    or None on failure.
    """
    try:
        # Context manager so the temporary CSV handle is closed and the file
        # removed deterministically (the previous code leaked the handle
        # until garbage collection).
        with tempfile.NamedTemporaryFile(suffix='.csv') as fp:
            df.to_csv(fp.name, encoding='utf-8', index=False)
            learner = gum.BNLearner(fp.name)
            learner.useGreedyHillClimbing()
            bn = learner.learnBN()
            return bn.toDot()
    except Exception as e:
        _logger.error(str(e))
        print(str(e))
        return None
def learnBN(file_path, algorithm="Hill Climbing"):
    """
    Learn a Bayesian network from a CSV database.

    Parameters
    ----------
    file_path : str
        path to the CSV database
    algorithm : str, optional
        one of "Hill Climbing" (default), "Local Search", "3off2", "miic"

    Returns
    -------
    list
        [learned BN, information widget for the BN, its essential graph]
    """
    # NOTE(review): the previous docstring was copy-pasted from an unrelated
    # permutation-sampling helper; it has been replaced.
    learner = gum.BNLearner(file_path)
    # elif chain: the algorithm choices are mutually exclusive.
    if algorithm == "Hill Climbing":
        print("Selecting Greedy Hill Climbing Algorithm")
        learner.useGreedyHillClimbing()
    elif algorithm == "Local Search":
        print("Selecting Local Search Algorithm")
        # use*() only configures the learner; assigning its result to `bn`
        # (as the previous code did) was misleading.
        learner.useLocalSearchWithTabuList()
    elif algorithm == "3off2":
        print("Selecting 3Off2 Algorithm")
        learner.use3off2()
    elif algorithm == "miic":
        print("Selecting MIIC Algorithm")
        learner.useMIIC()
    # learnBN() was previously called twice, discarding the first result.
    bn = learner.learnBN()
    essencGraph = gum.EssentialGraph(bn)
    infoBN = gnb.getInformation(bn)
    return [bn, infoBN, essencGraph]
def run(df, pc=None):
    """
    Run MIIC against the dataframe and return the list of unobserved latent
    edges as (name, name) pairs, or None on failure.
    """
    try:
        # Context manager so the temporary CSV handle is closed and the file
        # removed deterministically (the previous code leaked the handle
        # until garbage collection).
        with tempfile.NamedTemporaryFile(suffix='.csv') as fp:
            df.to_csv(fp.name, encoding='utf-8', index=False)
            learner = gum.BNLearner(fp.name)
            learner.useMIIC()
            bn = learner.learnBN()
            return [(bn.variable(i).name(), bn.variable(j).name())
                    for (i, j) in learner.latentVariables()]
    except Exception as e:
        _logger.error(str(e))
        print(str(e))
        return None
def test_dbWithGuil(self):
    # Write a CSV with unbalanced double quotes: parsing must fail with a
    # SyntaxError somewhere in the learner pipeline.
    filename = self.agrumSrcDir('csv_quoted.csv')
    with open(filename, "w") as src:
        src.write("""X,Y,Z
0,1,2
0,1",0
0,0,2
1,"0,2
0,"1",1
1,1,1
0,1,1
""")
    with self.assertRaises(SyntaxError):
        learner = gum.BNLearner(filename)
        learner.useScoreBIC()
        learner.useGreedyHillClimbing()
        bn = learner.learnBN()
def testHillClimbingAccurate(self):
    """Hill climbing on asia.csv: expected column names and distance to the reference net."""
    learner = gum.BNLearner(self.agrumSrcDir('asia.csv'))
    witness = {
        'smoking?', 'lung_cancer?', 'bronchitis?', 'visit_to_Asia?',
        'tuberculosis?', 'tuberculos_or_cancer?', 'dyspnoea?',
        'positive_XraY?'
    }
    # Mutual inclusion of names() and the witness list == set equality.
    self.assertEqual(set(learner.names()), witness)

    learner.useGreedyHillClimbing()
    learned = learner.learnBN()
    reference = gum.loadBN(self.agrumSrcDir('asia2.bif'), verbose=False)
    distances = gum.ExactBNdistance(learned, reference).compute()
    self.assertAlmostEqual(distances['klPQ'], 0.5, delta=0.5)
def learn_bn(filepath, learn_algo):
    """
    Build a BNLearner on `filepath`, configure it according to `learn_algo`
    ('hand-coded', 'hillclimbing', 'tabu' or 'k2') and the sim_param score
    flags, then learn and return the BN.

    :param filepath: path to the CSV database
    :param learn_algo: name of the learning method to use
    :return: the learned BN
    """
    try:
        learner = agrum.BNLearner(filepath)
    except IOError as e:
        print("learn_and_create_bn - I/O error({0}): {1}".format(
            e.errno, e.strerror))
        # BUG FIX: `exit` was a bare name (a no-op expression), so
        # `learner` stayed undefined and the next statement raised NameError.
        sys.exit(1)

    # Select learning method.
    if learn_algo == 'hand-coded':
        learner.addMandatoryArc(0, 2)
        learner.addMandatoryArc(1, 2)
        if sim_param.distance_param:
            learner.addMandatoryArc(3, 2)
    elif learn_algo == 'hillclimbing':
        learner.useGreedyHillClimbing()
    elif learn_algo == 'tabu':
        learner.useLocalSearchWithTabuList()
    elif learn_algo == 'k2':
        learner.useK2([3, 2, 1, 0])
    else:
        print(
            'ERROR - learn_bn : there was a problem while selecting the learner'
        )
        sys.exit()

    # Select score (BDEU by default).
    if sim_param.score_likelihood:
        learner.useScoreLog2Likelihood()
        learner.setMaxIndegree(2)
    if sim_param.score_bic:
        # BUG FIX: the call parentheses were missing (`learner.useScoreBIC`),
        # so the attribute was looked up but the BIC score never selected.
        learner.useScoreBIC()
    if sim_param.score_aic:
        learner.useScoreAIC()  # BUG FIX: same missing-call defect as above

    bn = learner.learnBN()
    print("BN learned.\n", bn)
    return bn
def test_dirichlet(self):
    """Dirichlet prior read from a generated database."""
    # Two synthetic 2000-row databases: one for the prior, one for the data.
    prior_bn = gum.fastBN("A->B<-C->D->E<-B")
    gum.generateSample(prior_bn,
                       2000,
                       self.agrumSrcDir("dirichlet.csv"),
                       with_labels=True)
    data_bn = gum.fastBN("A->B->C->D->E")
    gum.generateSample(data_bn,
                       2000,
                       self.agrumSrcDir("database.csv"),
                       with_labels=True)

    # prior_bn is used to give the variables and their domains.
    learner = gum.BNLearner(self.agrumSrcDir("database.csv"), prior_bn)
    learner.useDirichletPrior(self.agrumSrcDir("dirichlet.csv"), 10)
    # A score with no included prior (AIC, or BDeu) must be used.
    learner.useScoreAIC()
    learned = learner.learnBN()
    self.assertEqual(prior_bn.size(), 5)
def learnDAG(sample, dis_method='quantile', nbins=5, threshold=25):
    """Discretize the sample, learn a DAG with MIIC/NML and return (NamedDAG, start, end) timings."""
    names = list(sample.getDescription())

    # Export the sample to a temporary CSV file for BNLearner.
    handle = tf.NamedTemporaryFile(delete=False)
    csvfilename = handle.name + '.csv'
    handle.close()
    sample.exportToCSVFile(csvfilename, ',')

    start = time.time()
    # Template BN holding one discretized variable per column.
    discretizer = skbn.BNDiscretizer(defaultDiscretizationMethod=dis_method,
                                     defaultNumberOfBins=nbins,
                                     discretizationThreshold=threshold)
    template = gum.BayesNet()
    for name in names:
        template.add(discretizer.createVariable(name, sample.getMarginal([name])))

    learner = gum.BNLearner(csvfilename, template)
    learner.useMIIC()
    learner.useNMLCorrection()
    ndag = otagr.NamedDAG(learner.learnDAG(), names)
    end = time.time()

    os.remove(csvfilename)
    return ndag, start, end
def test_loglikelihood(self):
    """logLikelihood must equal -N*H for the chi2.csv database."""
    learner = gum.BNLearner(self.agrumSrcDir('chi2.csv'))
    self.assertEqual(learner.nbRows(), 500)
    self.assertEqual(learner.nbCols(), 4)

    minus_n = -1.0 * learner.nbRows()

    # (variables, conditioning set, expected entropy H) — LL = -N*H.
    cases = [
        (["A"], [], 0.99943499),
        (["B"], [], 0.9986032),
        (["A", "B"], [], 1.9668973),
        (["A"], ["B"], 1.9668973 - 0.9986032),
        (["C"], [], 0.99860302),
        (["D"], [], 0.40217919),
        (["C", "D"], [], 1.40077995),
        (["C"], ["D"], 1.40077995 - 0.40217919),
    ]
    for variables, knowing, expected in cases:
        if knowing:
            stat = learner.logLikelihood(variables, knowing) / minus_n
        else:
            stat = learner.logLikelihood(variables) / minus_n
        self.assertAlmostEqual(stat, expected, delta=1e-5)
def predicted_val(dataframe, dbn, predicted_var_0, unique_time_series, var_ts,
                  valid_file, training_file, variable_0, read_var_0,
                  global_pred_rate, averages):
    # Leave-one-series-out loop: for each time series, train the 2-TBN on
    # every other series, unroll it, propagate the series' observations as
    # evidence and write the deprobabilized predictions to a CSV file.
    # NOTE(review): the exact nesting was reconstructed from a whitespace-
    # mangled source — confirm against the original file.
    with open("Predicted_Values_Optimized.csv", "w", newline='') as fp:
        # Header: id columns plus one column per predicted variable and step.
        header = ["parcelle", "sequence"]
        for variable in predicted_var_0:
            for i in range(1, 3):
                variable_t = variable + "_" + str(i)
                header.append(variable_t)
        w = csv.writer(fp)
        w.writerow(header)
        for index, ts in enumerate(unique_time_series):
            # Split: the current series is the validation set.
            valid_df = dataframe[dataframe[var_ts] == ts].copy()
            valid_df = valid_df.reset_index()
            training_df = dataframe[dataframe[var_ts] != ts].copy()
            training_df = training_df.reset_index()
            valid_df.to_csv(valid_file, index=False)
            training_df.to_csv(training_file, index=False)
            # Re-learn only the parameters of the fixed 2-TBN structure.
            learner = gum.BNLearner(training_file, dbn)
            learner.setInitialDAG(dbn.dag())
            learner.useScoreLog2Likelihood()
            learner.useAprioriSmoothing(0.01)
            dbn_2 = learner.learnParameters(dbn.dag())
            steps = 3
            bn = gdyn.unroll2TBN(dbn_2, steps)
            # bn_to_pdf(bn, "DBN_unrolled_2.pdf")
            # Evidence at t=0 comes from the first row of the series.
            dictionary = dict()
            for variable in variable_0:
                tempdf = valid_df.loc[0, :]
                dictionary[variable] = int(tempdf[variable])
            original_values = dict()
            for variable in predicted_var_0:
                original_values[variable] = []
            global_pred_rate[index] = dict()
            pred_row = list(valid_df[["parcelle", "sequence"]].loc[0, :])
            # Collect evidence for steps 1..steps-1 and remember the true
            # values of the predicted variables.
            for recorder in range(0, steps - 1):
                for variable in read_var_0:
                    row = valid_df.loc[recorder, :]
                    variable_name = variable + "_" + str(recorder + 1)
                    var_name_in_df = variable + "_t"
                    dictionary[variable_name] = int(row[var_name_in_df])
                for variable in predicted_var_0:
                    var_name_in_df = variable + "_t"
                    original_values[variable].append(int(row[var_name_in_df]))
            inference = gum.LazyPropagation(bn)
            inference.setEvidence(dictionary)
            inference.makeInference()
            pred_values = dict()
            predicted_values = dict()
            for variable in predicted_var_0:
                predicted_values[variable] = []
            for variable in predicted_var_0:
                pred_values[variable] = dict()
                for i in range(1, steps):
                    variable_t = variable + "_" + str(i)
                    # NOTE(review): this rebinding clobbers the dict built
                    # above, and index_used/t_list are computed but never
                    # used afterwards — looks like leftover code; confirm.
                    predicted_values = []
                    predicted_values = inference.posterior(
                        bn.idFromName(variable_t))[:]
                    index_used = original_values[variable][i - 1]
                    t_list = [0] * len(predicted_values)
                    t_list[index_used] = 1
                    # Turn the posterior distribution back into a value.
                    pred_values[variable][variable_t] = deprobabilize(
                        averages[variable], predicted_values)
                    pred_row.append(pred_values[variable][variable_t])
            w.writerow(pred_row)
            # gnb.showPotential(bn.cpt("s_2"))
    return
# Template BN: declares every variable and its labels up front so that the
# learner knows the domains before reading the database.
template=gum.BayesNet()
template.add(gum.LabelizedVariable("target", "target", ['<=50K', '>50K']))
template.add(gum.LabelizedVariable("sex", "sex",['Male','Female']))
template.add(gum.LabelizedVariable("age_range", "age_range",['0-20','21-30','31-65','66-90']))
template.add(gum.LabelizedVariable("race", "race",['White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other', 'Black']))
template.add(gum.LabelizedVariable("workclass", "workclass",['Private', 'Self-emp-not-inc', 'Self-emp-inc', 'Federal-gov', 'Local-gov', 'State-gov', 'Without-pay', 'Never-worked']))
template.add(gum.LabelizedVariable("relationship", "relationship", ['Wife', 'Own-child', 'Husband', 'Not-in-family', 'Other-relative', 'Unmarried']))
template.add(gum.LabelizedVariable("marital_status", "marital_status", ['Married-civ-spouse', 'Divorced', 'Never-married', 'Separated', 'Widowed', 'Married-spouse-absent', 'Married-AF-spouse']))
template.add(gum.LabelizedVariable("occupation", "occupation",['Tech-support', 'Craft-repair', 'Other-service', 'Sales', 'Exec-managerial', 'Prof-specialty', 'Handlers-cleaners', 'Machine-op-inspct', 'Adm-clerical', 'Farming-fishing', 'Transport-moving', 'Priv-house-serv', 'Protective-serv', 'Armed-Forces']))
gnb.showBN(template)
# Dump the training dataframe to Drive, then learn structure+parameters
# from it using the template's variable domains.
train_df.to_csv(os.path.join('/content/gdrive/My Drive/train_data2.csv'), index=False)
file = os.path.join('res', 'titanic', '/content/gdrive/My Drive/train_data2.csv')
learner = gum.BNLearner(file, template)
bn = learner.learnBN()
# Bare expression: displays the BN in the notebook output cell.
bn
gnb.showInformation(bn,{},size="20")
gnb.showInference(bn)
# Posterior on income for a young male.
gnb.showPosterior(bn,evs={"sex": "Male", "age_range": '21-30'},target='target')
gnb.sideBySide(bn, gum.MarkovBlanket(bn, 'target'), captions=["Learned Bayesian Network", "Markov blanket of 'target'"])
ie=gum.LazyPropagation(bn)
init_belief(ie)