def testLogit(self):
  """Check the CPT produced by a logit node against hand-computed probabilities."""
  net = gum.BayesNet()
  age = net.add(gum.RangeVariable("age", "", 35, 67))
  taux = net.add(gum.RangeVariable("taux", "", 115, 171))
  angine = net.add(gum.LabelizedVariable("angine", ""))

  heart = gum.LabelizedVariable("coeur", "", 0)
  heart.addLabel("NON").addLabel("OUI")
  coeur = net.addLogit(heart, 14.4937)

  net.addWeightedArc(age, coeur, -0.1256)
  net.addWeightedArc(taux, coeur, -0.0636)
  net.addWeightedArc(angine, coeur, 1.779)

  # (age, taux, angine, coeur, expected probability)
  cases = (
    ("50", "126", "1", "OUI", 0.8786),
    ("49", "126", "0", "OUI", 0.5807),
    ("46", "144", "0", "OUI", 0.3912),
    ("49", "139", "0", "OUI", 0.3773),
    ("62", "154", "1", "OUI", 0.2127),
    ("35", "156", "1", "OUI", 0.8760),
    ("67", "160", "0", "NON", 1 - 0.0163),
    ("65", "140", "0", "NON", 1 - 0.0710),
    ("47", "143", "0", "NON", 1 - 0.3765),
  )

  inst = gum.Instantiation(net.cpt(coeur))
  for w_age, w_taux, w_angine, w_coeur, expected in cases:
    inst.chgVal(net.variable(age), net.variable(age)[w_age])
    inst.chgVal(net.variable(taux), net.variable(taux)[w_taux])
    inst.chgVal(net.variable(angine), net.variable(angine)[w_angine])
    inst.chgVal(net.variable(coeur), net.variable(coeur)[w_coeur])
    self.assertAlmostEqual(net.cpt(coeur).get(inst), expected, places=3)
def testLabelsOfVars(self):
  """Verify labels() for Labelized, Range and Discretized variables."""
  # a default LabelizedVariable has exactly the two labels "0" and "1", in order
  var = gum.LabelizedVariable("a", "a")
  self.assertEqual(var.labels(), ("0", "1"))
  self.assertNotEqual(var.labels(), ("1", "0"))

  var = gum.LabelizedVariable("b", "b", 0).addLabel("toto").addLabel("titi").addLabel("yes")
  self.assertEqual(var.labels(), ("toto", "titi", "yes"))

  var = gum.RangeVariable("c", "c", 0, 5)
  self.assertEqual(var.labels(), ("0", "1", "2", "3", "4", "5"))

  var = gum.RangeVariable("d", "d", 3, 5)
  self.assertEqual(var.labels(), ("3", "4", "5"))

  var = gum.DiscretizedVariable("e", "e").addTick(1).addTick(2).addTick(3).addTick(4)
  self.assertEqual(var.labels(), ("[1;2[", "[2;3[", "[3;4]"))

  # the empirical flag opens the outermost interval bounds
  var = gum.DiscretizedVariable("e", "e").addTick(1).addTick(2).addTick(3).addTick(4)
  var.setEmpirical(True)
  self.assertEqual(var.labels(), ("(1;2[", "[2;3[", "[3;4)"))

  # ticks passed unsorted to the constructor still yield sorted intervals
  var = gum.DiscretizedVariable("f", "f", [1, 5, 2, 4])
  self.assertEqual(var.labels(), ("[1;2[", "[2;4[", "[4;5]"))

  var = gum.DiscretizedVariable("f", "f", [1, 5, 2, 4])
  var.setEmpirical(True)
  self.assertEqual(var.labels(), ("(1;2[", "[2;4[", "[4;5)"))
def testHashableDiscreteVariable(self):
  """Discrete variables must be hashable: duplicates collapse inside a set."""
  va = gum.LabelizedVariable("a", "a")
  vb = gum.LabelizedVariable("b", "b", ["toto", "titi", "yes"])
  vc = gum.RangeVariable("c", "c", 0, 5)
  vd = gum.RangeVariable("d", "d", 3, 5)
  ve = gum.DiscretizedVariable("e", "e").addTick(1).addTick(2).addTick(3).addTick(4)
  # deliberately repeat vc, ve and va: the set must deduplicate them
  collected = {va, vb, vc, vc, vd, ve, va, ve}
  self.assertEqual(len(collected), 5)
def testCopyConstructor(self):
  """The RangeVariable copy constructor must preserve the bounds of the source."""
  original = gum.RangeVariable("var 1", "this is var 1")
  self.assertEqual(original.varType(), gum.VarType_Range)

  other = gum.RangeVariable("var 2", "this is var 2", 1, 4)
  clone = gum.RangeVariable(original)

  # the clone shares the original's bounds, not the other variable's
  self.assertEqual(clone.minVal(), original.minVal())
  self.assertEqual(clone.maxVal(), original.maxVal())
  self.assertNotEqual(original.maxVal(), other.maxVal())
def testAddDummyVariables(self):
  """Potential.add must reject empty-domain variables and accept them once grown."""
  # --- LabelizedVariable: starts with zero labels
  var = gum.LabelizedVariable("v", "v", 0)
  pot = gum.Potential()
  self.assertEqual(var.domainSize(), 0)
  with self.assertRaises(gum.InvalidArgument):
    pot.add(var)
  var.addLabel("first")
  self.assertEqual(var.domainSize(), 1)
  pot.add(var)
  pot = gum.Potential()
  var.addLabel("second")
  self.assertEqual(var.domainSize(), 2)
  pot.add(var)

  # --- RangeVariable: min > max means an empty domain
  var = gum.RangeVariable("v", "v", 1, 0)
  pot = gum.Potential()
  self.assertEqual(var.domainSize(), 0)
  with self.assertRaises(gum.InvalidArgument):
    pot.add(var)
  var.setMaxVal(1)
  self.assertEqual(var.domainSize(), 1)
  pot.add(var)
  pot = gum.Potential()
  var.setMaxVal(2)
  self.assertEqual(var.domainSize(), 2)
  pot.add(var)

  # --- DiscretizedVariable: one tick is still zero intervals
  var = gum.DiscretizedVariable("v", "v")
  pot = gum.Potential()
  self.assertEqual(var.domainSize(), 0)
  with self.assertRaises(gum.InvalidArgument):
    pot.add(var)
  var.addTick(1)
  self.assertEqual(var.domainSize(), 0)
  with self.assertRaises(gum.InvalidArgument):
    pot.add(var)
  var.addTick(2)
  self.assertEqual(var.domainSize(), 1)
  pot.add(var)
  pot = gum.Potential()
  var.addTick(3)
  self.assertEqual(var.domainSize(), 2)
  pot.add(var)
def testFastBuilders(self):
  """Four ways of building the same network must produce identical variables.

  Compares explicit variable objects (bn1), gum.fastBN (bn2), fast-syntax
  strings passed to add() (bn3), and addVariables()/addArcs() (bn4).

  Fix: `assertEquals` is a long-deprecated alias of `assertEqual` and was
  removed in Python 3.12; all occurrences are replaced by `assertEqual`.
  """
  bn1 = gum.BayesNet()
  bn1.add(gum.RangeVariable("A", "A", 0, 2))
  bn1.add(gum.LabelizedVariable("B", "B", ["a", "b", "c"]))
  bn1.add(gum.RangeVariable("C", "C", 4, 7))
  bn1.add(gum.IntegerVariable("D", "D", [1, 3, 10]))
  bn1.add(gum.DiscretizedVariable("E", "E", [1, 1.5, 3, 3.14, 15]))
  bn1.addArc("A", "B")
  bn1.addArc("B", "C")
  bn1.addArc("C", "D")
  bn1.addArc("D", "E")
  self.assertEqual(len(bn1.check()), 5)  # every cpt is faulty

  bn2 = gum.fastBN("A->B{a|b|c}->C[4,7]->D{1|3|10}->E[1,1.5,3,3.14,15]", 3)
  self.assertEqual(len(bn2.check()), 0)  # but random

  bn3 = gum.BayesNet()
  bn3.add("A", 3)
  bn3.add("B{a|b|c}")
  bn3.add("C[4,7]")
  bn3.add("D{1|3|10}")
  bn3.add("E[1,1.5,3,3.14,15]")
  bn3.addArc("A", "B")
  bn3.addArc("B", "C")
  bn3.addArc("C", "D")
  bn3.addArc("D", "E")
  self.assertEqual(len(bn3.check()), 5)  # every cpt is faulty

  bn4 = gum.BayesNet()
  bn4.addVariables(["A", "B{a|b|c}", "C[4,7]", "D{1|3|10}", "E[1,1.5,3,3.14,15]"], 3)
  bn4.addArcs([("A", "B"), ("B", "C"), ("C", "D"), ("D", "E")])
  self.assertEqual(len(bn4.check()), 5)  # every cpt is faulty

  for name in "ABCDE":
    self.assertEqual(bn1.variable(name), bn2.variable(name))
    self.assertEqual(bn1.variable(name), bn3.variable(name))
    self.assertEqual(bn1.variable(name), bn4.variable(name))
def testLabels(self):
  """Exercise RangeVariable bounds, emptiness, belongs() and label lookup."""
  rv = gum.RangeVariable("var 1", "this is var 1")
  self.assertEqual(rv.domainSize(), 2)
  self.assertFalse(rv.empty())

  # setting min above max empties the domain
  rv.setMinVal(1)
  rv.setMaxVal(0)
  self.assertTrue(rv.empty())

  rv.setMaxVal(9)
  self.assertFalse(rv.empty())
  self.assertEqual(rv.domainSize(), 9)

  self.assertTrue(rv.belongs(3))
  self.assertFalse(rv.belongs(0))
  self.assertFalse(rv.belongs(10))

  # index 1 maps to label "2" and conversely
  self.assertEqual(rv.label(1), "2")
  self.assertEqual(rv["2"], 1)
def testReadAfterWrite(self):
  """Save a BN to o3prm and reload it: dimensions and domains must survive."""
  bn = gum.BayesNet()
  bn.add(gum.RangeVariable("1", "", 0, 1))
  bn.add(gum.DiscretizedVariable("2", "").addTick(0.0).addTick(0.5).addTick(1.0))
  bn.add(gum.LabelizedVariable("3", "", 2))
  bn.add(gum.LabelizedVariable("4", "", 2))
  bn.add(gum.LabelizedVariable("5", "", 3))
  for tail, head in (("1", "3"), ("1", "4"), ("3", "5"),
                     ("4", "5"), ("2", "4"), ("2", "5")):
    bn.addArc(tail, head)

  bn.cpt("1").fillWith([0.2, 0.8])
  bn.cpt("2").fillWith([0.3, 0.7])
  bn.cpt("3").fillWith([0.1, 0.9, 0.9, 0.1])
  bn.cpt("4").fillWith([0.4, 0.6, 0.5, 0.5, 0.5, 0.5, 1.0, 0.0])
  bn.cpt("5").fillWith([
    0.3, 0.6, 0.1, 0.5, 0.5, 0.0, 0.5, 0.5, 0.0, 1.0, 0.0, 0.0,
    0.4, 0.6, 0.0, 0.5, 0.5, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 1.0,
  ])

  gum.saveBN(bn, self.agrumSrcDir("o3prm/BNO3PRMIO_file.o3prm"))
  bn2 = gum.loadBN(self.agrumSrcDir("o3prm/BNO3PRMIO_file.o3prm"), system="bayesnet")

  self.assertEqual(bn.dim(), bn2.dim())
  self.assertEqual(bn.log10DomainSize(), bn2.log10DomainSize())
  for n in bn.names():
    self.assertEqual(bn.variable(n).name(), bn2.variable(n).name())
    self.assertEqual(bn.variable(n).varType(), bn2.variable(n).varType())
    self.assertEqual(bn.variable(n).domainSize(), bn2.variable(n).domainSize())
def test_fromMarginal():
  """Print the otagrum distributions built from marginal Potentials of several variable kinds."""
  print("\n** FromRangeVariable")
  rng_var = gum.RangeVariable("x", "x", 3, 10)
  marginal = gum.Potential().add(rng_var).fillWith([1, 2, 3, 4, 5, 6, 7, 8])
  # NOTE: the conversion happens before marginal is normalized
  converted = otagrum.Utils.FromMarginal(marginal)
  print(marginal.normalize())
  print(converted)

  print("\n** From LabelizedVariable")
  lab_var = gum.LabelizedVariable("y", "y", 0).addLabel("True").addLabel("Maybe").addLabel("False")
  marginal = gum.Potential().add(lab_var).fillWith([2, 8, 4]).normalize()
  print(marginal)
  print(otagrum.Utils.FromMarginal(marginal))

  print("\n** From LabelizedVariable but numerical")
  lab_var = gum.LabelizedVariable("y", "y", 0).addLabel("1").addLabel("1.5").addLabel("3.15")
  marginal = gum.Potential().add(lab_var).fillWith([2, 8, 4]).normalize()
  print(marginal)
  print(otagrum.Utils.FromMarginal(marginal))
def testLoopIn(self):
  """loopIn must visit every cell of a Potential exactly once."""
  u = gum.LabelizedVariable("u", "u", 4)
  v = gum.RangeVariable("v", "v", 1, 5)
  w = gum.DiscretizedVariable("w", "w", [-2, -0.5, 1, 2.5])

  # a dimensionless Potential holds a single cell
  pot = gum.Potential().fillWith(0)
  self.assertEqual(sum(pot.get(inst) for inst in pot.loopIn()), 0)

  pot = gum.Potential().fillWith(42)
  self.assertEqual(sum(pot.get(inst) for inst in pot.loopIn()), 42)

  pot.add(u).add(v).add(w)
  for inst in pot.loopIn():
    pot.set(inst, random.choice([1, 2, 3]))
  # summing cell by cell must agree with the built-in sum()
  self.assertEqual(sum(pot.get(inst) for inst in pot.loopIn()), pot.sum())
def fit(self, X=None, y=None, data=None, targetName=None, filename=None):
  """
  parameters:
      X: {array-like, sparse matrix} of shape (n_samples, n_features)
          training data. Warning: Raises ValueError if either filename or targetname is not None.
          Raises ValueError if y is None.
      y: array-like of shape (n_samples)
          Target values. Warning: Raises ValueError if either filename or targetname is not None.
          Raises ValueError if X is None
      data: Union[str,pandas.DataFrame]
          the source of training data : csv filename or pandas.DataFrame.
          targetName is mandatory to find the class in this source.
      targetName: str
          specifies the name of the targetVariable in the csv file.
          Warning: Raises ValueError if either X or y is not None. Raises ValueError if filename is None.
      filename: str
          (deprecated, use data instead) specifies the csv file where the training data and target
          values are located. Warning: Raises ValueError if either X or y is not None.
          Raises ValueError if targetName is None
  returns:
      void

  Fits the model to the training data provided. The two possible uses of this function are
  `fit(X,y)` and `fit(data=..., targetName=...)`. Any other combination will raise a ValueError
  """
  # --- deprecated 'filename' argument is folded into 'data'
  if filename is not None:
    print("**pyAgrum** : 'filename' is deprecated since 1.1.1. Please use 'data' instead.")
    if data is None:
      data = filename

  # --- argument validation: exactly one of (X,y) or (data,targetName) must be given
  if data is None:
    if targetName is not None:
      raise ValueError(
        "This function should be used either as fit(X,y) or fit(data=...,targetAttribute=...). You have set "
        "data to None, but have entered a targetName")
    if X is None or y is None:
      raise ValueError(
        "This function should be used either as fit(X,y) or fit(data=...,targetAttribute=...). You have not "
        "entered a data source (filename or pandas.DataFrame) and not specified the X and y matrices that should be used")
  else:
    if targetName is None:
      raise ValueError(
        "This function should be used either as fit(X,y) or fit(data=...,targetAttribute=...). The name of the "
        "target must be specified if using this function with data containing a csv filename or a pandas.DataFrame.")
    if X is not None or y is not None:
      raise ValueError(
        "This function should be used either as fit(X,y) or fit(data=...,targetAttribute=...). You can not give "
        "a data and the X and y matrices at the same time.")
    # extract X/y from the data source (csv path or DataFrame)
    if type(data) == str:
      X, y = self.XYfromCSV(data, True, targetName)
    else:  # pandas.DataFrame
      y = data[targetName]
      X = data.drop(targetName, axis=1)

  self.fromModel = False
  variableNames = None
  self.discretizer.clear()

  # --- derive the target's name from whatever form y takes
  if isinstance(y, pandas.DataFrame):  # type(y) == pandas.DataFrame:
    self.target = y.columns.tolist()[0]
    if checkInt(self.target):
      # a purely numeric column name is replaced by a symbolic one
      self.target = "Y"
  elif type(y) == pandas.core.series.Series:
    self.target = y.name
  else:
    self.target = 'y'

  if isinstance(X, pandas.DataFrame):  # type(X) == pandas.DataFrame:
    # numeric column names get an "X" prefix so they are valid variable names
    variableNames = [f"X{x}" if checkInt(x) else x for x in X.columns]

  # verifies the shape of the two arrays
  X, y = sklearn.utils.check_X_y(X, y, dtype=None, accept_sparse=True)

  d = X.shape[1]
  if variableNames is None:
    variableNames = ["x" + str(i) for i in range(d)]

  # map each variable name to its column index in X
  self.variableNameIndexDictionary = dict()
  for i in range(d):
    self.variableNameIndexDictionary[variableNames[i]] = i

  self.targetType = y.dtype
  possibleValuesY = numpy.unique(y)
  if len(possibleValuesY) == 1:
    raise ValueError("There is only 1 possible values for Y in the data provided")
  if len(possibleValuesY) > 10:
    warnings.warn(
      f"A classifier with too many possible values for Y (here : {possibleValuesY}) in the data provided is not meaningfull ("
      "please use regression methods instead).")
  self.isBinaryClassifier = (len(possibleValuesY) == 2)

  self.bn = gum.BayesNet('Template')

  # placeholder variable; unconditionally replaced by one of the branches below
  var = gum.LabelizedVariable(self.target, self.target, 0)
  # scan the target values: all-integer targets get a Range/Integer variable
  is_int_varY = True
  min_vY = max_vY = None
  for value in possibleValuesY:
    if not checkInt(value):
      is_int_varY = False
      break
    else:
      v = int(value)
      if min_vY is None or min_vY > v:
        min_vY = v
      if max_vY is None or max_vY < v:
        max_vY = v
  if is_int_varY:
    if len(possibleValuesY) == max_vY - min_vY + 1:  # no hole in the list of int
      var = gum.RangeVariable(self.target, self.target, min_vY, max_vY)
    else:
      var = gum.IntegerVariable(self.target, self.target, [int(v) for v in possibleValuesY])
  else:
    var = gum.LabelizedVariable(self.target, self.target, [str(v) for v in possibleValuesY])
  self.bn.add(var)

  # one (possibly discretized) variable per feature column
  for i in range(d):
    var = self.discretizer.createVariable(variableNames[i], X[:, i], y, possibleValuesY)
    self.bn.add(var)

  # dump the training data to a temporary csv for BNLearner
  csvfile = tempfile.NamedTemporaryFile(delete=False)
  tmpfilename = csvfile.name
  csvfilename = tmpfilename + ".csv"
  csvfile.close()
  CSV(X, y, self.target, self.variableNameIndexDictionary, csvfilename)

  self.learner = gum.BNLearner(csvfilename, self.bn)
  IPrior(self.aPriori, self.learner, self.aPrioriWeight, self.DirichletCsv)

  # --- structure/parameter learning, dispatched on the configured method
  if self.learningMethod == 'NaiveBayes':
    self.bn = BN_fitNaiveBayes(X, y, self.bn, self.learner, variableNames, self.target, self.constraints)
  elif self.learningMethod == 'TAN':
    self.bn = BN_fitTAN(X, y, self.bn, self.learner, variableNames, self.target)
  elif self.learningMethod == 'Chow-Liu':
    self.bn = BN_fitChowLiu(X, y, self.bn, self.learner, variableNames, self.target)
  else:
    self.bn = BN_fitStandard(X, y, self.learner, self.learningMethod, self.possibleSkeleton, self.scoringType,
                             self.constraints)

  # label at index 1 is taken as the "positive" label
  self.label = self.bn.variableFromName(self.target).labels()[1]
  self.MarkovBlanket = compileMarkovBlanket(self.bn, self.target)

  if self.isBinaryClassifier:
    self.threshold = CThreshold(self.MarkovBlanket, self.target, csvfilename, self.usePR, self.significant_digit)

  # clean up both temporary files (the NamedTemporaryFile and the derived .csv)
  os.remove(csvfilename)
  os.remove(tmpfilename)
def createVariable(self, variableName, X, y=None, possibleValuesY=None):
  """
  parameters:
      variableName:
          the name of the created variable
      X: ndarray shape(n,1)
          A column vector containing n samples of a feature. The column for which the variable will be created
      y: ndarray shape(n,1)
          A column vector containing the corresponding for each element in X.
      possibleValuesY: onedimensional ndarray
          An ndarray containing all the unique values of y
  returns:
      var: pyagrum.DiscreteVariable
          the created variable

  Creates a variable for the column passed in as a parameter: either a plain
  discrete variable (Range/Integer/Labelized) when no discretization applies,
  or a DiscretizedVariable whose bin edges come from the configured method.
  """
  if y is not None:
    X, y = sklearn.utils.check_X_y(X, y, dtype=None, accept_sparse=True, ensure_2d=False)
  X = sklearn.utils.check_array(X, dtype=None, ensure_2d=False)

  # try a float view of the column; failure means the column is non-numeric
  try:
    Xtransformed = sklearn.utils.check_array(X, dtype='float', ensure_2d=False)
    isNumeric = True
  except ValueError:
    Xtransformed = X
    isNumeric = False

  possibleValuesX = numpy.unique(X)
  n = len(X)

  if variableName not in self.discretizationParametersDictionary.keys():
    # The user has not manually set the discretization parameters for this variable
    # discretizationThreshold >= 1 is an absolute count of distinct values;
    # < 1 is a fraction of the sample size
    if isNumeric and \
        ((self.discretizationThreshold >= 1 and len(possibleValuesX) > self.discretizationThreshold)
         or (self.discretizationThreshold < 1 and len(possibleValuesX) / len(X) > self.discretizationThreshold)):
      self.discretizationParametersDictionary[variableName] = dict()
      self.discretizationParametersDictionary[variableName]['methode'] = self.defaultMethod
      self.discretizationParametersDictionary[variableName]['k'] = self.defaultNbBins
    else:
      self.discretizationParametersDictionary[variableName] = dict()
      self.discretizationParametersDictionary[variableName]['methode'] = "NoDiscretization"
    usingDefaultParameters = True
  else:
    usingDefaultParameters = False
    if self.discretizationParametersDictionary[variableName]['methode'] != "NoDiscretization" and not isNumeric:
      raise ValueError("The variable " + variableName + " is not numeric and cannot be discretized!")

  if self.discretizationParametersDictionary[variableName]["methode"] == "NoDiscretization":
    # no discretization: build a Range/Integer variable for all-int columns,
    # otherwise a LabelizedVariable over the distinct values
    is_int_var = True
    min_v = max_v = None
    for value in possibleValuesX:
      if not checkInt(value):
        is_int_var = False
        break
      else:
        v = int(value)
        if min_v is None or min_v > v:
          min_v = v
        if max_v is None or max_v < v:
          max_v = v
    if is_int_var:
      if len(possibleValuesX) == max_v - min_v + 1:  # no hole in the list of int
        var = gum.RangeVariable(variableName, variableName, min_v, max_v)
      else:
        var = gum.IntegerVariable(variableName, variableName, [int(v) for v in possibleValuesX])
    else:
      var = gum.LabelizedVariable(variableName, variableName, [str(v) for v in possibleValuesX])
  else:
    self.numberOfContinous += 1
    # compute binEdges with the configured method (CAIM/MDLP need y: supervised)
    if self.discretizationParametersDictionary[variableName]['methode'] == "CAIM":
      if y is None:
        raise ValueError(
          "The CAIM discretization method requires a list of the associated classes for each data vector since it "
          "is a supervised discretization method. You should pass it as y.")
      if possibleValuesY is None:
        possibleValuesY = numpy.unique(y)
      binEdges = self.discretizationCAIM(Xtransformed.reshape(n, 1), y.reshape(n, 1),
                                         numpy.unique(Xtransformed), possibleValuesY)
    elif self.discretizationParametersDictionary[variableName]['methode'] == "MDLP":
      if y is None:
        raise ValueError(
          "The MDLP discretization method requires a list of the associated classes for each data vector since it "
          "is a supervised discretization method. You should pass it as y.")
      if possibleValuesY is None:
        possibleValuesY = numpy.unique(y)
      binEdges = self.discretizationMDLP(Xtransformed.reshape(n, 1), y.reshape(n, 1),
                                         numpy.unique(Xtransformed), possibleValuesY)
    elif self.discretizationParametersDictionary[variableName]['methode'] == "NML":
      binEdges = self.discretizationNML(Xtransformed.flatten(), numpy.unique(Xtransformed),
                                        kMax=self.discretizationParametersDictionary[variableName]["k"])
    else:
      if self.discretizationParametersDictionary[variableName]['k'] == 'elbowMethod':
        binEdges = self.discretizationElbowMethodRotation(
          self.discretizationParametersDictionary[variableName]['methode'], Xtransformed.flatten())
      else:
        # fall back to scikit-learn's KBinsDiscretizer with the configured strategy
        discre = skp.KBinsDiscretizer(self.discretizationParametersDictionary[variableName]['k'],
                                      strategy=self.discretizationParametersDictionary[variableName]['methode'])
        discre.fit(X.reshape(-1, 1))
        binEdges = discre.bin_edges_[0].tolist()

    # two edges means a single bin: useless as a discretization
    if len(binEdges) == 2:
      raise ValueError("Due to an error the discretization method " + str(
        self.discretizationParametersDictionary[variableName]['methode']) + " using " + str(
        self.discretizationParametersDictionary[variableName]['k']) + " bins for the variable " + str(
        variableName) + "gave only 1 bin. Try increasing the number of bins used by this variable using "
                        "setDiscetizationParameters to avoid this error")

    # we replace infinity as min and max by the new empirical flag.
    # binEdges[0] = -math.inf
    # binEdges[-1] = math.inf

    self.totalNumberOfBins += len(binEdges) - 1
    var = gum.DiscretizedVariable(variableName, variableName, binEdges)
    var.setEmpirical(True)

  if usingDefaultParameters:
    # auto-chosen parameters are transient: drop them so the next call re-decides
    self.discretizationParametersDictionary.pop(variableName)
  return var
def setUp(self):
  # Fixture: three small Bayesian networks used by the test methods.
  # bn: four binary nodes c, s, r, w with arcs c->s, c->r, s->w, r->w
  # (appears to be the classic cloudy/sprinkler/rain/wet-grass example — TODO confirm)
  self.bn = gum.BayesNet()
  self.c, self.r = \
    [self.bn.add(gum.LabelizedVariable(name, name, 2))
     for name in 'c r'.split()]
  # s and w use explicit 'no'/'yes' labels instead of the default "0"/"1"
  self.s, self.w = \
    [self.bn.add(gum.LabelizedVariable(name, name, 0).addLabel('no')
                 .addLabel('yes'))
     for name in 's w'.split()]
  for link in [(self.c, self.s), (self.c, self.r),
               (self.s, self.w), (self.r, self.w)]:
    self.bn.addArc(*link)
  self.bn.cpt(self.c)[:] = [0.5, 0.5]
  self.bn.cpt(self.s)[:] = [[0.5, 0.5], [0.9, 0.1]]
  self.bn.cpt(self.r)[:] = [[0.8, 0.2], [0.2, 0.8]]
  # P(w | s, r): one row per parent configuration
  self.bn.cpt(self.w)[0, 0, :] = [1, 0]
  self.bn.cpt(self.w)[0, 1, :] = [0.1, 0.9]
  self.bn.cpt(self.w)[1, 0, :] = [0.1, 0.9]
  self.bn.cpt(self.w)[1, 1, :] = [0.01, 0.99]

  # bni: same structure and CPTs, but ri is a RangeVariable and wi a
  # DiscretizedVariable, mixing variable kinds in one network
  self.bni = gum.BayesNet()
  self.ci, self.si = \
    [self.bni.add(gum.LabelizedVariable(name, name, 2))
     for name in 'ci si'.split()]
  self.ri = self.bni.add(gum.RangeVariable('ri', '', 5, 6))
  vwi = gum.DiscretizedVariable('wi', '')
  vwi.addTick(0.2).addTick(0.4).addTick(0.6)
  self.wi = self.bni.add(vwi)
  for link in [(self.ci, self.si), (self.ci, self.ri),
               (self.si, self.wi), (self.ri, self.wi)]:
    self.bni.addArc(*link)
  self.bni.cpt(self.ci)[:] = [0.5, 0.5]
  self.bni.cpt(self.si)[:] = [[0.5, 0.5], [0.9, 0.1]]
  self.bni.cpt(self.ri)[:] = [[0.8, 0.2], [0.2, 0.8]]
  self.bni.cpt(self.wi)[0, 0, :] = [1, 0]
  self.bni.cpt(self.wi)[0, 1, :] = [0.1, 0.9]
  self.bni.cpt(self.wi)[1, 0, :] = [0.1, 0.9]
  self.bni.cpt(self.wi)[1, 1, :] = [0.01, 0.99]

  # bn2: a smaller three-node network r2 -> s2 -> w2 with r2 -> w2
  self.bn2 = gum.BayesNet()
  self.s2, self.r2, self.w2 = \
    [self.bn2.add(gum.LabelizedVariable(name, name, 2))
     for name in 's2 r2 w2'.split()]
  for link in [(self.r2, self.s2), (self.s2, self.w2),
               (self.r2, self.w2)]:
    self.bn2.addArc(*link)
  self.bn2.cpt(self.s2)[:] = [[0.6, 0.4], [0.99, 0.01]]
  self.bn2.cpt(self.r2)[:] = [0.8, 0.2]
  self.bn2.cpt(self.w2)[0, 0, :] = [1, 0]
  self.bn2.cpt(self.w2)[0, 1, :] = [0.1, 0.9]
  self.bn2.cpt(self.w2)[1, 0, :] = [0.2, 0.8]
  self.bn2.cpt(self.w2)[1, 1, :] = [0.01, 0.99]