def test_predict(self):
    titanic = BayesianModel()
    titanic.add_edges_from([("Sex", "Survived"), ("Pclass", "Survived")])
    titanic.fit(self.titanic_data2[500:])

    p1 = titanic.predict(self.titanic_data2[["Sex", "Pclass"]][:30])
    p2 = titanic.predict(self.titanic_data2[["Survived", "Pclass"]][:30])
    p3 = titanic.predict(self.titanic_data2[["Survived", "Sex"]][:30])

    p1_res = np.array(['0', '1', '0', '1', '0', '0', '0', '0', '0', '1',
                       '0', '1', '0', '0', '0', '1', '0', '0', '0', '0',
                       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0'])
    p2_res = np.array(['male', 'female', 'female', 'female', 'male', 'male',
                       'male', 'male', 'female', 'female', 'female', 'female',
                       'male', 'male', 'male', 'female', 'male', 'female',
                       'male', 'female', 'male', 'female', 'female', 'female',
                       'male', 'female', 'male', 'male', 'female', 'male'])
    p3_res = np.array(['3', '1', '1', '1', '3', '3', '3', '3', '1', '1',
                       '1', '1', '3', '3', '3', '1', '3', '1', '3', '1',
                       '3', '1', '1', '1', '3', '1', '3', '3', '1', '3'])

    np_test.assert_array_equal(p1.values.ravel(), p1_res)
    np_test.assert_array_equal(p2.values.ravel(), p2_res)
    np_test.assert_array_equal(p3.values.ravel(), p3_res)
def bayesnet_examples():
    from pgmpy.factors import TabularCPD
    from pgmpy.models import BayesianModel
    import pandas as pd

    student_model = BayesianModel([('D', 'G'),
                                   ('I', 'G'),
                                   ('G', 'L'),
                                   ('I', 'S')])
    # We can generate some random data.
    raw_data = np.random.randint(low=0, high=2, size=(1000, 5))
    data = pd.DataFrame(raw_data, columns=['D', 'I', 'G', 'L', 'S'])
    data_train = data[: int(data.shape[0] * 0.75)]

    student_model.fit(data_train)
    student_model.get_cpds()

    # Copy the test slice so the inplace drop does not modify a view of `data`.
    data_test = data[int(0.75 * data.shape[0]):].copy()
    data_test.drop('D', axis=1, inplace=True)
    student_model.predict(data_test)

    grade_cpd = TabularCPD(
        variable='G',
        variable_card=3,
        values=[[0.3, 0.05, 0.9, 0.5],
                [0.4, 0.25, 0.08, 0.3],
                [0.3, 0.7, 0.02, 0.2]],
        evidence=['I', 'D'],
        evidence_card=[2, 2])
    difficulty_cpd = TabularCPD(variable='D', variable_card=2,
                                values=[[0.6, 0.4]])
    intel_cpd = TabularCPD(variable='I', variable_card=2,
                           values=[[0.7, 0.3]])
    letter_cpd = TabularCPD(
        variable='L',
        variable_card=2,
        values=[[0.1, 0.4, 0.99],
                [0.9, 0.6, 0.01]],
        evidence=['G'],
        evidence_card=[3])
    sat_cpd = TabularCPD(
        variable='S',
        variable_card=2,
        values=[[0.95, 0.2],
                [0.05, 0.8]],
        evidence=['I'],
        evidence_card=[2])
    student_model.add_cpds(grade_cpd, difficulty_cpd, intel_cpd,
                           letter_cpd, sat_cpd)
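# A hedged follow-up to the example above: once the CPDs are attached, the
# model can answer conditional queries. VariableElimination is pgmpy's exact
# inference engine; with the pgmpy version used in these snippets, query()
# returns a dict of factors keyed by variable name (as in the other snippets
# in this file). Assumes `student_model` from bayesnet_examples is in scope.
from pgmpy.inference import VariableElimination

infer = VariableElimination(student_model)
# P(G | D=0, I=1): grade distribution for an easy course and a smart student.
print(infer.query(variables=['G'], evidence={'D': 0, 'I': 1})['G'])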
def kFold_cross_validation_bayesian(X, y, splits=10):
    """
    Cross-validation for the Bayesian network.
    :param X: X dataframe - known values
    :param y: y column(s) - values to predict
    :param splits: number of folds to use
    :return: mean accuracy
    """
    folds = KFold_splitting(X, y, splits)
    scores = []
    for fold in folds:
        model = BayesianModel([('fat_value', 'saturated-fat_value'),
                               ('carbohydrates_value', 'sugars_value'),
                               ('proteins_value', 'salt_value'),
                               ('fat_value', 'energy_value'),
                               ('carbohydrates_value', 'energy_value'),
                               ('salt_value', 'nutri_value'),
                               ('energy_value', 'nutri_value'),
                               ('saturated-fat_value', 'nutri_value'),
                               ('sugars_value', 'nutri_value')])
        predict_data = fold[1].copy()
        real_data = fold[3].copy()
        # Fit on the training part of the fold only; fitting on all of X
        # (as the original did) would leak the test rows into the estimator.
        train_data = fold[0].copy()
        train_data['nutri_value'] = fold[2]
        model.fit(train_data, estimator=BayesianEstimator, prior_type="BDeu")
        y_pred = model.predict(predict_data)
        scores.append(accuracy_score(y_pred, real_data))
    avg_scores = statistics.mean(scores)
    std_scores = statistics.stdev(scores)
    print('Accuracy: %.3f (Standard Dev: %.3f)' % (avg_scores, std_scores))
    return avg_scores
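# The helper KFold_splitting is not shown in this snippet. A minimal sketch of
# what it is assumed to do, using scikit-learn's KFold: split X/y into
# `splits` folds and return 4-tuples in the order
# (X_train, X_test, y_train, y_test), matching how `fold` is indexed above.
from sklearn.model_selection import KFold

def KFold_splitting(X, y, splits=10):
    folds = []
    for train_idx, test_idx in KFold(n_splits=splits).split(X):
        folds.append((X.iloc[train_idx], X.iloc[test_idx],
                      y.iloc[train_idx], y.iloc[test_idx]))
    return folds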
def bayeSian():
    # NOTE: several slice indices and column lists were elided in the source
    # ('[]'); they are left as-is rather than guessed.
    fileName = 'file path'  # translated placeholder; the source had a non-specific path
    dataMat, dataLab = file2matrix(fileName, 9)
    count = 0
    # testMat = dataMat[]
    testLab = np.array(dataLab[])
    # testFra = pd.DataFrame(testMat, columns=[columns_name])
    trainFraK = pd.DataFrame(dataMat, columns=[columns_name])
    trainInput = trainFraK[[]]
    trainArr = np.zeros((dataMat.shape[0], 3), dtype='int64')
    for arr in trainInput.values:
        trainArr[count, :] = list(map(int, arr))  # list() needed on Python 3
        count += 1
    trainInput = pd.DataFrame(trainArr, columns=[])
    test = trainInput[]
    test = test.copy()
    test.drop('T_TYPE', axis=1, inplace=True)
    model = BayesianModel([(), ()])
    model.fit(trainInput.ix[])
    labelPre = model.predict(test)
    count = 0
    print(labelPre.values.shape)
    for i in range(len(labelPre)):
        if labelPre.values[i] == testLab[i]:
            count += 1
    print('accuracy:', float(count) / len(testLab))
class BN:
    def __init__(self, DAG):
        self.data = []
        self.model = BayesianModel(DAG)

    def take_only_relevant_features(self, DAG, db_file):
        all_data = pd.read_csv(db_file)
        data = pd.DataFrame()
        relevant_features = ()
        for tuple_of_two in DAG:
            relevant_features = relevant_features + tuple_of_two
        for column in all_data:
            if column in relevant_features:
                data[column] = all_data[column]
        return data

    def BNLearning(self, DAG, db_file):
        self.data = self.take_only_relevant_features(DAG, db_file)
        self.model = BayesianModel(DAG)
        self.model.fit(self.data, BayesianEstimator)

    def BNTesting(self, results_file):
        # separate data for test
        training_part = int(0.8 * len(self.data))
        testing_data = self.data[training_part:]
        # predict
        predict_data = testing_data.copy()
        predict_data.drop('song_popularity', axis=1, inplace=True)
        y_pred = self.model.predict(predict_data)
        with open(results_file, 'w', newline='') as file:
            y_pred.to_csv(file)

    def BNForOneSong(self, DAG, db_file, results_file, songFile):
        data = self.take_only_relevant_features(DAG, db_file)
        dataToPredictRF = self.take_only_relevant_features(DAG, songFile)
        dataToPredict = pd.read_csv(songFile)
        model = BayesianModel(DAG)
        model.fit(data, BayesianEstimator)
        dataToPredictRF = dataToPredictRF.copy()
        y_pred = model.predict(dataToPredictRF)
        # print(y_pred)
        with open(results_file, 'w', newline='') as file:
            y_pred.to_csv(file)
        return y_pred['song_popularity'][0]
def bayeSian(k):
    # NOTE: several slice indices and column lists were elided in the source
    # ('[]'); they are left as-is rather than guessed.
    fileName = 'file name'  # translated placeholder
    dataMat, dataLab = file2matrix(fileName, 9)
    durAct = dataMat[]
    testMat = dataMat[]
    count = 0
    # testMat = dataMat[]
    testLab = np.array(dataLab[])
    trainFraK = pd.DataFrame(dataMat, columns=[columns_name])
    trainFra = trainFraK.ix[]
    # data_cla0 = trainFraK[trainFraK['T_TYPE'] == 0].values
    # data_cla1 = trainFraK[trainFraK['T_TYPE'] == 1].values
    trainInput = trainFraK[[columns_name]]
    trainArr = np.zeros((dataMat.shape[0], 4), dtype='int64')
    for arr in trainInput.values:
        trainArr[count, :] = list(map(int, arr))  # list() needed on Python 3
        count += 1
    trainInput = pd.DataFrame(trainArr, columns=[columns_name])
    test = trainInput[]
    test = test.copy()
    test.drop('T_TYPE', axis=1, inplace=True)
    model = BayesianModel([('columns_name', 'columns_name'),
                           ('columns_name', 'columns_name'),
                           ('columns_name', 'columns_name')])
    model.fit(trainInput.ix[])
    labelPre = model.predict(test)
    durPre = []
    coef = 0.0
    for i in range(len(testMat)):
        dataSet = trainFra[trainFra['T_TYPE'] == labelPre['T_TYPE'][]].values
        distPos = np.zeros(dataSet.shape[0])
        distTim = np.zeros(dataSet.shape[0])
        for j in range(dataSet.shape[0]):
            distPos[j] = distSLC(testMat[i], dataSet[j])
            distTim[j] = disTim(testMat[i], dataSet[j])
        distPosNor = distPos  # dataNorm(distPos)
        distTimNor = dataNorm(distTim)
        distAll = distPosNor * coef + distTimNor * (1 - coef)
        knnIndex = distAll.argsort()
        durKnn = dataSet[knnIndex, 7][:k]
        durPre.append(sum(durKnn) / len(durKnn))
    mse = calMse(durPre, durAct)
    mape = calMape(durPre, durAct)
    count = 0
    # print(labelPre.values.tolist())
    for i in range(len(labelPre)):
        if labelPre.values[i] == testLab[i]:
            count += 1
    print('K: ', k)
    print('accuracy: ', float(count) / len(testLab))
    print('MSE: ', mse)
    print('MAPE: ', mape)
    print('----------------------------------------------------------------------')
class TestBayesianModelFitPredict(unittest.TestCase):
    def setUp(self):
        self.model_disconnected = BayesianModel()
        self.model_disconnected.add_nodes_from(['A', 'B', 'C', 'D', 'E'])
        self.model_connected = BayesianModel([('A', 'B'), ('C', 'B'),
                                              ('C', 'D'), ('B', 'E')])

    def test_disconnected_fit(self):
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        self.model_disconnected.fit(values)

        for node in ['A', 'B', 'C', 'D', 'E']:
            cpd = self.model_disconnected.get_cpds(node)
            self.assertEqual(cpd.variable, node)
            np_test.assert_array_equal(cpd.cardinality, np.array([2]))
            # .loc replaces the long-removed .ix accessor
            value = (values.loc[:, node].value_counts() /
                     values.loc[:, node].value_counts().sum())
            value = value.reindex(sorted(value.index)).values
            np_test.assert_array_equal(cpd.values, value)

    def test_connected_predict(self):
        np.random.seed(42)
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        fit_data = values[:800]
        predict_data = values[800:].copy()
        self.model_connected.fit(fit_data)
        self.assertRaises(ValueError, self.model_connected.predict,
                          predict_data)
        predict_data.drop('E', axis=1, inplace=True)
        e_predict = self.model_connected.predict(predict_data)
        np_test.assert_array_equal(
            e_predict.values.ravel(),
            np.array([1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1,
                      0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
                      0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
                      0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
                      1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,
                      1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
                      1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
                      1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
                      1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0]))

    def tearDown(self):
        del self.model_connected
        del self.model_disconnected
def kNN(k):
    # NOTE: the file name, slice indices and column lists were elided in the
    # source ('[]'); they are left as-is rather than guessed.
    fileName = ''
    dataMat, dataLab = file2matrix(fileName, 9)
    trainMat = dataMat[]
    trainLab = np.array(dataLab[])
    testMat = dataMat[]
    testLab = np.array(dataLab[])
    coef = 1
    distPos = np.zeros((testMat.shape[0], trainMat.shape[0]))
    distTim = np.zeros((testMat.shape[0], trainMat.shape[0]))
    for i in range(testMat.shape[0]):
        for j in range(trainMat.shape[0]):
            distPos[i, j] = distSLC(testMat[i], trainMat[j])
            distTim[i, j] = disTim(testMat[i], trainMat[j])
    distPosNor = dataNorm(distPos)
    distTimNor = dataNorm(distTim)
    distAll = distPosNor * coef + distTimNor * (1 - coef)
    distIndex = distAll.argsort()
    testI = np.zeros((testMat.shape[0], 4), dtype='int32')
    count = 0
    for i in testMat[:, 2:6]:
        testI[count, :] = list(map(int, i))  # list() needed on Python 3
        count += 1
    testInput = pd.DataFrame(testI, columns=[])
    trainMatK = trainMat[distIndex[:, 0:k]]
    labelPre = []
    for i in range(len(trainMatK)):
        num = 0
        trainI = np.zeros((trainMatK[0].shape[0], 5), dtype='int32')
        for j in trainMatK[i][:, [2, 3, 4, 5, 8]]:
            trainI[num, :] = list(map(int, j))
            num += 1
        trainFraK = pd.DataFrame(trainI, columns=[])
        trainInput = trainFraK[[]]
        model = BayesianModel([(), (), (), ()])
        model.fit(trainInput)
        a = pd.DataFrame([testInput.ix[i].values.tolist()], columns=[])
        labelPre.append(model.predict(a).values[0][0])
    # for i in range(len(testLakK)):
    #     labels = testLakK[i]
    #     labelPre.append(getLabel(labels))
    count = 0
    # print(labelPre)
    for i in range(len(labelPre)):
        if labelPre[i] == testLab[i]:
            count += 1
    print('accuracy:', float(count) / len(testLab))
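# dataNorm is another helper that is not shown in these snippets. It is
# assumed to min-max normalise a distance array to [0, 1]; a minimal sketch:
def dataNorm(dist):
    rng = dist.max() - dist.min()
    # Avoid division by zero when all distances are equal.
    return (dist - dist.min()) / rng if rng else dist * 0.0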
def BN(DAG):
    data = take_only_relevant_features(DAG)
    training_data = data[:15068]
    predict_data = data[15068:16952]
    model = BayesianModel(DAG)
    # Fit on the training slice only; the source fitted on all of `data`,
    # which leaks the hold-out rows into the estimator.
    model.fit(training_data, BayesianEstimator)
    predict_data = predict_data.copy()
    predict_data.drop('song_popularity', axis=1, inplace=True)
    y_pred = model.predict(predict_data)
    print(y_pred)
    with open('predicted_results.csv', 'w', newline='') as file:
        y_pred.to_csv(file)
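# A hedged scoring sketch for BN() above: assuming it is changed to
# `return y_pred`, and using scikit-learn's accuracy_score, the hold-out slice
# (rows 15068:16952, as in BN) can be scored against the predictions.
from sklearn.metrics import accuracy_score

def BN_accuracy(DAG):
    y_true = take_only_relevant_features(DAG)[15068:16952]['song_popularity']
    y_pred = BN(DAG)  # assumes BN() is modified to return its predictions
    return accuracy_score(y_true, y_pred['song_popularity'])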
        'RTH': (1000, 200),
        'FP': (2000, 200)}  # tail of a node-position dict; its head is not part of this snippet

node_sizes = [1000 + 10 * i for i in range(len(graph))]
M = graph.number_of_edges()
edge_alphas = [(5 + i) / (M + 4) for i in range(M)]
nodes = nx.draw_networkx_nodes(graph, pos, nodelist=nodelist,
                               node_size=node_sizes, node_color="red",
                               label='true')
edges = nx.draw_networkx_edges(graph, pos, node_size=node_sizes,
                               arrowstyle='->', arrowsize=15,
                               edge_color="blue", edge_cmap=plt.cm.Blues,
                               width=2)
labels = nx.draw_networkx_labels(graph, pos)
ax = plt.gca()
ax.set_axis_off()
plt.show()

"""
The data of 222 exploration wells are predicted by TAN, and the results
are compared with the actual results.
"""
# actual_value holds the observed outcomes of the exploration wells.
actual_value = train_data["label"].values.tolist()
# TAN predicts the outcomes of the exploration wells.
# Copy before dropping so train_data itself is not mutated by the inplace drop.
predict_data = train_data.copy()
predict_data.drop('label', axis=1, inplace=True)
Tan_pred_value = Tan.predict(predict_data)
score = util.get_predict(predict_data=Tan_pred_value, label=actual_value)
for i in score:
    print("class label:", i)
    print("[accuracy rate, number predicted correctly, total number]")
    print(score[i])
data = np.random.randint(low=0, high=2, size=(1000, 3))
# print(data)
data = pd.DataFrame(data, columns=['cost', 'location', 'no_of_people'])
# ['cost', 'quality', 'location', 'no_of_people'])
# print(data.loc[:, 'no_of_people'])
train = data[:750]
y_true = data[750:]['no_of_people'].values
# print("y_true: ", y_true)
test1 = data[750:]
test = data[750:].drop('no_of_people', axis=1)

# estimator, _ = create_BN_model(train)
# pgm_test(estimator, test_set=test, target_column_name='no_of_people')

restaurant_model = BayesianModel([('location', 'cost'),
                                  # ('quality', 'cost'),
                                  ('location', 'no_of_people'),
                                  ('cost', 'no_of_people')])

for est in [BayesianEstimator]:  # MaximumLikelihoodEstimator
    restaurant_model.fit(train, estimator=est)
    # restaurant_model.get_cpds()
    a1 = restaurant_model.predict(test).values.ravel()
    a2 = pgm_test(restaurant_model, test_set=test1,
                  target_column_name='no_of_people')
    print(est, '\n', a1, '\n', a2)
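# pgm_test is not defined in this snippet. A minimal sketch of what it is
# assumed to do, given how it is called above: drop the target column from the
# test set, predict it with the fitted model, and return the accuracy.
from sklearn.metrics import accuracy_score

def pgm_test(model, test_set, target_column_name):
    y_true = test_set[target_column_name].values
    X = test_set.drop(target_column_name, axis=1)
    y_pred = model.predict(X).values.ravel()
    return accuracy_score(y_true, y_pred)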
print('Query: Female, black')
q = model_infer.query(variables=['class'], evidence={'sex': 0, 'race': 0})
print(q['class'])

print('Query: Male, white')
q = model_infer.query(variables=['class'], evidence={'sex': 1, 'race': 1})
print(q['class'])

#################################################################################
##### Evaluating the model by predicting
#################################################################################

# Use this line to reduce the dataset during testing only:
# data_test.drop(data_test.index[10:], inplace=True)

# Here goes the real stuff.
y_true = data_test['class'].copy()
data_test.drop('class', axis=1, inplace=True)
y_pred = model.predict(data_test)
accuracy = accuracy_score(y_true, y_pred)

print("\n\n\n\n\n\nAccuracy = ", accuracy)
print("\n\nSince our data is skewed, we should take a deeper look into the results:")
print(classification_report(y_true, y_pred))
print("\nEnd of code \n...o0o.... F**k you Julien ...o0o...")

print("\nRuntime: ")
end = time.time()
print(round(end - start), "seconds")
("Age", "Survived"), ("Sex", "Survived"), ("Fare", "Pclass"), ("Pclass", "Survived"), ("Cabin", "Survived"), ] ) model.fit( train, estimator=BayesianEstimator, prior_type="BDeu" ) # default equivalent_sample_size=5 # for cpd in model.get_cpds(): # print(cpd) predict_data = test.drop(columns=["Survived"], axis=1) y_pred = model.predict(predict_data) (y_pred["Survived"] == test["Survived"]).sum() / len(test) # 测试集精度 model_infer = VariableElimination(model) q = model_infer.query(variables=["Survived"], evidence={"Fare": 0}) print(q["Survived"]) q = model_infer.map_query( variables=["Fare", "Age", "Sex", "Pclass", "Cabin"], evidence={"Survived": 1} ) print(q) # # 用结构学习建立模型
class BayesianNetwork:
    """
    Base class for Bayesian Network (BN), a probabilistic weighted DAG where nodes
    represent variables, edges represent the causal relationships between variables.

    ``BayesianNetwork`` stores nodes with their possible states, edges and
    conditional probability distributions (CPDs) of each node.

    ``BayesianNetwork`` is built on top of the ``StructureModel``, which is an
    extension of ``networkx.DiGraph`` (see
    :func:`causalnex.structure.structuremodel.StructureModel`).

    In order to define the ``BayesianNetwork``, users should provide a relevant
    ``StructureModel``. Once ``BayesianNetwork`` is initialised, no changes to the
    ``StructureModel`` can be made and CPDs can be learned from the data.

    The learned CPDs can be then used for likelihood estimation and predictions.

    Example:
    ::
        >>> # Create a Bayesian Network with a manually defined DAG.
        >>> from causalnex.structure import StructureModel
        >>> from causalnex.network import BayesianNetwork
        >>>
        >>> sm = StructureModel()
        >>> sm.add_edges_from([
        >>>     ('rush_hour', 'traffic'),
        >>>     ('weather', 'traffic')
        >>> ])
        >>> bn = BayesianNetwork(sm)
        >>> # A created ``BayesianNetwork`` stores nodes and edges defined by the ``StructureModel``
        >>> bn.nodes
        ['rush_hour', 'traffic', 'weather']
        >>>
        >>> bn.edges
        [('rush_hour', 'traffic'), ('weather', 'traffic')]
        >>> # A ``BayesianNetwork`` doesn't store any CPDs yet
        >>> bn.cpds
        {}
        >>>
        >>> # Learn the nodes' states from the data
        >>> import pandas as pd
        >>> data = pd.DataFrame({
        >>>     'rush_hour': [True, False, False, False, True, False, True],
        >>>     'weather': ['Terrible', 'Good', 'Bad', 'Good', 'Bad', 'Bad', 'Good'],
        >>>     'traffic': ['heavy', 'light', 'heavy', 'light', 'heavy', 'heavy', 'heavy']
        >>> })
        >>> bn = bn.fit_node_states(data)
        >>> bn.node_states
        {'rush_hour': {False, True}, 'weather': {'Bad', 'Good', 'Terrible'}, 'traffic': {'heavy', 'light'}}
        >>> # Learn the CPDs from the data
        >>> bn = bn.fit_cpds(data)
        >>> # Use the learned CPDs to make predictions on the unseen data
        >>> test_data = pd.DataFrame({
        >>>     'rush_hour': [False, False, True, True],
        >>>     'weather': ['Good', 'Bad', 'Good', 'Bad']
        >>> })
        >>> bn.predict(test_data, "traffic").to_dict()
        {'traffic_prediction': {0: 'light', 1: 'heavy', 2: 'heavy', 3: 'heavy'}}
        >>> bn.predict_probability(test_data, "traffic").to_dict()
        {'traffic_light': {0: 0.75, 1: 0.25, 2: 0.3333333333333333, 3: 0.3333333333333333},
         'traffic_heavy': {0: 0.25, 1: 0.75, 2: 0.6666666666666666, 3: 0.6666666666666666}}
    """

    def __init__(self, structure: StructureModel):
        """
        Create a ``BayesianNetwork`` with a DAG defined by ``StructureModel``.

        Args:
            structure: a graph representing a causal relationship between variables.
                       In the structure
                           - cycles are not allowed;
                           - multiple (parallel) edges are not allowed;
                           - isolated nodes and multiple components are not allowed.

        Raises:
            ValueError: If the structure is not a connected DAG.
        """
        n_components = nx.number_weakly_connected_components(structure)
        if n_components > 1:
            raise ValueError(
                "The given structure has {n_components} separated graph components. "
                "Please make sure it has only one.".format(n_components=n_components))

        if not nx.is_directed_acyclic_graph(structure):
            cycle = nx.find_cycle(structure)
            raise ValueError(
                "The given structure is not acyclic. Please review the following cycle: {cycle}"
                .format(cycle=cycle))

        # _node_states is a Dict in the form `dict: {node: dict: {state: index}}`.
        # Underlying libraries expect all states to be integers from zero, and
        # thus this dict is used to convert from state -> idx, and then back
        # from idx -> state as required.
        self._node_states = None  # type: Dict[str: Dict[Hashable, int]]
        self._structure = structure

        # _model is a pgmpy Bayesian Model.
        # It is used for:
        # - probability fitting
        # - predictions
        self._model = BayesianModel()
        self._model.add_edges_from(structure.edges)

    @property
    def structure(self) -> StructureModel:
        """
        ``StructureModel`` defining the DAG of the Bayesian Network.

        Returns:
            A ``StructureModel`` of the Bayesian Network.
        """
        return self._structure

    @property
    def nodes(self) -> List[str]:
        """
        List of all nodes contained within the Bayesian Network.

        Returns:
            A list of node names.
        """
        return list(self._model.nodes)

    @property
    def node_states(self) -> Dict[str, Set[Hashable]]:
        """
        Dictionary of all states that each node can take.

        Returns:
            A dictionary of node and its possible states, in format of
            `dict: {node: state}`.
        """
        return {
            node: set(states.keys())
            for node, states in self._node_states.items()
        }

    @node_states.setter
    def node_states(self, nodes: Dict[str, Set[Hashable]]):
        """
        Set the list of nodes that are contained within the Bayesian Network.
        The states of all nodes must be provided.

        Args:
            nodes: A dictionary of node and its possible states, in format of
                   `dict: {node: state}`.

        Raises:
            ValueError: if a node contains a None state.
            KeyError: if a node is missing.
        """
        missing_feature = set(self.nodes).difference(set(nodes.keys()))
        if missing_feature:
            raise KeyError(
                "The data does not cover all the features found in the Bayesian Network. "
                "Please check the following features: {nodes}".format(nodes=missing_feature))

        for node, states in nodes.items():
            if any(pd.isnull(list(states))):
                raise ValueError("node '{node}' contains None state".format(node=node))

        self._node_states = {
            n: {v: k for k, v in enumerate(sorted(nodes[n]))}
            for n in nodes
        }

    @property
    def edges(self) -> List[Tuple[str, str]]:
        """
        List of all edges contained within the Bayesian Network, as a
        Tuple(from_node, to_node).

        Returns:
            A list of all edges.
        """
        return list(self._model.edges)

    @property
    def cpds(self) -> Dict[str, pd.DataFrame]:
        """
        Conditional Probability Distributions of each node within the Bayesian Network.

        The row-index of each dataframe is all possible states for the node.
        The col-index of each dataframe is a MultiIndex that describes all possible
        permutations of parent states.

        For example, for a node :math:`P(A | B, D)`, where

        .. math::
            - A \\in \\text{{"a", "b", "c", "d"}}
            - B \\in \\text{{"x", "y", "z"}}
            - C \\in \\text{{False, True}}

        >>> b         x                   y             z
        >>> d     False     True      False True    False     True
        >>> a
        >>> a  0.265306  0.214286  0.066667  0.25  0.444444  0.000000
        >>> b  0.183673  0.214286  0.200000  0.25  0.222222  0.666667
        >>> c  0.285714  0.285714  0.400000  0.25  0.333333  0.333333
        >>> d  0.265306  0.285714  0.333333  0.25  0.000000  0.000000

        Returns:
            Conditional Probability Distributions of each node within the
            Bayesian Network.
""" cpds = dict() for cpd in self._model.cpds: iterables = [ sorted(self._node_states[var].keys()) for var in cpd.variables[1:] ] cols = [""] if iterables: cols = pd.MultiIndex.from_product(iterables, names=cpd.variables[1:]) cpds[cpd.variable] = pd.DataFrame( cpd.values.reshape(len(self._node_states[cpd.variable]), max(1, len(cols)))) cpds[cpd.variable][cpd.variable] = sorted( self._node_states[cpd.variable].keys()) cpds[cpd.variable].set_index([cpd.variable], inplace=True) cpds[cpd.variable].columns = cols return cpds def fit_node_states(self, df: pd.DataFrame) -> "BayesianNetwork": """ Fit all states of nodes that can appear in the data. The dataframe provided should contain every possible state (values that can be taken) for every column. Args: df: data to fit node states from. Each column indicates a node and each row an observed combination of states. Returns: self Raises: ValueError: if dataframe contains any missing data. """ self.node_states = {c: set(df[c].unique()) for c in df.columns} return self def _state_to_index(self, df: pd.DataFrame, nodes: List[str] = None) -> pd.DataFrame: """ Transforms all values in df to an integer, as defined by the mapping from fit_node_states. Args: df: data to transform nodes: list of nodes to map to index. None means all. Returns: The transformed dataframe. Raises: ValueError: if nodes have not been fit, or if column names do not match node names. """ df.is_copy = False cols = nodes if nodes else df.columns for col in cols: df[col] = df[col].map(self._node_states[col]) df.is_copy = True return df def fit_cpds( self, data: pd.DataFrame, method: str = "MaximumLikelihoodEstimator", bayes_prior: str = None, equivalent_sample_size: int = None, ) -> "BayesianNetwork": """ Learn conditional probability distributions for all nodes in the Bayesian Network, conditioned on their incoming edges (parents). Args: data: dataframe containing one column per node in the Bayesian Network. method: how to fit probabilities. One of: - "MaximumLikelihoodEstimator": fit probabilities using Maximum Likelihood Estimation; - "BayesianEstimator": fit probabilities using Bayesian Parameter Estimation. Use bayes_prior. bayes_prior: how to construct the Bayesian prior used by method="BayesianEstimator". One of: - "K2": shorthand for dirichlet where all pseudo_counts are 1 regardless of variable cardinality; - "BDeu": equivalent of using Dirichlet and using uniform 'pseudo_counts' of `equivalent_sample_size / (node_cardinality * np.prod(parents_cardinalities))` for each node. Use equivelant_sample_size. equivalent_sample_size: used by BDeu bayes_prior to compute pseudo_counts. Returns: self Raises: ValueError: if an invalid method or bayes_prior is specified. 
""" state_names = { k: list(v.values()) for k, v in self._node_states.items() } transformed_data = data.copy(deep=True) # type: pd.DataFrame transformed_data = self._state_to_index(transformed_data[self.nodes]) if method == "MaximumLikelihoodEstimator": self._model.fit( data=transformed_data, estimator=MaximumLikelihoodEstimator, state_names=state_names, ) elif method == "BayesianEstimator": valid_bayes_priors = ["BDeu", "K2"] if bayes_prior not in valid_bayes_priors: raise ValueError( "unrecognised bayes_prior, please use on of %s" % " ".join(valid_bayes_priors)) self._model.fit( data=transformed_data, estimator=BayesianEstimator, prior_type=bayes_prior, equivalent_sample_size=equivalent_sample_size, state_names=state_names, ) else: valid_methods = ["MaximumLikelihoodEstimator", "BayesianEstimator"] raise ValueError("unrecognised method, please use on of %s" % " ".join(valid_methods)) return self def fit_node_states_and_cpds( self, data: pd.DataFrame, method: str = "MaximumLikelihoodEstimator", bayes_prior: str = None, equivalent_sample_size: int = None, ) -> "BayesianNetwork": """ Call `fit_node_states` and then `fit_cpds`. Args: data: dataframe containing one column per node in the Bayesian Network. method: how to fit probabilities. One of: - "MaximumLikelihoodEstimator": fit probabilities using Maximum Likelihood Estimation; - "BayesianEstimator": fit probabilities using Bayesian Parameter Estimation. Use bayes_prior. bayes_prior: how to construct the Bayesian prior used by method="BayesianEstimator". One of: - "K2": shorthand for dirichlet where all pseudo_counts are 1 regardless of variable cardinality; - "BDeu": equivalent of using dirichlet and using uniform 'pseudo_counts' of `equivalent_sample_size / (node_cardinality * np.prod(parents_cardinalities))` for each node. Use equivelant_sample_size. equivalent_sample_size: used by BDeu bayes_prior to compute pseudo_counts. Returns: self """ return self.fit_node_states(data).fit_cpds(data, method, bayes_prior, equivalent_sample_size) def predict(self, data: pd.DataFrame, node: str) -> pd.DataFrame: """ Predict the state of a node based on some input data, using the Bayesian Network. Args: data: data to make prediction. node: the node to predict. Returns: A dataframe of predictions, containing a single column name {node}_prediction. """ if all(parent in data.columns for parent in self._model.get_parents(node)): return self._predict_from_complete_data(data, node) return self._predict_from_incomplete_data(data, node) def _predict_from_complete_data(self, data: pd.DataFrame, node: str) -> pd.DataFrame: """ Predicts state of node given all parents of node exist within data. This method inspects the CPD of node directly, since all parent states are known. This avoids traversing the full network to compute marginals. This method is fast. Args: data: data to make prediction. node: the node to predict. Returns: A dataframe of predictions, containing a single column named {node}_prediction. """ transformed_data = data.copy(deep=True) # type: pd.DataFrame parents = sorted(self._model.get_parents(node)) cpd = self.cpds[node] transformed_data["{node}_prediction".format( node=node)] = transformed_data.apply( lambda row: cpd[tuple(row[parent] for parent in parents)].idxmax() if parents else cpd[""].idxmax(), axis=1, ) return transformed_data[[node + "_prediction"]] def _predict_from_incomplete_data(self, data: pd.DataFrame, node: str) -> pd.DataFrame: """ Predicts state of node when some parents of node do not exist within data. 
        This method uses the pgmpy predict function, which predicts the most
        likely state for every node that is not contained within data.
        With incomplete data, pgmpy goes beyond parents in the network to
        determine the most likely predictions. This method is slow.

        Args:
            data: data to make prediction.
            node: the node to predict.

        Returns:
            A dataframe of predictions, containing a single column named
            {node}_prediction.
        """
        transformed_data = deepcopy(data)  # type: pd.DataFrame
        self._state_to_index(transformed_data)  # transformed_data.is_copy()

        # pgmpy will predict all missing data, so drop the column we want to predict
        transformed_data = transformed_data.drop(columns=[node])
        predictions = self._model.predict(transformed_data)[[node]]
        return predictions.rename(columns={node: node + "_prediction"})

    def predict_probability(self, data: pd.DataFrame, node: str) -> pd.DataFrame:
        """
        Predict the probability of each possible state of a node, based on some
        input data.

        Args:
            data: data to make prediction.
            node: the node to predict probabilities.

        Returns:
            A dataframe of predicted probabilities, containing one column per
            possible state, named {node}_{state}.
        """
        if all(parent in data.columns
               for parent in self._model.get_parents(node)):
            return self._predict_probability_from_complete_data(data, node)

        return self._predict_probability_from_incomplete_data(data, node)

    def _predict_probability_from_complete_data(self, data: pd.DataFrame,
                                                node: str) -> pd.DataFrame:
        """
        Predict the probability of each possible state of a node, based on some
        input data. This method inspects the CPD of node directly, since all
        parent states are known. This avoids traversing the full network to
        compute marginals. This method is fast.

        Args:
            data: data to make prediction.
            node: the node to predict probabilities.

        Returns:
            A dataframe of predicted probabilities, containing one column per
            possible state, named {node}_{state}.
        """
        transformed_data = data.copy(deep=True)  # type: pd.DataFrame

        parents = sorted(self._model.get_parents(node))
        cpd = self.cpds[node]

        def lookup_probability(row, s):
            """Retrieve probability from CPD"""
            if parents:
                return cpd[tuple(row[parent] for parent in parents)].loc[s]
            return cpd.at[s, ""]

        for state in self.node_states[node]:
            transformed_data["{n}_{s}".format(n=node, s=state)] = transformed_data.apply(
                lambda row, st=state: lookup_probability(row, st), axis=1)

        return transformed_data[[
            "{n}_{s}".format(n=node, s=state)
            for state in self.node_states[node]
        ]]

    def _predict_probability_from_incomplete_data(self, data: pd.DataFrame,
                                                  node: str) -> pd.DataFrame:
        """
        Predict the probability of each possible state of a node, based on some
        input data. This method uses the pgmpy predict_probability function,
        which predicts the probability of every state for every node that is
        not contained within data. With incomplete data, pgmpy goes beyond
        parents in the network to determine the most likely predictions.
        This method is slow.

        Args:
            data: data to make prediction.
            node: the node to predict probabilities.

        Returns:
            A dataframe of predicted probabilities, containing one column per
            possible state, named {node}_{state}.
""" transformed_data = data.copy(deep=True) # type: pd.DataFrame self._state_to_index(transformed_data) # pgmpy will predict all missing data, so drop column we want to predict transformed_data = transformed_data.drop(columns=[node]) probability = self._model.predict_probability( transformed_data) # type: pd.DataFrame # keep only probabilities for the node we are interested in cols = [] pattern = re.compile("^{node}_[0-9]+$".format(node=node)) # disabled open pylint issue (https://github.com/PyCQA/pylint/issues/2962) for col in probability.columns: if pattern.match(col): cols.append(col) probability = probability[cols] probability.columns = cols return probability
class TestBayesianModelFitPredict(unittest.TestCase):
    def setUp(self):
        self.model_disconnected = BayesianModel()
        self.model_disconnected.add_nodes_from(['A', 'B', 'C', 'D', 'E'])
        self.model_connected = BayesianModel([('A', 'B'), ('C', 'B'),
                                              ('C', 'D'), ('B', 'E')])

        self.model2 = BayesianModel([('A', 'C'), ('B', 'C')])
        self.data1 = pd.DataFrame(data={'A': [0, 0, 1],
                                        'B': [0, 1, 0],
                                        'C': [1, 1, 0]})
        self.data2 = pd.DataFrame(data={'A': [0, np.NaN, 1],
                                        'B': [0, 1, 0],
                                        'C': [1, 1, np.NaN],
                                        'D': [np.NaN, 'Y', np.NaN]})

        # data_link - "https://www.kaggle.com/c/titanic/download/train.csv"
        self.titanic_data = pd.read_csv(
            'pgmpy/tests/test_estimators/testdata/titanic_train.csv', dtype=str)
        self.titanic_data2 = self.titanic_data[["Survived", "Sex", "Pclass"]]

    def test_bayesian_fit(self):
        print(isinstance(BayesianEstimator, BaseEstimator))
        print(isinstance(MaximumLikelihoodEstimator, BaseEstimator))
        self.model2.fit(self.data1, estimator=BayesianEstimator,
                        prior_type="dirichlet", pseudo_counts=[9, 3])
        self.assertEqual(self.model2.get_cpds('B'),
                         TabularCPD('B', 2, [[11.0 / 15], [4.0 / 15]]))

    def test_fit_missing_data(self):
        self.model2.fit(self.data2, state_names={'C': [0, 1]},
                        complete_samples_only=False)
        cpds = set([TabularCPD('A', 2, [[0.5], [0.5]]),
                    TabularCPD('B', 2, [[2. / 3], [1. / 3]]),
                    TabularCPD('C', 2,
                               [[0, 0.5, 0.5, 0.5],
                                [1, 0.5, 0.5, 0.5]],
                               evidence=['A', 'B'],
                               evidence_card=[2, 2])])
        self.assertSetEqual(cpds, set(self.model2.get_cpds()))

    def test_disconnected_fit(self):
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        self.model_disconnected.fit(values)

        for node in ['A', 'B', 'C', 'D', 'E']:
            cpd = self.model_disconnected.get_cpds(node)
            self.assertEqual(cpd.variable, node)
            np_test.assert_array_equal(cpd.cardinality, np.array([2]))
            # .loc replaces the long-removed .ix accessor
            value = (values.loc[:, node].value_counts() /
                     values.loc[:, node].value_counts().sum())
            value = value.reindex(sorted(value.index)).values
            np_test.assert_array_equal(cpd.values, value)

    def test_predict(self):
        titanic = BayesianModel()
        titanic.add_edges_from([("Sex", "Survived"), ("Pclass", "Survived")])
        titanic.fit(self.titanic_data2[500:])

        p1 = titanic.predict(self.titanic_data2[["Sex", "Pclass"]][:30])
        p2 = titanic.predict(self.titanic_data2[["Survived", "Pclass"]][:30])
        p3 = titanic.predict(self.titanic_data2[["Survived", "Sex"]][:30])

        p1_res = np.array(['0', '1', '0', '1', '0', '0', '0', '0', '0', '1',
                           '0', '1', '0', '0', '0', '1', '0', '0', '0', '0',
                           '0', '0', '0', '0', '0', '0', '0', '0', '0', '0'])
        p2_res = np.array(['male', 'female', 'female', 'female', 'male',
                           'male', 'male', 'male', 'female', 'female',
                           'female', 'female', 'male', 'male', 'male',
                           'female', 'male', 'female', 'male', 'female',
                           'male', 'female', 'female', 'female', 'male',
                           'female', 'male', 'male', 'female', 'male'])
        p3_res = np.array(['3', '1', '1', '1', '3', '3', '3', '3', '1', '1',
                           '1', '1', '3', '3', '3', '1', '3', '1', '3', '1',
                           '3', '1', '1', '1', '3', '1', '3', '3', '1', '3'])

        np_test.assert_array_equal(p1.values.ravel(), p1_res)
        np_test.assert_array_equal(p2.values.ravel(), p2_res)
        np_test.assert_array_equal(p3.values.ravel(), p3_res)

    def test_connected_predict(self):
        np.random.seed(42)
        values = pd.DataFrame(np.array(np.random.randint(low=0, high=2,
                                                         size=(1000, 5)),
                                       dtype=str),
                              columns=['A', 'B', 'C', 'D', 'E'])
        fit_data = values[:800]
        predict_data = values[800:].copy()
        self.model_connected.fit(fit_data)
        self.assertRaises(ValueError, self.model_connected.predict,
                          predict_data)
        predict_data.drop('E', axis=1, inplace=True)
        e_predict = self.model_connected.predict(predict_data)
        np_test.assert_array_equal(
            e_predict.values.ravel(),
            np.array([1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1,
                      0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
                      0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
                      0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
                      1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,
                      1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
                      1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
                      1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
                      1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0],
                     dtype=str))

    def test_connected_predict_probability(self):
        np.random.seed(42)
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(100, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        fit_data = values[:80]
        predict_data = values[80:].copy()
        self.model_connected.fit(fit_data)
        predict_data.drop('E', axis=1, inplace=True)
        e_prob = self.model_connected.predict_probability(predict_data)
        np_test.assert_allclose(
            e_prob.values.ravel(),
            np.array([0.57894737, 0.42105263, 0.57894737, 0.42105263,
                      0.57894737, 0.42105263, 0.5, 0.5,
                      0.57894737, 0.42105263, 0.5, 0.5,
                      0.57894737, 0.42105263, 0.57894737, 0.42105263,
                      0.57894737, 0.42105263, 0.5, 0.5,
                      0.57894737, 0.42105263, 0.57894737, 0.42105263,
                      0.5, 0.5, 0.57894737, 0.42105263,
                      0.57894737, 0.42105263, 0.5, 0.5,
                      0.57894737, 0.42105263, 0.5, 0.5,
                      0.5, 0.5, 0.5, 0.5]),
            atol=0)
        predict_data = pd.DataFrame(np.random.randint(low=0, high=2,
                                                      size=(1, 5)),
                                    columns=['A', 'B', 'C', 'F', 'E'])[:]

    def test_predict_probability_errors(self):
        np.random.seed(42)
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(2, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        fit_data = values[:1]
        predict_data = values[1:].copy()
        self.model_connected.fit(fit_data)
        self.assertRaises(ValueError,
                          self.model_connected.predict_probability,
                          predict_data)
        predict_data = pd.DataFrame(np.random.randint(low=0, high=2,
                                                      size=(1, 5)),
                                    columns=['A', 'B', 'C', 'F', 'E'])[:]
        self.assertRaises(ValueError,
                          self.model_connected.predict_probability,
                          predict_data)

    def tearDown(self):
        del self.model_connected
        del self.model_disconnected
# Now, in general machine learning problems it doesn't matter which column of
# the array represents which variable (as long as we use the same order for
# both training and prediction), because all the values sit on a symmetrical
# axis. But in graphical models each variable is different (in the way it is
# connected to other variables, etc.), so we need to specify which columns of
# the data belong to which variable. For that we will use pandas.
import pandas as pd
data = pd.DataFrame(data, columns=['cost', 'quality', 'location',
                                   'no_of_people'])
data
train = data[:750]

# We will try to predict no_of_people from our model. So for the test data we
# delete that column and then later on predict those values.
test = data[750:].drop('no_of_people', axis=1)
test

# Now we will need to create the base network structure for the model.
restaurant_model = BayesianModel([('location', 'cost'),
                                  ('quality', 'cost'),
                                  ('location', 'no_of_people'),
                                  ('cost', 'no_of_people')])
restaurant_model.fit(train)

# fit() computes the CPDs of all the variables from the training data that we
# provided.
restaurant_model.get_cpds()

# Now, to predict the values of no_of_people using this model, we can simply
# call the predict method on our test data.
restaurant_model.predict(test).values.ravel()
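# Besides the most likely state, pgmpy can also return the full distribution
# over states of the missing column: predict_probability gives one column per
# state of no_of_people (the same method exercised in the test class above).
restaurant_model.predict_probability(test)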
def bic(train, test, scoring_function, resultlist):
    # print(set(train['Person'].values))
    # print(set(train['c0'].values))
    # print(set(train['c1'].values))
    # print(len(test))
    # print('################')
    array = ['Person']
    trainstart = time.time()
    # bic = BicScore(train)
    sc = scoring_function(train)
    hc = HillClimbSearch(train, scoring_method=sc)
    best_model = hc.estimate()
    # print("best_model.edges:", best_model.edges())
    # edges = [('c3', 'c2'), ('c3', 'c5'), ('c3', 'c1'), ('c3', 'Person'),
    #          ('Person', 'c2'), ('Person', 'c5'), ('Person', 'c1')]
    edges = best_model.edges()
    model = BayesianModel(edges)
    model.fit(train, estimator=BayesianEstimator, prior_type="BDeu")
    trainend = time.time() - trainstart
    # for n in model.nodes():
    #     print(model.get_cpds(n))
    # print("nodes:", model.nodes())
    # print("test columns:", test.columns)
    flag = 0
    if set(model.nodes()) - set(array) == set(model.nodes()):
        flag = 1
    elif set(model.nodes()) - set(array) == set(test.columns):
        teststart = time.time()
        # print(test)
        result = model.predict(test).values.ravel()
        testend = time.time() - teststart
        pred = list(result)
        # print("y_true: \n", resultlist, "\ny_predicted:\n", pred)
    else:
        indicator = list(set(test.columns) - set(model.nodes()))
        # print("indicator:\n", indicator)
        # print("before change:", len(test))
        testchange = test.copy()
        # print(testchange)
        for f in range(len(indicator)):
            # print(f)
            del testchange[indicator[f]]
        # print(testchange)
        # print("after change:", len(testchange))
        teststart = time.time()
        result = model.predict(testchange).values.ravel()
        testend = time.time() - teststart
        pred = list(result)
        # print("y_true: \n", resultlist, "\ny_predicted:\n", pred)
    # model_data = XMLBIFWriter(model)
    # model_data.write_xmlbif(address + name + '_bic.bif')
    if flag == 1:
        print('##############flag:', flag)
    if flag == 0:
        # fscore, accuracy, precision, recall = calscore(resultlist, pred)
        scores = calculate_different_metrics(y_true=resultlist,
                                             y_predicted=pred)
        # draw(model.edges(), name, "bic", folder)
        # WriteData(address + "bicpred\\", name + ".xlsx", name, pred)
    else:
        # 'Person' is not among the learned nodes, so nothing was predicted;
        # return empty predictions and zeroed metrics (the original left
        # `pred` undefined on this path, which raised a NameError below).
        pred = []
        trainend = testend = 0
        scores = {'f1_score_micro': 0,
                  'f1_score_macro': 0,
                  'f1_score_binary': 0,
                  'precision': 0,
                  'recall': 0,
                  'accuracy': 0}
    # print("set(pred)", set(pred))
    # print("set(resultlist):", set(resultlist))
    # print("scores:", scores)
    return (model, scores, trainend, testend, pred)
class TestBayesianModelFitPredict(unittest.TestCase):
    # NOTE: this snippet targets an older pgmpy API (fit(estimator_type=...)
    # and the pandas .ix accessor); it is kept as-is for that version.
    def setUp(self):
        self.model_disconnected = BayesianModel()
        self.model_disconnected.add_nodes_from(['A', 'B', 'C', 'D', 'E'])
        self.model_connected = BayesianModel([('A', 'B'), ('C', 'B'),
                                              ('C', 'D'), ('B', 'E')])

        self.model2 = BayesianModel([('A', 'C'), ('B', 'C')])
        self.data1 = pd.DataFrame(data={'A': [0, 0, 1],
                                        'B': [0, 1, 0],
                                        'C': [1, 1, 0]})
        self.data2 = pd.DataFrame(data={'A': [0, np.NaN, 1],
                                        'B': [0, 1, 0],
                                        'C': [1, 1, np.NaN],
                                        'D': [np.NaN, 'Y', np.NaN]})

    def test_bayesian_fit(self):
        print(isinstance(BayesianEstimator, BaseEstimator))
        print(isinstance(MaximumLikelihoodEstimator, BaseEstimator))
        self.model2.fit(self.data1, estimator_type=BayesianEstimator,
                        prior_type="dirichlet", pseudo_counts=[9, 3])
        self.assertEqual(self.model2.get_cpds('B'),
                         TabularCPD('B', 2, [[11.0 / 15], [4.0 / 15]]))

    def test_fit_missing_data(self):
        self.model2.fit(self.data2, state_names={'C': [0, 1]},
                        complete_samples_only=False)
        cpds = set([TabularCPD('A', 2, [[0.5], [0.5]]),
                    TabularCPD('B', 2, [[2. / 3], [1. / 3]]),
                    TabularCPD('C', 2,
                               [[0, 0.5, 0.5, 0.5],
                                [1, 0.5, 0.5, 0.5]],
                               evidence=['A', 'B'],
                               evidence_card=[2, 2])])
        self.assertSetEqual(cpds, set(self.model2.get_cpds()))

    def test_disconnected_fit(self):
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        self.model_disconnected.fit(values)

        for node in ['A', 'B', 'C', 'D', 'E']:
            cpd = self.model_disconnected.get_cpds(node)
            self.assertEqual(cpd.variable, node)
            np_test.assert_array_equal(cpd.cardinality, np.array([2]))
            value = (values.ix[:, node].value_counts() /
                     values.ix[:, node].value_counts().sum())
            value = value.reindex(sorted(value.index)).values
            np_test.assert_array_equal(cpd.values, value)

    def test_connected_predict(self):
        np.random.seed(42)
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        fit_data = values[:800]
        predict_data = values[800:].copy()
        self.model_connected.fit(fit_data)
        self.assertRaises(ValueError, self.model_connected.predict,
                          predict_data)
        predict_data.drop('E', axis=1, inplace=True)
        e_predict = self.model_connected.predict(predict_data)
        np_test.assert_array_equal(
            e_predict.values.ravel(),
            np.array([1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1,
                      0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
                      0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
                      0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
                      1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,
                      1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
                      1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
                      1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
                      1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0]))

    def tearDown(self):
        del self.model_connected
        del self.model_disconnected
Created on Oct 27, 2017

@author: Adele
'''
import numpy as np
import pandas

data = pandas.read_csv("kaggle.csv")
data2 = data[["Survived", "Sex", "Pclass"]]
# data2 = data[["Survived", "Sex", "Pclass"]].replace(["female", "male"], [0, 1]).replace({"Pclass": {3: 0}})

intrain = np.random.rand(len(data2)) < 0.8
dtrain = data2[intrain]
dtest = data2[~intrain]
# print(len(dtrain), len(dtest))

from pgmpy.models import BayesianModel

titanic = BayesianModel()
titanic.add_edges_from([("Sex", "Survived"), ("Pclass", "Survived")])
titanic.fit(dtrain)
for cpd in titanic.get_cpds():
    print(cpd)

print(dtest[["Sex", "Pclass"]])
titanic.predict(dtest[["Sex", "Pclass"]])
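# A hedged follow-up: compare the predictions with the held-out labels to get
# a test accuracy. Using .values keeps the comparison positional, since the
# prediction frame is reindexed from 0 while dtest keeps its original index.
pred = titanic.predict(dtest[["Sex", "Pclass"]])
accuracy = (pred["Survived"].values == dtest["Survived"].values).mean()
print("accuracy:", accuracy)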
    dot = Digraph(node_attr=node_attr, graph_attr=dict(size="12,12"))
    seen = set()
    edges = model.edges
    for a, b in edges:
        dot.edge(a, b)
    if save:
        dot.view(cleanup=True)
    return dot


# Drop the target column; the original passed both columns= and a string
# axis='1', which drop() does not need.
predict_data = test.drop(columns=['scene'])
# re = pd.read_csv('./re.txt')
# print(re.info())
# print(predict_data.info())
print("prediction data set")
print(predict_data)
y_pred = model.predict(predict_data)
showBN(model)
print("prediction results")
print(y_pred)
print("node conditional probabilities")
print(model.get_cpds())
# re['doors'] = re['doors'].astype('object')
# print(model.predict_probability(re))  # predicted probabilities
print("prediction accuracy")
print((y_pred['scene'] == test['scene']).sum() / len(test))
end = time.process_time()
def BayesianModel(data):
    # NOTE: this function shadows the pgmpy class of the same name; the local
    # import below rebinds ``BayesianModel`` inside the function body.
    from pgmpy.models import BayesianModel
    from pgmpy.factors.discrete import TabularCPD
    import pandas as pd
    import numpy as np

    # Define the Bayesian model by its edges.
    """
    x1: air filter
    x2: air flow meter
    x3: throttle linkage
    x4: fuel pump circuit
    x5: fuel injector
    x6: fuel pressure regulator
    y1: intake system
    y2: fuel system
    o1: engine
    """
    model = BayesianModel([('x1', 'y1'), ('x2', 'y1'), ('x3', 'y1'),
                           ('x4', 'y2'), ('x5', 'y2'), ('x6', 'y2'),
                           ('y1', 'o1'), ('y2', 'o1')])

    # Define the conditional probability distributions.
    cpd_x1 = TabularCPD(variable='x1', variable_card=2, values=[[0.05, 0.95]])
    cpd_x2 = TabularCPD(variable='x2', variable_card=2, values=[[0.05, 0.95]])
    cpd_x3 = TabularCPD(variable='x3', variable_card=2, values=[[0.05, 0.95]])
    cpd_x4 = TabularCPD(variable='x4', variable_card=2, values=[[0.1, 0.9]])
    cpd_x5 = TabularCPD(variable='x5', variable_card=2, values=[[0.075, 0.925]])
    cpd_x6 = TabularCPD(variable='x6', variable_card=2, values=[[0.005, 0.995]])
    # variable: the variable name
    # variable_card: its cardinality (number of states)
    # values: the probability table
    # evidence: the parents it is conditioned on
    cpd_y1 = TabularCPD(
        variable='y1', variable_card=2,
        values=[[0.06, 0.2, 0.15, 0.18, 0.32, 0.34, 0.43, 0.62],
                [0.94, 0.8, 0.85, 0.82, 0.68, 0.66, 0.57, 0.38]],
        evidence=['x1', 'x2', 'x3'],
        evidence_card=[2, 2, 2])
    cpd_y2 = TabularCPD(
        variable='y2', variable_card=2,
        values=[[0.05, 0.3, 0.32, 0.29, 0.44, 0.42, 0.45, 0.6],
                [0.95, 0.7, 0.68, 0.71, 0.56, 0.58, 0.55, 0.4]],
        evidence=['x4', 'x5', 'x6'],
        evidence_card=[2, 2, 2])
    cpd_o1 = TabularCPD(
        variable='o1', variable_card=2,
        values=[[0.03, 0.55, 0.6, 0.7],
                [0.97, 0.45, 0.4, 0.3]],
        evidence=['y1', 'y2'],
        evidence_card=[2, 2])

    # Associate the DAG with the CPD tables.
    model.add_cpds(cpd_x1, cpd_x2, cpd_x3, cpd_x4, cpd_x5, cpd_x6,
                   cpd_y1, cpd_y2, cpd_o1)
    # Validate the model: check the network structure and the CPDs, and verify
    # that the CPDs are correctly defined and sum to 1.
    model.check_model()

    # value = {'x1': 1, 'x2': 1, 'x3': 1, 'x4': 1, 'x5': 1, 'x6': 1}
    # value = np.array(value)
    # print(value)
    data = pd.DataFrame(data, index=[0])
    # values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1, 6)),
    #                       columns=['x1', 'x2', 'x3', 'x4', 'x5', 'x6'])
    # predict = model.predict_probability(values)
    predict = model.predict(data)
    predict_value = predict.get_value(0, 'o1')
    if predict_value == 1:
        print("result: the car has a fault")
        result = '1'
    else:
        print("result: the car does not have a fault")
        result = '0'
    return result
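# Hypothetical usage of the diagnosis function above. Judging from the CPDs
# (e.g. values=[[0.05, 0.95]]), state 0 is assumed to be the fault state; here
# only the fuel pump circuit x4 reports a fault.
observation = {'x1': 1, 'x2': 1, 'x3': 1, 'x4': 0, 'x5': 1, 'x6': 1}
print(BayesianModel(observation))  # calls the function defined above, not the pgmpy class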
class BayesianNetwork:
    """
    Base class for Bayesian Network (BN), a probabilistic weighted DAG where nodes
    represent variables, edges represent the causal relationships between variables.

    ``BayesianNetwork`` stores nodes with their possible states, edges and
    conditional probability distributions (CPDs) of each node.

    ``BayesianNetwork`` is built on top of the ``StructureModel``, which is an
    extension of ``networkx.DiGraph`` (see
    :func:`causalnex.structure.structuremodel.StructureModel`).

    In order to define the ``BayesianNetwork``, users should provide a relevant
    ``StructureModel``. Once ``BayesianNetwork`` is initialised, no changes to the
    ``StructureModel`` can be made and CPDs can be learned from the data.

    The learned CPDs can be then used for likelihood estimation and predictions.

    Example:
    ::
        >>> # Create a Bayesian Network with a manually defined DAG.
        >>> from causalnex.structure import StructureModel
        >>> from causalnex.network import BayesianNetwork
        >>>
        >>> sm = StructureModel()
        >>> sm.add_edges_from([
        >>>     ('rush_hour', 'traffic'),
        >>>     ('weather', 'traffic')
        >>> ])
        >>> bn = BayesianNetwork(sm)
        >>> # A created ``BayesianNetwork`` stores nodes and edges defined by the ``StructureModel``
        >>> bn.nodes
        ['rush_hour', 'traffic', 'weather']
        >>>
        >>> bn.edges
        [('rush_hour', 'traffic'), ('weather', 'traffic')]
        >>> # A ``BayesianNetwork`` doesn't store any CPDs yet
        >>> bn.cpds
        {}
        >>>
        >>> # Learn the nodes' states from the data
        >>> import pandas as pd
        >>> data = pd.DataFrame({
        >>>     'rush_hour': [True, False, False, False, True, False, True],
        >>>     'weather': ['Terrible', 'Good', 'Bad', 'Good', 'Bad', 'Bad', 'Good'],
        >>>     'traffic': ['heavy', 'light', 'heavy', 'light', 'heavy', 'heavy', 'heavy']
        >>> })
        >>> bn = bn.fit_node_states(data)
        >>> bn.node_states
        {'rush_hour': {False, True}, 'weather': {'Bad', 'Good', 'Terrible'}, 'traffic': {'heavy', 'light'}}
        >>> # Learn the CPDs from the data
        >>> bn = bn.fit_cpds(data)
        >>> # Use the learned CPDs to make predictions on the unseen data
        >>> test_data = pd.DataFrame({
        >>>     'rush_hour': [False, False, True, True],
        >>>     'weather': ['Good', 'Bad', 'Good', 'Bad']
        >>> })
        >>> bn.predict(test_data, "traffic").to_dict()
        {'traffic_prediction': {0: 'light', 1: 'heavy', 2: 'heavy', 3: 'heavy'}}
        >>> bn.predict_probability(test_data, "traffic").to_dict()
        {'traffic_light': {0: 0.75, 1: 0.25, 2: 0.3333333333333333, 3: 0.3333333333333333},
         'traffic_heavy': {0: 0.25, 1: 0.75, 2: 0.6666666666666666, 3: 0.6666666666666666}}
    """

    def __init__(self, structure: StructureModel):
        """
        Create a ``BayesianNetwork`` with a DAG defined by ``StructureModel``.

        Args:
            structure: a graph representing a causal relationship between variables.
                       In the structure
                           - cycles are not allowed;
                           - multiple (parallel) edges are not allowed;
                           - isolated nodes and multiple components are not allowed.

        Raises:
            ValueError: If the structure is not a connected DAG.
        """
        n_components = nx.number_weakly_connected_components(structure)
        if n_components > 1:
            raise ValueError(
                f"The given structure has {n_components} separated graph components. "
                "Please make sure it has only one."
            )

        if not nx.is_directed_acyclic_graph(structure):
            cycle = nx.find_cycle(structure)
            raise ValueError(
                f"The given structure is not acyclic. Please review the following cycle: {cycle}"
            )

        # _node_states is a Dict in the form `dict: {node: dict: {state: index}}`.
        # Underlying libraries expect all states to be integers from zero, and
        # thus this dict is used to convert from state -> idx, and then back
        # from idx -> state as required.
        self._node_states = {}  # type: Dict[str: Dict[Hashable, int]]
        self._structure = structure

        # _model is a pgmpy Bayesian Model.
        # It is used for:
        # - probability fitting
        # - predictions
        self._model = BayesianModel()
        self._model.add_edges_from(structure.edges)

    @property
    def structure(self) -> StructureModel:
        """
        ``StructureModel`` defining the DAG of the Bayesian Network.

        Returns:
            A ``StructureModel`` of the Bayesian Network.
        """
        return self._structure

    @property
    def nodes(self) -> List[str]:
        """
        List of all nodes contained within the Bayesian Network.

        Returns:
            A list of node names.
        """
        return list(self._model.nodes)

    @property
    def node_states(self) -> Dict[str, Set[Hashable]]:
        """
        Dictionary of all states that each node can take.

        Returns:
            A dictionary of node and its possible states, in format of
            `dict: {node: state}`.
        """
        return {node: set(states.keys()) for node, states in self._node_states.items()}

    @node_states.setter
    def node_states(self, nodes: Dict[str, Set[Hashable]]):
        """
        Set the list of nodes that are contained within the Bayesian Network.
        The states of all nodes must be provided.

        Args:
            nodes: A dictionary of node and its possible states, in format of
                   `dict: {node: state}`.

        Raises:
            ValueError: if a node contains a None state.
            KeyError: if a node is missing.
        """
        missing_feature = set(self.nodes).difference(set(nodes.keys()))
        if missing_feature:
            raise KeyError(
                "The data does not cover all the features found in the Bayesian Network. "
                f"Please check the following features: {missing_feature}"
            )

        self._node_states = {}
        for node, states in nodes.items():
            if any(pd.isnull(list(states))):
                raise ValueError(f"node '{node}' contains None state")
            self._node_states[node] = {v: k for k, v in enumerate(sorted(states))}

    @property
    def edges(self) -> List[Tuple[str, str]]:
        """
        List of all edges contained within the Bayesian Network, as a
        Tuple(from_node, to_node).

        Returns:
            A list of all edges.
        """
        return list(self._model.edges)

    @property
    def cpds(self) -> Dict[str, pd.DataFrame]:
        """
        Conditional Probability Distributions of each node within the Bayesian Network.

        The row-index of each dataframe is all possible states for the node.
        The col-index of each dataframe is a MultiIndex that describes all possible
        permutations of parent states.

        For example, for a node :math:`P(A | B, D)`, where

        .. math::
            - A \\in \\text{{"a", "b", "c", "d"}}
            - B \\in \\text{{"x", "y", "z"}}
            - C \\in \\text{{False, True}}

        >>> b         x                   y             z
        >>> d     False     True      False True    False     True
        >>> a
        >>> a  0.265306  0.214286  0.066667  0.25  0.444444  0.000000
        >>> b  0.183673  0.214286  0.200000  0.25  0.222222  0.666667
        >>> c  0.285714  0.285714  0.400000  0.25  0.333333  0.333333
        >>> d  0.265306  0.285714  0.333333  0.25  0.000000  0.000000

        Returns:
            Conditional Probability Distributions of each node within the
            Bayesian Network.
""" cpds = {} for cpd in self._model.cpds: names = cpd.variables[1:] cols = [""] if names: cols = pd.MultiIndex.from_product( [sorted(self._node_states[var].keys()) for var in names], names=names, ) cpds[cpd.variable] = pd.DataFrame( cpd.values.reshape( len(self._node_states[cpd.variable]), max(1, len(cols)) ) ) cpds[cpd.variable][cpd.variable] = sorted( self._node_states[cpd.variable].keys() ) cpds[cpd.variable].set_index([cpd.variable], inplace=True) cpds[cpd.variable].columns = cols return cpds def set_cpd(self, node: str, df: pd.DataFrame) -> "BayesianNetwork": """ Provide self-defined CPD to Bayesian Network Args: node: the node to add self-defined cpd. df: self-defined cpd in pandas DataFrame format. Returns: self Raises: IndexError: if the index names of the pandas DataFrame does not match the expected DataFrame. ValueError: if node does not exist in Bayesian Network or a bad cpd table is provided. """ if node not in self.nodes: raise ValueError(f'Non-existing node "{node}"') # Check Table true_parents = { parent_node: self.node_states[parent_node] for parent_node in self._structure.predecessors(node) } table_parents = { name: set(df.columns.levels[i].values) for i, name in enumerate(df.columns.names) } if not ( set(df.index.values) == self.node_states[node] and true_parents == table_parents and df.index.name == node ): raise IndexError("Wrong index values. Please check your indices") sorted_df = df.reindex(sorted(df.columns), axis=1) node_card = len(self.node_states[node]) evidence, evidence_card = zip( *[(key, len(table_parents[key])) for key in sorted(table_parents.keys())] ) tabular_cpd = TabularCPD( node, node_card, sorted_df.values, evidence=evidence, evidence_card=evidence_card, ) model_copy = copy.deepcopy(self._model) model_copy.add_cpds(tabular_cpd) model_copy.check_model() self._model = model_copy return self def fit_node_states(self, df: pd.DataFrame) -> "BayesianNetwork": """ Fit all states of nodes that can appear in the data. The dataframe provided should contain every possible state (values that can be taken) for every column. Args: df: data to fit node states from. Each column indicates a node and each row an observed combination of states. Returns: self Raises: ValueError: if dataframe contains any missing data. """ self.node_states = {c: set(df[c].unique()) for c in df.columns} return self def _state_to_index( self, df: pd.DataFrame, nodes: Optional[List[str]] = None, ) -> pd.DataFrame: """ Transforms all values in df to an integer, as defined by the mapping from fit_node_states. Args: df: data to transform nodes: list of nodes to map to index. None means all. Returns: The transformed dataframe. Raises: ValueError: if nodes have not been fit, or if column names do not match node names. """ df.is_copy = False cols = nodes if nodes else df.columns for col in cols: df[col] = df[col].map(self._node_states[col]) df.is_copy = True return df def fit_cpds( self, data: pd.DataFrame, method: str = "MaximumLikelihoodEstimator", bayes_prior: Optional[str] = None, equivalent_sample_size: Optional[int] = None, ) -> "BayesianNetwork": """ Learn conditional probability distributions for all nodes in the Bayesian Network, conditioned on their incoming edges (parents). Args: data: dataframe containing one column per node in the Bayesian Network. method: how to fit probabilities. One of: - "MaximumLikelihoodEstimator": fit probabilities using Maximum Likelihood Estimation; - "BayesianEstimator": fit probabilities using Bayesian Parameter Estimation. Use bayes_prior. 
            bayes_prior: how to construct the Bayesian prior used by method="BayesianEstimator". One of:
                - "K2": shorthand for Dirichlet where all pseudo_counts are 1,
                  regardless of variable cardinality;
                - "BDeu": equivalent of using Dirichlet with uniform 'pseudo_counts' of
                  `equivalent_sample_size / (node_cardinality * np.prod(parents_cardinalities))`
                  for each node. Use equivalent_sample_size.

            equivalent_sample_size: used by BDeu bayes_prior to compute pseudo_counts.

        Returns:
            self

        Raises:
            ValueError: if an invalid method or bayes_prior is specified.
        """
        state_names = {k: list(v.values()) for k, v in self._node_states.items()}

        transformed_data = data.copy(deep=True)  # type: pd.DataFrame
        transformed_data = self._state_to_index(transformed_data[self.nodes])

        if method == "MaximumLikelihoodEstimator":
            self._model.fit(
                data=transformed_data,
                estimator=MaximumLikelihoodEstimator,
                state_names=state_names,
            )

        elif method == "BayesianEstimator":
            valid_bayes_priors = ["BDeu", "K2"]
            if bayes_prior not in valid_bayes_priors:
                raise ValueError(
                    f"unrecognised bayes_prior, please use one of {valid_bayes_priors}"
                )

            self._model.fit(
                data=transformed_data,
                estimator=BayesianEstimator,
                prior_type=bayes_prior,
                equivalent_sample_size=equivalent_sample_size,
                state_names=state_names,
            )
        else:
            valid_methods = ["MaximumLikelihoodEstimator", "BayesianEstimator"]
            raise ValueError(f"unrecognised method, please use one of {valid_methods}")

        return self

    def fit_node_states_and_cpds(
        self,
        data: pd.DataFrame,
        method: str = "MaximumLikelihoodEstimator",
        bayes_prior: Optional[str] = None,
        equivalent_sample_size: Optional[int] = None,
    ) -> "BayesianNetwork":
        """
        Call `fit_node_states` and then `fit_cpds`.

        Args:
            data: dataframe containing one column per node in the Bayesian Network.

            method: how to fit probabilities. One of:
                - "MaximumLikelihoodEstimator": fit probabilities using Maximum Likelihood Estimation;
                - "BayesianEstimator": fit probabilities using Bayesian Parameter Estimation. Use bayes_prior.

            bayes_prior: how to construct the Bayesian prior used by method="BayesianEstimator". One of:
                - "K2": shorthand for Dirichlet where all pseudo_counts are 1,
                  regardless of variable cardinality;
                - "BDeu": equivalent of using Dirichlet with uniform 'pseudo_counts' of
                  `equivalent_sample_size / (node_cardinality * np.prod(parents_cardinalities))`
                  for each node. Use equivalent_sample_size.

            equivalent_sample_size: used by BDeu bayes_prior to compute pseudo_counts.
        Returns:
            self
        """
        return self.fit_node_states(data).fit_cpds(
            data, method, bayes_prior, equivalent_sample_size
        )

    def add_node(
        self,
        node: str,
        edges_to_add: List[Tuple[str, str]],
        edges_to_remove: List[Tuple[str, str]],
    ) -> "BayesianNetwork":
        """
        Add a node (for example, a latent variable) to the structure model,
        together with its corresponding edges.

        Args:
            node: Name of the node
            edges_to_add: which edges to add to the structure
            edges_to_remove: which edges to remove from the structure

        Returns:
            self

        Raises:
            ValueError: if `node` already exists in the network,
                if `edges_to_add` include edges NOT containing the node,
                or if `edges_to_remove` include edges containing the node
        """
        if any(node not in edges for edges in edges_to_add):
            raise ValueError(f"Should only add edges containing node '{node}'")
        if any(node in edges for edges in edges_to_remove):
            raise ValueError(f"Should only remove edges NOT containing node '{node}'")

        self._structure.add_edges_from(edges_to_add)
        self._structure.remove_edges_from(edges_to_remove)
        self._model.add_edges_from(edges_to_add)
        self._model.remove_edges_from(edges_to_remove)

        return self

    def fit_latent_cpds(  # pylint: disable=too-many-arguments
        self,
        lv_name: str,
        lv_states: List,
        data: pd.DataFrame,
        box_constraints: Optional[Dict[str, Tuple[pd.DataFrame, pd.DataFrame]]] = None,
        priors: Optional[Dict[str, pd.DataFrame]] = None,
        initial_params: Union[str, Dict[str, pd.DataFrame]] = "random",
        non_missing_data_factor: int = 1,
        n_runs: int = 20,
        stopping_delta: float = 0.0,
    ) -> "BayesianNetwork":
        """
        Run the EM algorithm to estimate the CPDs of the latent variable and its
        corresponding Markov blanket.

        Args:
            lv_name: Latent variable name
            lv_states: the states the latent variable can assume
            data: dataframe, must contain all variables in the Markov Blanket of the
                latent variable. Include one column with the latent variable name,
                filled with np.nan where information about the latent variable is
                missing. If some data about the latent variable is present, create
                complete columns.
            n_runs: max number of EM alternations
            stopping_delta: if the max difference between the current and previous
                iterations' CPDs is below stopping_delta, convergence is reached
            initial_params: way to initialise parameters. Can be:
                - "random": random values (default)
                - "avg": uniform distributions everywhere. Not advised, as it may
                  itself be a stationary point
                - if a dictionary of dataframes is provided, it will be used as the initialisation
            box_constraints: minimum and maximum values for each model parameter.
                Specified with a dictionary mapping:
                - Node
                - two dataframes, in order: Min(P(Node|Par(Node))) and Max(P(Node|Par(Node)))
            priors: priors, provided as a mapping Node -> dataframe with Dirichlet
                priors for P(Node|Par(Node))
            non_missing_data_factor: a weight added to the non-missing data samples.
                The effect is as if the amount of data provided was bigger.
                Empirically, it helps to set the factor to 10 when the non-missing
                data is ~1% of the dataset.

        Returns:
            self

        Raises:
            ValueError: if the latent variable is not a string,
                if the latent variable cannot be found in the network,
                if the latent variable is present/observed in the data,
                or if the latent variable states are empty
        """
        if not isinstance(lv_name, str):
            raise ValueError(f"Invalid latent variable name '{lv_name}'")
        if lv_name not in self._structure:
            raise ValueError(f"Latent variable '{lv_name}' not added to the network")
        if not isinstance(lv_states, list) or len(lv_states) == 0:
            raise ValueError(f"Latent variable '{lv_name}' contains no states")

        # Register states for the latent variable
        self._node_states[lv_name] = {v: k for k, v in enumerate(sorted(lv_states))}

        # Run EM algorithm
        estimator = EMSingleLatentVariable(
            sm=self.structure,
            data=data,
            lv_name=lv_name,
            node_states={n: sorted(s) for n, s in self.node_states.items()},
            initial_params=initial_params,
            box_constraints=box_constraints,
            priors=priors,
            non_missing_data_factor=non_missing_data_factor,
        )
        estimator.run(n_runs=n_runs, stopping_delta=stopping_delta)

        # Add the learned CPDs into the model
        tab_cpds = [pd_to_tabular_cpd(el) for el in estimator.cpds.values()]
        self._model.add_cpds(*tab_cpds)

        return self

    def predict(self, data: pd.DataFrame, node: str) -> pd.DataFrame:
        """
        Predict the state of a node based on some input data, using the Bayesian Network.

        Args:
            data: data to make prediction.
            node: the node to predict.

        Returns:
            A dataframe of predictions, containing a single column named {node}_prediction.
        """
        if all(parent in data.columns for parent in self._model.get_parents(node)):
            return self._predict_from_complete_data(data, node)

        return self._predict_from_incomplete_data(data, node)

    def _predict_from_complete_data(
        self,
        data: pd.DataFrame,
        node: str,
    ) -> pd.DataFrame:
        """
        Predict state of node given that all parents of node exist within data.
        This method inspects the CPD of node directly, since all parent states are known.
        This avoids traversing the full network to compute marginals.
        This method is fast.

        Args:
            data: data to make prediction.
            node: the node to predict.

        Returns:
            A dataframe of predictions, containing a single column named {node}_prediction.
        """
        transformed_data = data.copy(deep=True)  # type: pd.DataFrame

        parents = sorted(self._model.get_parents(node))
        cpd = self.cpds[node]

        transformed_data[f"{node}_prediction"] = transformed_data.apply(
            lambda row: cpd[tuple(row[parent] for parent in parents)].idxmax()
            if parents
            else cpd[""].idxmax(),
            axis=1,
        )
        return transformed_data[[node + "_prediction"]]

    def _predict_from_incomplete_data(
        self,
        data: pd.DataFrame,
        node: str,
    ) -> pd.DataFrame:
        """
        Predict state of node when some parents of node do not exist within data.
        This method uses the pgmpy predict function, which predicts the most likely
        state for every node that is not contained within data.
        With incomplete data, pgmpy goes beyond parents in the network to determine
        the most likely predictions.
        This method is slow.

        Args:
            data: data to make prediction.
            node: the node to predict.

        Returns:
            A dataframe of predictions, containing a single column named {node}_prediction.
""" transformed_data = data.copy(deep=True) # type: pd.DataFrame self._state_to_index(transformed_data) # pgmpy will predict all missing data, so drop column we want to predict transformed_data = transformed_data.drop(columns=[node]) predictions = self._model.predict(transformed_data)[[node]] return predictions.rename(columns={node: node + "_prediction"}) def predict_probability(self, data: pd.DataFrame, node: str) -> pd.DataFrame: """ Predict the probability of each possible state of a node, based on some input data. Args: data: data to make prediction. node: the node to predict probabilities. Returns: A dataframe of predicted probabilities, contained one column per possible state, named {node}_{state}. """ if all(parent in data.columns for parent in self._model.get_parents(node)): return self._predict_probability_from_complete_data(data, node) return self._predict_probability_from_incomplete_data(data, node) def _predict_probability_from_complete_data( self, data: pd.DataFrame, node: str, ) -> pd.DataFrame: """ Predict the probability of each possible state of a node, based on some input data. This method inspects the CPD of node directly, since all parent states are known. This avoids traversing the full network to compute marginals. This method is fast. Args: data: data to make prediction. node: the node to predict probabilities. Returns: A dataframe of predicted probabilities, contained one column per possible state, named {node}_{state}. """ transformed_data = data.copy(deep=True) # type: pd.DataFrame parents = sorted(self._model.get_parents(node)) cpd = self.cpds[node] def lookup_probability(row, s): """Retrieve probability from CPD""" if parents: return cpd[tuple(row[parent] for parent in parents)].loc[s] return cpd.at[s, ""] for state in self.node_states[node]: transformed_data[f"{node}_{state}"] = transformed_data.apply( lambda row, st=state: lookup_probability(row, st), axis=1 ) return transformed_data[[f"{node}_{state}" for state in self.node_states[node]]] def _predict_probability_from_incomplete_data( self, data: pd.DataFrame, node: str, ) -> pd.DataFrame: """ Predict the probability of each possible state of a node, based on some input data. This method uses the pgmpy predict_probability function, which predicts the probability of every state for every node that is not contained within data. With incomplete data, pgmpy goes beyond parents in the network to determine the most likely predictions. This method is slow. Args: data: data to make prediction. node: the node to predict probabilities. Returns: A dataframe of predicted probabilities, contained one column per possible state, named {node}_{state}. """ transformed_data = data.copy(deep=True) # type: pd.DataFrame self._state_to_index(transformed_data) # pgmpy will predict all missing data, so drop column we want to predict transformed_data = transformed_data.drop(columns=[node]) probability = self._model.predict_probability( transformed_data ) # type: pd.DataFrame # keep only probabilities for the node we are interested in cols = [] pattern = re.compile(f"^{node}_[0-9]+$") # disabled open pylint issue (https://github.com/PyCQA/pylint/issues/2962) for col in probability.columns: if pattern.match(col): cols.append(col) probability = probability[cols] probability.columns = cols return probability
import pandas as pd
import numpy as np
from pgmpy.models import BayesianModel

# Hand-crafted DAG over word/topic indicator variables
model = BayesianModel([("bitcoin", "cryptocurrency"),
                       ("trump", "inflation"),
                       ("bitcoin", "sentiment"),
                       ("federal", "tax"),
                       ("federal", "bank"),
                       ("federal", "trade"),
                       ("federal", "inflation"),
                       ("tax", "bank"),
                       ("bank", "inflation"),
                       ("cryptocurrency", "investor"),
                       ("investor", "stock"),
                       ("oil", "bank")])

data = pd.read_csv("vectors.csv")

# 50/50 train/test split on row order
data_train = data[: int(data.shape[0] * 0.5)]
model.fit(data_train)
model.get_cpds()

data_test = data[int(0.5 * data.shape[0]): data.shape[0]]
y_test = data_test.iloc[:, 11].to_numpy()        # column 11 holds the target
data_test = data_test.iloc[:, :11]               # keep only the feature columns
y_pre = model.predict(data_test).values.ravel()  # predict the dropped column

# Accuracy: fraction of test rows predicted correctly
count = 0
n = y_test.size
for i in range(n):
    if y_test[i] == y_pre[i]:
        count += 1
print(count / n)
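# A hedged refactor of the accuracy loop above: pgmpy's predict returns a
# DataFrame keyed by the predicted column name, so the comparison can be
# vectorised. The helper name and the assumption that the model is not yet
# fitted are mine, not part of pgmpy.
import numpy as np
import pandas as pd
from pgmpy.models import BayesianModel

def holdout_accuracy(model, data, target, train_frac=0.5):
    """Fit an unfitted BayesianModel on the first train_frac of rows and
    return the prediction accuracy for `target` on the remaining rows."""
    split = int(len(data) * train_frac)
    model.fit(data.iloc[:split])
    test = data.iloc[split:]
    y_true = test[target].to_numpy()
    y_pred = model.predict(test.drop(columns=[target]))[target].to_numpy()
    return float(np.mean(y_true == y_pred))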
print('\nConditional probability distributions of model:')
print(model.get_cpds())

##############################################################################################
# Make Predictions from Bayesian Model
##############################################################################################

# get truth data and drop it from the test set; pgmpy predicts the missing column
y_test = test['Accident_Severity']
predict_data = test.drop(columns='Accident_Severity')

# try to load cached predictions
my_file = Path(args.checkpoint_dir + 'predictions.pkl')
if my_file.exists():
    with open(args.checkpoint_dir + 'predictions.pkl', 'rb') as f:
        predict = pickle.load(f)
else:
    # perform variable elimination
    print('Performing Variable Elimination...')
    model_inference = VariableElimination(model, state_names=possible_values)

    # predict using model
    print('Predicting...')
    predict = model.predict(predict_data)

    # save predictions so we don't have to make them again
    with open(args.checkpoint_dir + 'predictions.pkl', 'wb') as f:
        pickle.dump(predict, f, pickle.HIGHEST_PROTOCOL)

# evaluate predictions
evaluate(predict, y_test)
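# The load-or-compute pattern above generalises into a small helper. This is a
# sketch under the assumption that the cached object is picklable; `cached` and
# its arguments are hypothetical names, not part of any library.
import pickle
from pathlib import Path

def cached(path, compute):
    """Return the pickled object at `path` if it exists; otherwise call
    `compute()`, pickle the result to `path`, and return it."""
    path = Path(path)
    if path.exists():
        with open(path, 'rb') as f:
            return pickle.load(f)
    result = compute()
    with open(path, 'wb') as f:
        pickle.dump(result, f, pickle.HIGHEST_PROTOCOL)
    return result

# e.g. predict = cached(args.checkpoint_dir + 'predictions.pkl',
#                       lambda: model.predict(predict_data))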
class TestBayesianModelFitPredict(unittest.TestCase):

    def setUp(self):
        self.model_disconnected = BayesianModel()
        self.model_disconnected.add_nodes_from(['A', 'B', 'C', 'D', 'E'])
        self.model_connected = BayesianModel([('A', 'B'), ('C', 'B'), ('C', 'D'), ('B', 'E')])

        self.model2 = BayesianModel([('A', 'C'), ('B', 'C')])
        self.data1 = pd.DataFrame(data={'A': [0, 0, 1], 'B': [0, 1, 0], 'C': [1, 1, 0]})
        self.data2 = pd.DataFrame(data={'A': [0, np.nan, 1],
                                        'B': [0, 1, 0],
                                        'C': [1, 1, np.nan],
                                        'D': [np.nan, 'Y', np.nan]})

        # data_link - "https://www.kaggle.com/c/titanic/download/train.csv"
        self.titanic_data = pd.read_csv('pgmpy/tests/test_estimators/testdata/titanic_train.csv', dtype=str)
        self.titanic_data2 = self.titanic_data[["Survived", "Sex", "Pclass"]]

    def test_bayesian_fit(self):
        print(isinstance(BayesianEstimator, BaseEstimator))
        print(isinstance(MaximumLikelihoodEstimator, BaseEstimator))
        self.model2.fit(self.data1, estimator=BayesianEstimator, prior_type="dirichlet",
                        pseudo_counts=[9, 3])
        self.assertEqual(self.model2.get_cpds('B'), TabularCPD('B', 2, [[11.0 / 15], [4.0 / 15]]))

    def test_fit_missing_data(self):
        self.model2.fit(self.data2, state_names={'C': [0, 1]}, complete_samples_only=False)
        cpds = set([TabularCPD('A', 2, [[0.5], [0.5]]),
                    TabularCPD('B', 2, [[2. / 3], [1. / 3]]),
                    TabularCPD('C', 2, [[0, 0.5, 0.5, 0.5], [1, 0.5, 0.5, 0.5]],
                               evidence=['A', 'B'], evidence_card=[2, 2])])
        self.assertSetEqual(cpds, set(self.model2.get_cpds()))

    def test_disconnected_fit(self):
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        self.model_disconnected.fit(values)

        for node in ['A', 'B', 'C', 'D', 'E']:
            cpd = self.model_disconnected.get_cpds(node)
            self.assertEqual(cpd.variable, node)
            np_test.assert_array_equal(cpd.cardinality, np.array([2]))
            value = (values.loc[:, node].value_counts() /
                     values.loc[:, node].value_counts().sum())
            value = value.reindex(sorted(value.index)).values
            np_test.assert_array_equal(cpd.values, value)

    def test_predict(self):
        titanic = BayesianModel()
        titanic.add_edges_from([("Sex", "Survived"), ("Pclass", "Survived")])
        titanic.fit(self.titanic_data2[500:])
        p1 = titanic.predict(self.titanic_data2[["Sex", "Pclass"]][:30])
        p2 = titanic.predict(self.titanic_data2[["Survived", "Pclass"]][:30])
        p3 = titanic.predict(self.titanic_data2[["Survived", "Sex"]][:30])
        p1_res = np.array(['0', '1', '0', '1', '0', '0', '0', '0', '0', '1', '0', '1', '0',
                           '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
                           '0', '0', '0', '0'])
        p2_res = np.array(['male', 'female', 'female', 'female', 'male', 'male', 'male',
                           'male', 'female', 'female', 'female', 'female', 'male', 'male',
                           'male', 'female', 'male', 'female', 'male', 'female', 'male',
                           'female', 'female', 'female', 'male', 'female', 'male', 'male',
                           'female', 'male'])
        p3_res = np.array(['3', '1', '1', '1', '3', '3', '3', '3', '1', '1', '1', '1', '3',
                           '3', '3', '1', '3', '1', '3', '1', '3', '1', '1', '1', '3', '1',
                           '3', '3', '1', '3'])
        np_test.assert_array_equal(p1.values.ravel(), p1_res)
        np_test.assert_array_equal(p2.values.ravel(), p2_res)
        np_test.assert_array_equal(p3.values.ravel(), p3_res)

    def test_connected_predict(self):
        np.random.seed(42)
        values = pd.DataFrame(np.array(np.random.randint(low=0, high=2, size=(1000, 5)),
                                       dtype=str),
                              columns=['A', 'B', 'C', 'D', 'E'])
        fit_data = values[:800]
        predict_data = values[800:].copy()
        self.model_connected.fit(fit_data)
        self.assertRaises(ValueError, self.model_connected.predict, predict_data)
        predict_data.drop('E', axis=1, inplace=True)
        e_predict = self.model_connected.predict(predict_data)
        np_test.assert_array_equal(e_predict.values.ravel(),
                                   np.array([1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
                                             1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
                                             1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
                                             0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
                                             0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1,
                                             1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1,
                                             0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1,
                                             0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1,
                                             1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1,
                                             1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
                                             0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
                                             1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0],
                                            dtype=str))

    def test_connected_predict_probability(self):
        np.random.seed(42)
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(100, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        fit_data = values[:80]
        predict_data = values[80:].copy()
        self.model_connected.fit(fit_data)
        predict_data.drop('E', axis=1, inplace=True)
        e_prob = self.model_connected.predict_probability(predict_data)
        np_test.assert_allclose(e_prob.values.ravel(),
                                np.array([0.57894737, 0.42105263, 0.57894737, 0.42105263,
                                          0.57894737, 0.42105263, 0.5, 0.5,
                                          0.57894737, 0.42105263, 0.5, 0.5,
                                          0.57894737, 0.42105263, 0.57894737, 0.42105263,
                                          0.57894737, 0.42105263, 0.5, 0.5,
                                          0.57894737, 0.42105263, 0.57894737, 0.42105263,
                                          0.5, 0.5, 0.57894737, 0.42105263,
                                          0.57894737, 0.42105263, 0.5, 0.5,
                                          0.57894737, 0.42105263, 0.5, 0.5,
                                          0.5, 0.5, 0.5, 0.5]),
                                atol=0)

    def test_predict_probability_errors(self):
        np.random.seed(42)
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(2, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        fit_data = values[:1]
        predict_data = values[1:].copy()
        self.model_connected.fit(fit_data)
        self.assertRaises(ValueError, self.model_connected.predict_probability, predict_data)
        predict_data = pd.DataFrame(np.random.randint(low=0, high=2, size=(1, 5)),
                                    columns=['A', 'B', 'C', 'F', 'E'])
        self.assertRaises(ValueError, self.model_connected.predict_probability, predict_data)

    def tearDown(self):
        del self.model_connected
        del self.model_disconnected
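# A self-contained sketch in the style of the tests above: fit a two-parent
# model on synthetic binary data and check that predict fills in the dropped
# column. The seed and sizes are arbitrary assumptions, not from the test suite.
import numpy as np
import pandas as pd
from pgmpy.models import BayesianModel

np.random.seed(0)
values = pd.DataFrame(np.random.randint(low=0, high=2, size=(200, 3)),
                      columns=['A', 'B', 'C'])
model = BayesianModel([('A', 'C'), ('B', 'C')])
model.fit(values[:150])

# drop the target column; predict returns a DataFrame with that column filled in
predict_data = values[150:].drop('C', axis=1)
predictions = model.predict(predict_data)
assert predictions.shape == (50, 1)
assert set(predictions['C'].unique()) <= {0, 1}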