Beispiel #1
0
 def test_score_titanic(self):
     """Check BIC scores on the three-column Titanic subset.

     The model with ``Sex`` and ``Pclass`` as parents of ``Survived`` must
     reproduce the stored reference score and must score higher than a
     model over the same nodes whose only edge is ``Pclass -> Sex``.
     """
     bic = BicScore(self.titanic_data2)

     survival_model = BayesianModel(
         [("Sex", "Survived"), ("Pclass", "Survived")])
     self.assertAlmostEqual(bic.score(survival_model), -1896.7250012840179)

     # Same node set, but ``Survived`` is left without parents.
     unrelated_model = BayesianModel([("Pclass", "Sex")])
     unrelated_model.add_nodes_from(["Sex", "Survived", "Pclass"])
     self.assertLess(bic.score(unrelated_model), bic.score(survival_model))
    def pgm_generate(self, target, data, pgm_stats, subnodes, child=None):
        """Build and fit a Bayesian network over ``subnodes``.

        Args:
            target: Identifier of the target node; normalised with
                ``str(int(target))``.
            data: DataFrame of observations. NOTE: its column labels are
                converted to strings in place.
            pgm_stats: Not used by this method; kept so existing callers
                keep working.
            subnodes: Node identifiers to include; each one is normalised
                with ``str(int(node))``.
            child: When ``None``, the structure is learned over the
                non-target nodes only and every node returned by
                ``self.search_MK`` is then linked to the target. Otherwise
                the structure is learned over all ``subnodes`` (target
                included) on the generalized data.

        Returns:
            A ``BayesianModel`` with the learned structure, fitted on the
            generalized data.
        """
        subnodes = [str(int(node)) for node in subnodes]
        target = str(int(target))
        subnodes_no_target = [node for node in subnodes if node != target]
        data.columns = data.columns.astype(str)

        MK_blanket = self.search_MK(data, target, subnodes_no_target.copy())

        def generalized_frame():
            # Copy of the sub-node columns with the target passed through
            # generalize_target and every other node through generalize_others.
            frame = data[subnodes].copy()
            frame[target] = data[target].apply(self.generalize_target)
            for node in subnodes_no_target:
                frame[node] = data[node].apply(self.generalize_others)
            return frame

        if child is None:
            # Learn the structure without the target, then attach the
            # nodes found by search_MK as parents of the target.
            est = HillClimbSearch(data[subnodes_no_target],
                                  scoring_method=BicScore(data))
            learned = est.estimate()
            for node in MK_blanket:
                if node != target:
                    learned.add_edge(node, target)
            data_ex = generalized_frame()
        else:
            # Learn the structure directly on the generalized data.
            data_ex = generalized_frame()
            est = HillClimbSearch(data_ex, scoring_method=BicScore(data_ex))
            learned = est.estimate()

        # Create the pgm: copy nodes and edges into a fresh BayesianModel.
        pgm_explanation = BayesianModel()
        for node in learned.nodes():
            pgm_explanation.add_node(node)
        for edge in learned.edges():
            pgm_explanation.add_edge(edge[0], edge[1])

        # Fit the pgm
        pgm_explanation.fit(data_ex)

        return pgm_explanation
Beispiel #3
0
def model_change(dag, data):
    """Return the BIC score of the network encoded by ``dag``.

    Every non-zero entry ``dag[i][j]`` becomes a directed edge from node
    ``str(i)`` to node ``str(j)``. ``data`` is wrapped in a DataFrame (and
    printed) before it is used for scoring.
    """
    frame = pd.DataFrame(data)
    print(frame)

    column_count = np.size(frame, 1)
    edge_list = [(str(row), str(col))
                 for row in range(len(dag))
                 for col in range(column_count)
                 if dag[row][col] != 0]

    network = BayesianModel(edge_list)
    scorer = BicScore(frame)
    return scorer.score(network)
def train_joke_type_selection():
    """Learn a network structure from the accumulated joke-type counters.

    Each ``Jokes`` row carries one counter per joke type. The counters are
    summed over all rows and expanded into a single-column DataFrame
    (``joke_preference``) with one row per counted preference, which is
    then handed to an exhaustive, BIC-scored structure search.

    Returns:
        The model returned by ``ExhaustiveSearch.estimate()``.
    """
    # one table
    jokes = Jokes.query.all()

    # (label stored in the frame, counter attribute on a Jokes row)
    counters = (
        ("nerd joke", "nerd_joke"),
        ("weird joke", "weird_joke"),
        ("cat meme", "cat_meme"),
        ("dog meme", "dog_meme"),
        ("dad joke", "dad_joke"),
    )
    joke_preferences = []
    for label, attribute in counters:
        total = sum(getattr(joke, attribute) for joke in jokes)
        joke_preferences.extend([label] * total)

    # Build the frame in one step instead of appending row by row.
    data = pd.DataFrame({"joke_preference": joke_preferences})

    bic = BicScore(data)
    es = ExhaustiveSearch(data, scoring_method=bic)
    best_model = es.estimate()
    return best_model
Beispiel #5
0
    def pgm_generate(self, target, data, stats, subnodes):
        """Build and fit a Bayesian network over ``subnodes``.

        The structure is learned over the non-target nodes with a
        BIC-scored hill climb. Every sub-node whose entry in ``stats`` is
        below 0.05 is then linked to ``target``, and the resulting network
        is fitted on the generalized data.

        Returns:
            The fitted ``BayesianModel``.
        """
        p_values = pd.Series(stats, name='p-values')
        significant = p_values[p_values < 0.05]
        blanket = [node for node in significant.index if node in subnodes]
        others = [node for node in subnodes if node != target]

        searcher = HillClimbSearch(data[others], scoring_method=BicScore(data))
        learned = searcher.estimate()
        for node in blanket:
            if node == target:
                continue
            learned.add_edge(node, target)

        # Create the pgm: copy the learned graph into a fresh BayesianModel.
        pgm_explanation = BayesianModel()
        for node in learned.nodes():
            pgm_explanation.add_node(node)
        for source, destination in learned.edges():
            pgm_explanation.add_edge(source, destination)

        # Fit the pgm on generalized copies of the selected columns.
        data_ex = data[subnodes].copy()
        data_ex[target] = data[target].apply(self.generalize_target)
        for node in others:
            data_ex[node] = data[node].apply(self.generalize_others)
        pgm_explanation.fit(data_ex)

        return pgm_explanation
Beispiel #6
0
def mutacao(x, fitness_aux, prob, max_v, min_v):
    """Mutate genes of the population ``x`` in place.

    When the expected number of mutations (``len(x) * len(x[0]) * prob``)
    is below one, every gene is mutated independently with probability
    ``prob``. Otherwise that expected number is rounded and that many
    accepted mutations are applied to randomly chosen genes; each accepted
    mutation also refreshes the matching entry of ``fitness_aux``.

    Nothing is returned; ``x`` and ``fitness_aux`` are modified in place.

    NOTE(review): the body reads ``min_valor``, ``max_valor``, ``nao_dag``,
    ``nodes`` and ``data``, none of which are parameters or locals, so they
    must exist as globals. The ``max_v`` parameter is never used.
    """
    if len(x) * len(x[0]) * prob < 1:
        # Fewer than one expected mutation: test every gene independently.
        # This branch neither checks the mutated individual nor updates
        # fitness_aux.
        print("entando")
        for i in range(len(x)):
            for j in range(len(x[i])):
                r = random.random()
                if r <= prob:
                    valor_mut = x[i][j]
                    # Redraw until the new value differs from the current one.
                    while (valor_mut == x[i][j]):
                        valor_mut = min_v + random.randint(
                            min_valor, max_valor)
                    x[i][j] = valor_mut
    else:
        # Apply a fixed number of accepted mutations.
        numero_mutacao = round(len(x) * len(x[0]) * prob)
        while (numero_mutacao > 0):
            # Pick a random individual and a random gene position in it.
            ind_escolhido = round(random.random() * (len(x) - 1))
            val = round(random.random() * (len(x[ind_escolhido]) - 1))
            valor_mut = deepcopy(x[ind_escolhido][val])
            # Remember the old gene so a rejected mutation can be undone.
            valor_mut_antigo = deepcopy(x[ind_escolhido][val])
            while (valor_mut == x[ind_escolhido][val]):
                valor_mut = min_v + random.randint(min_valor, max_valor)
            x[ind_escolhido][val] = valor_mut
            # NOTE(review): this tests a single gene value against nao_dag,
            # while whole individuals are appended to nao_dag below; it
            # looks like the individual was meant to be tested -- confirm.
            if x[ind_escolhido][val] not in nao_dag:
                G = vetor_Rede(x[ind_escolhido], nodes)
                if G:
                    # Accepted: refresh the fitness and count the mutation.
                    fitness_aux[ind_escolhido] = abs(BicScore(data).score(G))
                    numero_mutacao = numero_mutacao - 1
                else:
                    # Rejected: record the individual and undo the mutation.
                    # NOTE(review): the list object itself (not a copy) is
                    # appended, so the stored entry reflects the revert on
                    # the next line -- confirm this is intended.
                    nao_dag.append(x[ind_escolhido])
                    x[ind_escolhido][val] = valor_mut_antigo
            else:
                x[ind_escolhido][val] = valor_mut_antigo
def learnedStructureModel():
    """Learn a network structure from the feature pairs and evaluate it.

    The training and testing inputs/outputs are joined column-wise into
    DataFrames with the feature columns ``f1``-``f9`` and ``f11``-``f19``
    plus the hypothesis column ``h``. A BIC-scored hill climb learns the
    structure, the CPDs are fitted with a BDeu prior, the edges are
    printed, and ``evaluateModel`` is run on the testing data.
    """
    # trainingData, testingData = differenceBetweenFeatures(True)
    trainingInputs, trainingOutputs, testingInputs, testingOutputs = \
        gtd.formSameWriterDiffWriterInputOutputFeaturePairs(5, True)

    # Evidence columns f1..f9 and f11..f19, followed by the hypothesis 'h'.
    evidenceNodes = (['f%d' % number for number in range(1, 10)] +
                     ['f%d' % number for number in range(11, 20)])
    columnNames = evidenceNodes + ['h']

    trainingData = pd.DataFrame(
        data=np.concatenate((trainingInputs, trainingOutputs), axis=1),
        columns=columnNames)

    testingData = pd.DataFrame(
        data=np.concatenate((testingInputs, testingOutputs), axis=1),
        columns=columnNames)

    #trainingData = trainingData.drop(['f9', 'f18'], axis=1)
    #testingData = testingData.drop(['f9', 'f18'], axis=1)

    hc = HillClimbSearch(trainingData, scoring_method=BicScore(trainingData))
    model = hc.estimate(max_indegree=20)

    # Number of states of features 1..9; the same pattern repeats for
    # features 11..19.
    cardinalities = (4, 5, 3, 5, 4, 4, 4, 5, 3)
    state_names = {}
    for offset in (0, 10):
        for position, cardinality in enumerate(cardinalities, start=1):
            state_names['f%d' % (offset + position)] = list(range(cardinality))
    state_names['h'] = [0, 1]

    # fit model and data, compute CPDs
    model.fit(trainingData,
              estimator=BayesianEstimator,
              prior_type='BDeu',
              state_names=state_names)

    print(model.edges())

    # inference object
    # computing probability of Hypothesis given evidence
    evaluateModel(model, testingData, 'h', evidenceNodes)
Beispiel #8
0
def build_structure(data):
    """Learn a structure from ``data`` and return it as an adjacency matrix.

    A BIC-scored hill climb is run on ``data`` wrapped in a DataFrame. The
    learned edges are written into a square ``int64`` matrix
    (``DAG[parent, child] = 1``), which is saved to ``dataset/DAG.npy``
    and returned.
    """
    frame = pd.DataFrame(data)
    learned = HillClimbSearch(frame, scoring_method=BicScore(frame)).estimate()

    variable_count = data.shape[1]
    DAG = np.zeros((variable_count, variable_count), np.int64)
    for parent, child in learned.edges():
        DAG[parent, child] = 1

    np.save('dataset/DAG.npy', DAG)
    return DAG
Beispiel #9
0
def _SetScoringType(df, scoretype, verbose=3):
    if verbose >= 3: print('[bnlearn] >Set scoring type at [%s]' % (scoretype))

    if scoretype == 'bic':
        scoring_method = BicScore(df)
    elif scoretype == 'k2':
        scoring_method = K2Score(df)
    elif scoretype == 'bdeu':
        scoring_method = BDeuScore(df, equivalent_sample_size=5)

    return (scoring_method)
Beispiel #10
0
def main():
    """Learn and print an initial network and a transition network.

    Reads the data via ``readData``, learns the structure of the initial
    (B_0) network and of the transition network with hill climbing, and
    prints the edges and the fitted CPDs of both.

    NOTE(review): this function uses Python 2 ``print`` statements, and
    ``HillClimbSearch`` / ``BicScore`` are called with extra arguments
    (``genes``, ``labels``, ``bk1``, ``weight``) together with an
    ``estimate_dynamic`` method -- presumably a customised pgmpy; confirm.
    """
    data, string = readData()
    # All columns except the first are used as network nodes.
    genes = np.array(data.columns[1:])
    labels = np.array(data.columns)

    bayesianModel = BayesianModel()
    transitionModel = DBN()

    # Both start models contain the nodes but no edges.
    bayesianModel.add_nodes_from(genes)
    transitionModel.add_nodes_from(genes)

    bData, tData = getData(data, labels)
    
    print "\nDynamic Bayesian Network inference", 
    print "\nB_0 network relations:  "
    
    # Structure of the initial network, learned from bData.
    hcb = HillClimbSearch(bData, genes, scoring_method=BicScore(bData, labels, bk1=string, weight=4))
    best_model_b = hcb.estimate(start=bayesianModel, tabu_length=15, max_indegree=2)
    print(best_model_b.edges())

    printOutputB(best_model_b)

    print "\nLocal Probability Model: "
    best_model_b.fit(bData, BayesianEstimator)
    for cpd in best_model_b.get_cpds():
        print(cpd)

    print "\nB_transition network relations: "

    # Structure of the transition network, learned from tData.
    hct = HillClimbSearch(tData, genes, scoring_method=BicScore(tData, labels, bk1=string, weight=4))
    best_model_t = hct.estimate_dynamic(start=transitionModel, tabu_length=15, max_indegree=2)
    print(best_model_t.edges())

    printOutputT(best_model_t)

    print "\nLocal Probability Model: "
    best_model_t.fit(tData, BayesianEstimator)
    for cpd in best_model_t.get_cpds():
        print(cpd)
Beispiel #11
0
def SetScoringType(df, scoretype, verbose=3):
    """Return the scoring object selected by ``scoretype``.

    Args:
        df: Data handed to the scoring class.
        scoretype: One of ``'bic'``, ``'k2'`` or ``'bdeu'``.
        verbose: The selected type is printed when this is 3 or higher.

    Returns:
        A ``BicScore``, ``K2Score`` or ``BdeuScore`` (with
        ``equivalent_sample_size=5``) built on ``df``.

    Raises:
        ValueError: If ``scoretype`` is not one of the supported names.
    """
    if verbose >= 3:
        print('[BNLEARN][STRUCTURE LEARNING] Set scoring type at [%s]' %
              (scoretype))

    if scoretype == 'bic':
        scoring_method = BicScore(df)
    elif scoretype == 'k2':
        scoring_method = K2Score(df)
    elif scoretype == 'bdeu':
        scoring_method = BdeuScore(df, equivalent_sample_size=5)
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(
            "Unknown scoretype %r; expected 'bic', 'k2' or 'bdeu'" %
            (scoretype,))

    return (scoring_method)
Beispiel #12
0
def bei_ye_si():
    """Train a Bayesian network on the Titanic text file and print accuracy.

    The whitespace-delimited file is split into the first 800 rows for
    training and the remaining rows for testing. The structure is learned
    with a BIC-scored hill climb, the CPDs are fitted with a BDeu prior,
    and the fraction of test rows whose ``Survived`` value is predicted
    correctly is printed.

    All warnings are silenced process-wide as a side effect.
    """
    warnings.filterwarnings("ignore")
    print('现在进行的算法是贝叶斯网络')
    # The context manager closes the file once the table has been parsed.
    with open('泰坦尼克号.txt') as f:
        dataset = pd.read_table(f, delim_whitespace=True)
    train = dataset[:800]
    test = dataset[800:]
    hc = HillClimbSearch(train, scoring_method=BicScore(train))
    best_model = hc.estimate()
    best_model.fit(train, estimator=BayesianEstimator,
                   prior_type="BDeu")  # default equivalent_sample_size=5
    # Hide the label column from the model before predicting it.
    predict_data = test.drop(columns=['Survived'])
    y_pred = best_model.predict(predict_data)
    print(
        (y_pred['Survived'] == test['Survived']).sum() / len(test))  # test-set accuracy
    def setUp(self):
        """Create the random and Titanic fixtures used by the tests."""
        # Columns A and B are drawn at random from 0..4; C duplicates B.
        random_frame = pd.DataFrame(np.random.randint(0, 5, size=(5000, 2)),
                                    columns=list('AB'))
        random_frame['C'] = random_frame['B']
        self.rand_data = random_frame

        # One exhaustive search per scoring configuration.
        self.est_rand = ExhaustiveSearch(random_frame)
        self.est_rand_bdeu = ExhaustiveSearch(
            random_frame, scoring_method=BdeuScore(random_frame))
        self.est_rand_bic = ExhaustiveSearch(
            random_frame, scoring_method=BicScore(random_frame))

        # link to dataset: "https://www.kaggle.com/c/titanic/download/train.csv"
        titanic_frame = pd.read_csv(
            'pgmpy/tests/test_estimators/testdata/titanic_train.csv')
        self.titanic_data = titanic_frame
        self.titanic_data2 = titanic_frame[["Survived", "Sex", "Pclass"]]
        self.est_titanic = ExhaustiveSearch(self.titanic_data2)
Beispiel #14
0
def scoreStructureLearn(data,
                        search='HillClimbSearch',
                        scoring_method='BicScore'):
    """Learn a network structure with a score-and-search approach.

    Args:
        data: Data handed to the scoring class and to the search.
        search: ``'HillClimbSearch'`` selects hill climbing; any other
            value selects ``ExhaustiveSearch``.
        scoring_method: ``'BicScore'``, ``'K2Score'`` or ``'BdeuScore'``
            (the latter with ``equivalent_sample_size=5``).

    Returns:
        The model returned by the selected search's ``estimate()``.

    Raises:
        ValueError: If ``scoring_method`` is not one of the supported names.
    """
    if scoring_method == 'BicScore':
        scoring_method_tmp = BicScore(data)
    elif scoring_method == 'K2Score':
        scoring_method_tmp = K2Score(data)
    elif scoring_method == 'BdeuScore':
        scoring_method_tmp = BdeuScore(data, equivalent_sample_size=5)
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(
            "Unknown scoring_method %r; expected 'BicScore', 'K2Score' "
            "or 'BdeuScore'" % (scoring_method,))

    if search == 'HillClimbSearch':
        es = HillClimbSearch(data, scoring_method=scoring_method_tmp)
    else:
        es = ExhaustiveSearch(data, scoring_method=scoring_method_tmp)
    best_model = es.estimate()
    return best_model
Beispiel #15
0
    def learn_structure(self, method, scoring_method, log=True):
        ''' (4)
        Method that builds the structure of the data
        -----------------
        Parameters:
        method          : The technique used to search for the structure
            -> scoring_approx     - To use an approximated search with scoring method
            -> scoring_exhaustive - To use an exhaustive search with scoring method
            -> constraint         - To use the constraint based technique
        scoring_method : K2, bic, bdeu (only used by the two scoring searches)
        log             - "True" if you want to print debug information in the console

        Raises:
        ValueError - if "method" is unknown, or if a scoring search is
                     requested with an unknown "scoring_method"
        '''

        # Validate first so a typo fails with a clear message instead of an
        # UnboundLocalError further down.
        if method not in ("scoring_approx", "scoring_exhaustive",
                          "constraint"):
            raise ValueError("Unknown structure learning method: %r" %
                             (method,))

        if method == "constraint":
            est = ConstraintBasedEstimator(self.data)
        else:
            #Select the scoring method for the local search of the structure
            if scoring_method == "K2":
                scores = K2Score(self.data)
            elif scoring_method == "bic":
                scores = BicScore(self.data)
            elif scoring_method == "bdeu":
                scores = BdeuScore(self.data)
            else:
                raise ValueError("Unknown scoring method: %r" %
                                 (scoring_method,))

            #Select the actual method
            if method == "scoring_approx":
                est = HillClimbSearch(self.data, scores)
            else:
                est = ExhaustiveSearch(self.data, scores)

        self.best_model = est.estimate()
        self.eliminate_isolated_nodes(
        )  # REMOVE all nodes not connected to anything else

        # edges() exists in every NetworkX release; edges_iter() was
        # removed in NetworkX 2.0.
        for edge in self.best_model.edges():
            self.file_writer.write_txt(str(edge))

        self.log("Method used for structural learning: " + method, log)
        #self.log("Training instances skipped: " + str(self.extractor.get_skipped_lines()), log)
        self.log("Search terminated", log)
def scoreModels(h0Diff, h0Rarity):
    """Print the K2 and BIC scores of three fixed structures per dataset.

    Three hand-written edge lists over the ``d*`` variables are scored on
    ``h0Diff`` and three over the ``r*`` variables on ``h0Rarity``.
    """
    diffModels = [
        [('d5', 'd9'), ('d5', 'd3'), ('d3', 'd4'), ('d3', 'd8'),
         ('d9', 'd6'), ('d9', 'd1'), ('d9', 'd7'), ('d9', 'd8')],
        [('d2', 'd5'), ('d5', 'd9'), ('d5', 'd3'), ('d3', 'd4'),
         ('d3', 'd8'), ('d9', 'd6'), ('d9', 'd1'), ('d9', 'd7'),
         ('d9', 'd8')],
        [('d1', 'd2'), ('d5', 'd9'), ('d5', 'd3'), ('d3', 'd4'),
         ('d3', 'd8'), ('d9', 'd6'), ('d9', 'd1'), ('d9', 'd7'),
         ('d9', 'd8')],
    ]

    rarityModels = [
        [('r5', 'r9'), ('r5', 'r3'), ('r9', 'r1'), ('r8', 'r3'),
         ('r6', 'r9'), ('r6', 'r3')],
        [('r6', 'r9'), ('r7', 'r9'), ('r3', 'r4'), ('r3', 'r5'),
         ('r3', 'r9'), ('r2', 'r9'), ('r5', 'r9'), ('r9', 'r8'),
         ('r9', 'r1')],
        [('r7', 'r9'), ('r4', 'r3'), ('r4', 'r9'), ('r1', 'r2'),
         ('r1', 'r9'), ('r2', 'r9'), ('r5', 'r9'), ('r9', 'r8'),
         ('r9', 'r6')],
    ]

    def report(observations, candidates):
        # One output line per candidate: its K2 score, then its BIC score.
        for number, edge_list in enumerate(candidates):
            k2_value = K2Score(observations).score(BayesianModel(edge_list))
            bic_value = BicScore(observations).score(BayesianModel(edge_list))
            print('k2score model{0}: {1}		BicScore model{0}: {2}'.format(
                number, k2_value, bic_value))

    print(' \nestimating K2/BIC score of difference structures\n')
    report(h0Diff, diffModels)

    print(' \nestimating K2/BIC score of rarity structures\n')
    report(h0Rarity, rarityModels)
        'Cancer': cancer,
        'Age': age,
        'Tuberculose': tuberculosis,
        'TbOuCa': tbOrCa,
        'VisiteAsie': visitAsia,
        'Radiographie': xray,
        'Bronchite': bronchitis,
        'Dyspnea': dyspnea,
        'Geographie': geographical
    })
print(data)

# Structure learning: BIC-scored hill climbing over `data`.
from pgmpy.estimators import HillClimbSearch, BicScore

bic = BicScore(data)
hc = HillClimbSearch(data, scoring_method=bic)
best_model = hc.estimate()
print(best_model.edges())
# Re-reading the learned structure shows that the program finds the links but not their direction.
# The model with the correct directions would therefore be:
bon_model = BayesianModel([('Cancer', 'TbOuCa'), ('TbOuCa', 'Dyspnea'),
                           ('TbOuCa', 'Bronchite'), ('TbOuCa', 'Radiographie'),
                           ('Fumeur', 'Bronchite'),
                           ('Radiographie', 'Dyspnea'),
                           ('Tuberculose', 'TbOuCa'),
                           ('Bronchite', 'Dyspnea')])

# Parameter learning
#print("estimation des cpds :")
from pgmpy.estimators import BayesianEstimator
data2 = pd.DataFrame(data=raw_data2)

import time

t0 = time.time()
# Exhaustive search over data2: score every candidate structure with K2
# and report how long it took.
searcher = ExhaustiveSearch(data2, scoring_method=K2Score(data2))
search = searcher.all_scores()
print('time:', time.time() - t0)

# Uncomment for printout:
#for score, model in search:
#    print("{0}        {1}".format(score, model.edges()))

separator()

# Hill climbing with each scorer on each dataset. Every search and its
# scoring object are built on the same data.
hcs = HillClimbSearch(data, scoring_method=K2Score(data))
model = hcs.estimate()

hcs2 = HillClimbSearch(data2, scoring_method=K2Score(data2))
model2 = hcs2.estimate()

hcs_bic = HillClimbSearch(data, scoring_method=BicScore(data))
model_bic = hcs_bic.estimate()

hcs_bic2 = HillClimbSearch(data2, scoring_method=BicScore(data2))
model_bic2 = hcs_bic2.estimate()

# End of Task 6
Beispiel #19
0
# Date: 2020/12/21  15:38
import pandas as pd
import networkx as nx
from matplotlib import pyplot as plt
from pgmpy.models import BayesianModel
from pgmpy.estimators import HillClimbSearch
from pgmpy.estimators import BicScore

data = pd.read_csv(
    r'C:\Users\haomiaowu\Desktop\BN-Cheminformatics\Train-clear.csv')
# Build the BIC scorer once and share it with the search.
bic = BicScore(data)

# Learn the structure by hill climbing and print the resulting edges.
hs = HillClimbSearch(data, scoring_method=bic)
best_model = hs.estimate()
print(best_model.edges())

# Draw the learned network with labelled nodes.
nx.draw(
    best_model,
    with_labels=True,
    node_size=1000,
    font_weight='bold',
    node_color='y',
)

plt.show()
Beispiel #20
0
def annealing(maxsteps=1000, debug=True):
    """ Optimize the black-box function 'cost_function' with the simulated annealing algorithm.

    Reads 'Asia.csv', draws random candidate vectors until ``vetor_Rede``
    accepts one as the start state, then runs ``maxsteps`` steps in which
    a mutated candidate replaces the current state whenever
    ``acceptance_probability`` exceeds a fresh random number.

    Returns a tuple ``(state, cost_function(state, ...), states, costs)``
    where ``states`` / ``costs`` list the start state and every accepted
    state with its cost.

    NOTE(review): ``debug`` is only referenced in a commented-out line.
    ``deep_copy``, ``vetor_Rede``, ``cost_function``, ``temperature``,
    ``mutacao`` and ``acceptance_probability`` are defined outside this
    function; ``mutacao`` is called with eight arguments and is expected
    to return a ``[state, cost]`` pair -- confirm against its definition.
    """
    # Read the data
    with open('Asia.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        aux = 0
        data = []
        # One list per CSV column (eight columns are expected).
        data1 = [[] for i in range(8)]
        for row in csv_reader:
            data.append(row)
            for i in range(len(row)):
                data1[i].append(row[i])
            aux = aux + 1
            # Stop after 50001 rows have been read.
            if aux == 50001:
                break

    # Rebuild the data as {column name: values}; the first entry of each
    # column list is used as its name.
    data = {}
    for i in range(len(data1)):
        data[data1[i][0]] = [data1[i][j] for j in range(1, len(data1[i]))]
    data = pd.DataFrame(data)
    print("Data: ")
    print(data)  # Data read from the file
    prob = 0.5
    min_valor = 0
    max_valor = 2
    # Candidate vectors that vetor_Rede rejected.
    nao_dag = []
    # NOTE(review): this first node list is overwritten on the next line.
    nodes = ['Pollution', 'Smoker', 'Cancer', 'Xray', 'Dyspnoea']
    nodes = ['asia', 'tub', 'smoke', 'lung', 'bronc', 'either', 'xray', 'dysp']
    # One gene per unordered pair of nodes.
    ind_size = round((len(nodes) * len(nodes) - len(nodes)) / 2)
    ind = False
    # Draw random vectors until one is accepted as the start state.
    while ind == False:
        aux = [random.randint(min_valor, max_valor) for i in range(ind_size)]
        if aux not in nao_dag:
            G = vetor_Rede(aux, nodes)
            if G:
                state = deep_copy(aux)
                ind = True
            else:
                nao_dag.append(aux)
    print('state')
    print(state)
    bic_score = BicScore(data)
    print(vetor_Rede(state, nodes))
    cost = cost_function(state, bic_score, nodes)
    states, costs = [state], [cost]
    for step in range(maxsteps):
        print(step)
        # Fraction of the run completed so far, passed to temperature().
        fraction = step / float(maxsteps)
        T = temperature(fraction)
        #[new_state,new_cost]=pertubacao(deep_copy(state),deep_copy(cost),prob,max_valor,min_valor,bic_score,nodes,nao_dag)
        [new_state,
         new_cost] = mutacao(deep_copy(state), deep_copy(cost), prob,
                             max_valor, min_valor, bic_score, nodes, nao_dag)
        #new_cost = cost_function(new_state,bic_score,nodes)
        #if debug: print("Step #{:>2}/{:>2} : T = {:>4.3g}, state = {:>4.3g}, cost = {:>4.3g}, new_state = {:>4.3g}, new_cost = {:>4.3g} ...".format(step, maxsteps, T, state, cost, new_state, new_cost))

        # Accept the candidate when its acceptance probability exceeds a
        # fresh random number.
        if acceptance_probability(cost, new_cost, T) > random.random():

            state1 = new_state.copy()
            cost = deep_copy(new_cost)

            states.append(state1)
            costs.append(cost)
            state = deep_copy(state1)
            # print("  ==> Accept it!")
        # else:
        #    print("  ==> Reject it...")
    return state, cost_function(state, bic_score, nodes), states, costs
Beispiel #21
0
# Run the annealing search and draw the network built from its final state.
state, c, states, costs = annealing(maxsteps=3000, debug=True)
nodes = ['asia', 'tub', 'smoke', 'lung', 'bronc', 'either', 'xray', 'dysp']
G = vetor_Rede(state, nodes)
nx.draw(G, with_labels=True)
print(state)
print(c)
# Re-read the CSV column by column (eight columns are expected).
with open('Asia.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    aux = 0
    data = []
    data1 = [[] for i in range(8)]
    for row in csv_reader:
        data.append(row)
        for i in range(len(row)):
            data1[i].append(row[i])
        aux = aux + 1
        # Stop after 50001 rows have been read.
        if aux == 50001:
            break
#22376.39851240954
# Rebuild the data as {column name: values}; the first entry of each
# column list is used as its name.
data = {}
for i in range(len(data1)):
    data[data1[i][0]] = [data1[i][j] for j in range(1, len(data1[i]))]
data = pd.DataFrame(data)
print("Data: ")
print(data)  # Data read from the file
reader = BIFReader('asia.bif')  # best network for asia, as published on bnlearn.com
asia_model = reader.get_model()  # read that model
# Absolute BIC score of the reference model on the data read above.
print("Score BIC")
print(abs(BicScore(data).score(asia_model)))
#see_annealing(states, costs)
#print(dataPreparation.get_work_lists())
# Column names: the work lists plus a trailing "Person" column.
feature_names = dataPreparation.get_work_lists()
feature_names.append("Person")
print(feature_names)
#mydata = np.random.randint(low=0, high=2,size=(100, 6))
# Load the comma-separated data file as a numeric array.
mydata = np.genfromtxt(
    r'E:\Lessons_tutorials\Behavioural user profile articles\Datasets\7 twor.2009\twor.2009\converted\pgmpy\activities+time_ordered_withoutdatetime.csv',
    delimiter=",")
#pd.read_csv(r'E:\Lessons_tutorials\Behavioural user profile articles\Datasets\7 twor.2009\twor.2009\converted\pgmpy\data.csv')
#print(mydata)
data = pd.DataFrame(mydata, columns=feature_names)  #['X', 'Y'])
print(data)

# Scoring methods to try; only BIC is currently enabled.
list_of_scoring_methods = [
    BicScore(data),
    #BdeuScore(data),
    #K2Score(data)
]

# Run one timed hill-climbing search per scoring method.
for scoreMethod in list_of_scoring_methods:
    start_time = time.time()
    hc = HillClimbSearch(data, scoreMethod)
    best_model = hc.estimate()
    print(hc.scoring_method)
    print(best_model.edges())
    end_time = time.time()
    print("execution time in seconds:")
    print(end_time - start_time)

# Uses the model produced by the last loop iteration.
estimator = BayesianEstimator(best_model, data)
    def learn_structure(self,
                        file_path,
                        algorithm="hc",
                        significance_level=0.05):
        """
        Learn a Bayesian Network structure with `pgmpy` and save its edge list as a CSV file.

        Arguments:
            file_path: str, the absolute path to save the file to (e.g. "~/Desktop/BN_structure.csv")
            algorithm: str, optional (default = 'hc')
                'hc' runs hill climbing, 'pc' runs the Peter-Clark algorithm. Any other
                value learns nothing and writes no file. Note from the original author:
                a bug was found in the pgmpy implementation of 'pc'; do not use it.
            significance_level: float, optional (default = 0.05)
                Statistical significance cutoff used for pruning by the PC algorithm.
                Lower values produce sparser networks.

        Returns:
            None
        """
        self.structure_algorithm = algorithm

        if self.verbose:
            print(
                "Depending on the number of variables in your dataset, this might take some time..."
            )

        # Seed NumPy's global generator before learning.
        np.random.seed(self.random_seed)

        if algorithm not in ("hc", "pc"):
            return

        def announce():
            # Progress message shown right after the structure is learned.
            if self.verbose:
                print(
                    f"Structure learned! Saving structure to the following CSV: {file_path}"
                )

        def export_edges():
            # Tabular edge list: one row per directed edge.
            edge_table = pd.DataFrame(
                list(self.structure_model.edges),
                columns=["from_variable", "to_variable"],
            )
            edge_table.to_csv(file_path, index=False)

        if algorithm == "pc":
            self.filtered_df = self.df
            pc_estimator = ConstraintBasedEstimator(self.filtered_df)
            self.structure_model = pc_estimator.estimate(
                significance_level=significance_level)
            announce()
            export_edges()
            return

        # Hill climbing: first filter out columns with zero correlation
        # with the target variable.
        self.filtered_df = self._initial_filter()
        hc_search = HillClimbSearch(
            self.filtered_df, scoring_method=BicScore(self.filtered_df))
        self.structure_model = hc_search.estimate()
        announce()

        # Eliminate isolated subgraphs: keep only the nodes connected to
        # the target variable.
        undirected = self.structure_model.to_undirected()
        reachable = set(
            nx.algorithms.components.node_connected_component(
                undirected, self.target_variable))
        unreachable = list(set(list(self.structure_model.nodes)) - reachable)

        for node in unreachable:
            self.structure_model.remove_node(node)
            self.filtered_df.drop([node], axis=1, inplace=True)

        export_edges()
from pgmpy.estimators import HillClimbSearch, BicScore, BayesianEstimator
from pgmpy.models import BayesianModel
from pgmpy.readwrite.BIF import BIFWriter
import pandas as pd
import numpy as np
from time import time
import graphviz as gv
import os

# Load the data, keep only rows whose values sum to less than 200, and
# cap every value above 1 at 1.
train = pd.read_csv('../msnbcWithHeader.csv', sep=',')
train = train[train.sum(axis=1) < 200]
train[train > 1] = 1

# Time structure learning and parameter fitting together.
train_start = time()
bic = BicScore(train)
hc = HillClimbSearch(train, scoring_method=bic)
best_model = hc.estimate(prog_bar=True)
edges = best_model.edges()
# Rebuild a BayesianModel from the learned edges and fit its CPDs.
model = BayesianModel(edges)
model.fit(train, estimator=BayesianEstimator, prior_type="BDeu")
variables = model.nodes()

print(model.edges())
# Elapsed training time (a duration, despite the name).
train_end = time() - train_start
print("train time " + str(train_end))

# Mirror the learned network in a graphviz digraph (PNG output format).
my_graph = gv.Digraph(format='png')
for node in variables:
    my_graph.node(node)
for edge in edges:
    my_graph.edge(edge[0], edge[1])
Beispiel #25
0
    def learn(self, file1, file2):
        """Merge prior edges with a learned structure and print every CPT.

        Args:
            file1: Path of a UTF-8 text file. Its first line is parsed by
                ``self.getegdes`` into a flat sequence
                ``[src0, dst0, src1, dst1, ...]`` of prior edges.
            file2: Path of the CSV file holding the training data.

        The prior edges form a directed graph. Each edge ``(i, j)`` found
        by a BIC-scored hill climb on the data is added to it unless both
        endpoints are already present and a path from ``j`` to ``i``
        exists (adding it would close a cycle). For every node of the
        merged network the K2-prior CPD is estimated and one tab-separated
        line is printed: node name, number of distinct values, transposed
        CPT, the remaining variables of the CPD, and their numbers of
        distinct values.
        """
        # The context manager closes the file once it has been read.
        with open(file1, encoding="utf8") as f1:
            lines = f1.readlines()
        edges = self.getegdes(lines[0])
        data = pd.read_csv(file2)

        # Prior graph: consecutive pairs of the flat edge sequence.
        G = nx.DiGraph()
        for i in range(len(edges) // 2):
            G.add_edge(edges[2 * i], edges[2 * i + 1])

        est = HillClimbSearch(data, scoring_method=BicScore(data))
        model = est.estimate()
        G_ = nx.DiGraph()
        G_.add_edges_from(model.edges())

        # Add a learned edge when it brings in a new node, or when it
        # cannot close a cycle.
        for i, j in G_.edges():
            if i not in G or j not in G or not nx.has_path(G, j, i):
                G.add_edge(i, j)

        new_model = BayesianModel()
        new_model.add_edges_from(G.edges)
        G = new_model.copy()

        estimator = BayesianEstimator(G, data)

        print([str(edge) for edge in G.edges])
        for node in G.nodes:
            cpd = estimator.estimate_cpd(node, prior_type="K2")
            valueNum = len(data[node].value_counts())
            CPT = np.transpose(cpd.values)
            # Every variable of the CPD after the first one.
            sequence = cpd.variables[1::]
            card = [len(data[x].value_counts()) for x in sequence]
            output = node + '\t' + str(valueNum) + '\t' + str(
                CPT.tolist()) + '\t' + str(sequence) + '\t' + str(card)
            print(output)
Beispiel #26
0
import pandas as pd
from pgmpy.estimators import HillClimbSearch, ExhaustiveSearch
from pgmpy.estimators import BDeuScore, BicScore, K2Score
## Structure learning
# Load the observations; the CSV is read with the GB18030 encoding.
raw_table = pd.read_csv('data.csv', encoding='gb18030')
frame = pd.DataFrame(raw_table)
bic_scorer = BicScore(frame)
k2_scorer = K2Score(frame)  # only referenced by the exhaustive-search alternative below
searcher = HillClimbSearch(frame, scoring_method=bic_scorer)
# searcher = ExhaustiveSearch(frame, k2_scorer)
learned_structure = searcher.estimate()
# Show every edge of the learned structure, one per line.
for edge in learned_structure.edges():
    print(edge)



## Parameter learning
from pgmpy.models import BayesianModel
fitted_model = BayesianModel(learned_structure.edges())
fitted_model.fit(frame)
# Show every conditional probability distribution estimated by fit().
for table in fitted_model.get_cpds():
    print(table)

# print(fitted_model.local_independencies('HA'))

## Model inference
from pgmpy.inference import VariableElimination, BeliefPropagation
inference_engine = VariableElimination(fitted_model)
# Query the distribution of 'HA' with no evidence supplied.
posterior = inference_engine.query(variables=['HA'])
print(posterior)
Beispiel #27
0
# Only the column labels of this file are used: they name the data set's fields.
col_names = pd.read_csv('data/names.csv')  # 'data/names.csv'
data = pd.read_csv('data/breast-cancer-wisconsin.data',
                   names=col_names.columns)
# Discard rows whose "bare_nuclei" entry is the '?' placeholder.
data = data[data["bare_nuclei"] != '?']
data = data.set_index('id')  # stop the model from using id as a node

train, test = train_test_split(data, test_size=0.2, random_state=0)
Y_test = test['class']
test = test.drop(['class'], axis=1)

# Convert labels to something that can be handled by sklearn's eval functions
labelencoder = LabelEncoder()
Y_test = labelencoder.fit_transform(Y_test.values.ravel())

### Greedy Structure Learning with Hill Climbing
# Search over and score against the training split only, so that the held-out
# rows cannot influence the learned structure.
hc = HillClimbSearch(train, scoring_method=BicScore(train))
hc_model = hc.estimate()

### Parameter Learning with Bayesian Estimation
hc_model.fit(train, estimator=BayesianEstimator, prior_type="BDeu")
### If the following for loop is un-commented the terminal will be flooded with CPDs
# for cpd in hc_model.get_cpds():
#     print(cpd)

print()

### Another Method (it will throw errors about sample size - but it still runs and shouldn't be too messed up)
### Constraint Based Structure Learning
est = ConstraintBasedEstimator(train)
Beispiel #28
0
 def __init__(self, dataframe):
     """Create the BIC scorer for ``dataframe`` and an empty ``hashed_local_scores`` dict."""
     bic_scorer = BicScore(dataframe)
     self.estimator = bic_scorer
     self.hashed_local_scores = dict()
Beispiel #29
0
# Register every column of the data set as a node of the model.
model.add_nodes_from(data.columns.values)

# Learn temporal relations from data
model.learn_temporal_relationships(data)

# Remove the start/end time columns and relabel the remaining columns that do
# not carry the temporal-node prefix (1 -> 'Y', -1 -> 'N').
data.fillna(0, inplace=True)
for column in list(data.columns.values):
    is_time_column = (column.endswith(ITBN.start_time_marker)
                      or column.endswith(ITBN.end_time_marker))
    if is_time_column:
        data.drop(columns=column, inplace=True)
        continue
    if not column.startswith(ITBN.temporal_node_marker):
        data[column] = data[column].map({1: 'Y', -1: 'N'})

# Learn model structure from data and temporal relations, starting the search
# from the model built above.
hc = HillClimbSearchITBN(data, scoring_method=BicScore(data))
model = hc.estimate(start=model)
# model.add_edge('response', 'command')
# model.add_edge('response', 'tm_response_command')
# model.add_edge('command', 'tm_response_command')

# Learn model parameters
model.fit(data)

# Add observation nodes and cpds
obs_edges = []
obs_cpds = []
state_names = {
    'command': ['N', 'Y'],
    'prompt': ['N', 'Y'],
    'reward': ['N', 'Y'],
Beispiel #30
0
class MDL_Scorer:
    """Score networks as the sum of cached, absolute BIC local scores.

    Each local score is ``abs(BicScore.local_score(node, parents))`` and is
    memoised in ``hashed_local_scores``. The selection helpers treat a lower
    total as better.

    A network object must provide ``num_nodes()``, ``get_parents(i)`` and
    ``node_names(index_or_indices)``.
    """

    def __init__(self, dataframe):
        """Build the BIC estimator for ``dataframe`` and an empty score cache."""
        self.estimator = BicScore(dataframe)
        self.hashed_local_scores = {}

    def local_score(self, node_name, parent_names):
        """Return the absolute local score of ``node_name`` given ``parent_names``.

        Results are cached under ``node_name + str(parent_names)``, so the
        estimator is consulted only once per distinct node/parents pair.
        """
        key = node_name + str(parent_names)
        if key not in self.hashed_local_scores:
            score = abs(self.estimator.local_score(node_name, parent_names))
            self.hashed_local_scores[key] = score

        return self.hashed_local_scores[key]

    def score(self, network, verbose=0):
        """Return the sum of the local scores of every node in ``network``.

        ``verbose`` above 2 prints start/end markers for the whole run;
        above 3 it also prints per-node progress.
        """
        total = 0
        if verbose > 2:
            print("starting scoring")
        for i in range(network.num_nodes()):
            if verbose > 3:
                print("node", i)

            parents = network.get_parents(i)  # get parents

            node_name = network.node_names(i)
            parent_names = network.node_names(parents)
            if verbose > 3:
                print("starting local score")
            local_score_ = self.local_score(node_name, parent_names)
            if verbose > 3:
                print("ended local score")
            total += local_score_
        if verbose > 2:
            print("ended scoring")

        return total

    # this performance can be improved
    def n_lowest_score(self,
                       n,
                       networks,
                       score_history_list,
                       network_history_list,
                       verbose=False):
        """Return the ``n`` lowest-scoring networks, best first.

        All scores and networks, ordered by ascending score, are appended in
        place to ``score_history_list`` and ``network_history_list``. An
        empty ``networks`` yields an empty list and leaves the histories
        unchanged.
        """
        scored = [(self.score(network, verbose=verbose), network)
                  for network in networks]
        # Stable sort on the score only: networks are never compared with each
        # other and ties keep their input order.
        scored.sort(key=lambda pair: pair[0])

        ordered_scores = [pair[0] for pair in scored]
        ordered_networks = [pair[1] for pair in scored]
        score_history_list += ordered_scores
        network_history_list += ordered_networks

        return ordered_networks[:n]

    def lowest_score(self, networks, verbose=False):
        """Return a dict describing the lowest-scoring network.

        The dict has the keys ``'best_index'``, ``'best_score'`` and
        ``'best_network'``. For an empty ``networks`` they stay at ``-1``,
        ``inf`` and ``None``. On ties the first network wins.
        """
        result = {
            'best_index': -1,
            'best_score': float('inf'),
            'best_network': None
        }

        for index, network in enumerate(networks):
            # Forward verbose so it is honoured here as in n_lowest_score.
            local = self.score(network, verbose=verbose)

            if local < result['best_score']:
                result['best_index'] = index
                result['best_score'] = local
                result['best_network'] = network

        return result