def buildBN(trainingData, binstyleDict, numbinsDict, **kwargs):
    # need to modify to accept skel or skelfile
    discretized_training_data, bin_ranges = discretizeTrainingData(
        trainingData, binstyleDict, numbinsDict, True)
    print 'discret training ', discretized_training_data

    if 'skel' in kwargs:
        # load file into skeleton
        if isinstance(kwargs['skel'], basestring):
            skel = GraphSkeleton()
            skel.load(kwargs['skel'])
            skel.toporder()
        else:
            skel = kwargs['skel']

    # learn bayesian network
    learner = PGMLearner()
    # baynet = learner.discrete_mle_estimateparams(skel, discretized_training_data)
    # baynet = discrete_estimatebn(learner, discretized_training_data, skel, 0.05, 1)
    # using discrete_mle_estimateparams2 written as a function in this file,
    # not calling from libpgm
    baynet = discrete_mle_estimateparams2(skel, discretized_training_data)
    return baynet
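A minimal usage sketch: the records, bin-style labels, and skeleton filename below are made-up assumptions, and discretizeTrainingData / discrete_mle_estimateparams2 are helpers defined elsewhere in this file.

# Hypothetical call; data, bin styles and 'skeleton.json' are illustrative.
training = [{'A': 0.1, 'B': 1.2}, {'A': 0.4, 'B': 0.9}]  # toy continuous records
binstyles = {'A': 'even', 'B': 'even'}                   # assumed style labels
numbins = {'A': 2, 'B': 2}
bn = buildBN(training, binstyles, numbins, skel='skeleton.json')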
def estimate_distrib(skel, samples, query, evidence):
    learner = PGMLearner()
    bayesnet = learner.discrete_mle_estimateparams(skel, samples)
    tablecpd = TableCPDFactorization(bayesnet)
    fac = tablecpd.condprobve(query, evidence)
    df2 = printdist(fac, bayesnet)
    return df2
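A sketch of how estimate_distrib might be called, with toy records; printdist is a helper defined elsewhere in this project.

skel = GraphSkeleton()
skel.load("bn_struct.txt")
skel.toporder()
samples = [{'A': '0', 'B': '1'}, {'A': '1', 'B': '1'}]  # toy records
# posterior over A given B = '1'
df = estimate_distrib(skel, samples, query={'A': '1'}, evidence={'B': '1'})
print df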
def test_libpgm(df1):
    data = df1.T.to_dict().values()
    #pprint(data)
    skel = GraphSkeleton()
    skel.load("bn_struct.txt")
    learner = PGMLearner()
    result = learner.discrete_mle_estimateparams(skel, data)
    print json.dumps(result.Vdata, indent=2)
def learnBN(fdata_array, bn_file):
    bn_path = os.path.join(experiment_dir, 'parameters', bn_file + '.txt')

    skel = GraphSkeleton()
    skel.load(bn_path)
    skel.toporder()

    learner = PGMLearner()
    bn = learner.discrete_mle_estimateparams(skel, fdata_array)
    return bn
def getBNparams(graph, ddata, n):
    # Gets discrete BN parameters given a graph skeleton.
    # The skeleton should include t-1 and t nodes for each variable.
    nodes = range(1, (n * 2) + 1)
    nodes = map(str, nodes)
    edges = gk.edgelist(graph)
    for i in range(len(edges)):
        edges[i] = list([edges[i][0], str(n + int(edges[i][1]))])
    skel = GraphSkeleton()
    skel.V = nodes
    skel.E = edges
    learner = PGMLearner()
    result = learner.discrete_mle_estimateparams(skel, ddata)
    return result
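A small worked example of the edge rewrite above, assuming n = 3 variables: node ids 1..n form the t-1 slice and n+1..2n the t slice, so every remapped edge points from a t-1 node to a t node.

n = 3
edges = [('2', '3')]                                   # variable 2 -> variable 3
remapped = [[e[0], str(n + int(e[1]))] for e in edges]
print remapped  # [['2', '6']]: variable 2 at t-1 parents variable 3 at t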
def __init__(self):
    self.learner = PGMLearner()

    rospy.Service("~discrete/parameter_estimation",
                  DiscreteParameterEstimation,
                  self.discrete_parameter_estimation_cb)
    rospy.Service("~discrete/query", DiscreteQuery, self.discrete_query_cb)
    rospy.Service("~discrete/structure_estimation",
                  DiscreteStructureEstimation,
                  self.discrete_structure_estimation_cb)
    rospy.Service("~linear_gaussian/parameter_estimation",
                  LinearGaussianParameterEstimation,
                  self.lg_parameter_estimation_cb)
    rospy.Service("~linear_gaussian/structure_estimation",
                  LinearGaussianStructureEstimation,
                  self.lg_structure_estimation_cb)
def em(data, bn, skel):
    lk_last = 100
    times = 0
    while 1:
        d2 = data_with_hidden(data, bn)
        learner = PGMLearner()  # toolbox
        bn = learner.discrete_mle_estimateparams(skel, d2)  # toolbox
        lk = likelihood(d2, bn)
        print "LogLikelihood:", lk
        times += 1
        if abs((lk - lk_last) / lk_last) < 0.001:
            break
        lk_last = lk
    print times
    return bn
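A usage sketch for the EM loop, assuming the data_with_hidden() and likelihood() helpers from this file, toy records, and a hypothetical skeleton file name; the initial bn seeds the first expectation step.

skel = GraphSkeleton()
skel.load("skel.json")  # assumed file name
skel.toporder()
learner = PGMLearner()
seed_data = [{'A': '0', 'B': '1'}, {'A': '1', 'B': '0'}]    # toy complete records
bn0 = learner.discrete_mle_estimateparams(skel, seed_data)  # initial parameter guess
bn_final = em(seed_data, bn0, skel)  # em() fills hidden values via data_with_hidden()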
def net2():
    nd = NodeData()
    skel = GraphSkeleton()
    nd.load("net.txt")    # an input file
    skel.load("net.txt")
    # topologically order graphskeleton
    skel.toporder()
    # load bayesian network
    lgbn = LGBayesianNetwork(skel, nd)
    in_data = read_data.getdata2()
    learner = PGMLearner()
    bn = learner.lg_mle_estimateparams(skel, in_data)
    p = cal_prob(in_data[300:500], bn)
    print p
    return 0
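For reference, a minimal sketch of what a linear-Gaussian input file such as "net.txt" contains in libpgm's format, written here as a Python dict with made-up values; lg nodes carry mean_base, mean_scal, and variance.

net = {
    "V": ["X", "Y"],
    "E": [["X", "Y"]],
    "Vdata": {
        "X": {"mean_base": 0.0, "mean_scal": [], "variance": 1.0,
              "parents": None, "children": ["Y"], "type": "lg"},
        "Y": {"mean_base": 1.0, "mean_scal": [0.5], "variance": 0.5,
              "parents": ["X"], "children": None, "type": "lg"},
    },
}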
def bayesNet(textFile):
    cleanText(textFile, 'tempOutput.txt')

    ## imports textFile into pandas
    try:
        df = pd.read_csv('tempOutput.txt', sep='\s+', dtype='float32', header=None)
    except:
        print 'next file'
        return

    df.fillna(0, inplace=True)
    df.convert_objects(convert_numeric=True)

    for i, row in df.iterrows():
        print df.ix[0, i]
        df.ix[0, i] = df.ix[0, i] + str(i)

    grouped = df.set_index([0], verify_integrity=True)
    df2 = grouped.to_dict()
    print json.dumps(df2, indent=2)

    newDict = []
    for key in df2.keys():
        newDict.append(df2[key])
    #print json.dumps(newDict, indent=2)

    # instantiate my learner
    learner = PGMLearner()
    # estimate structure
    result = learner.lg_constraint_estimatestruct(newDict)
    # output
    return json.dumps(result.E, indent=2)
def calc_accuracy(dff_train, dff_train_target, nb_iterations):
    # NOTE: skel, format_data() and calc_BNprob() are module-level helpers
    # defined elsewhere.
    result = np.zeros(nb_iterations)
    for itera in range(nb_iterations):
        XX_train, XX_test, yy_train, yy_test = train_test_split(
            dff_train, dff_train_target, test_size=0.33)
        data4bn = format_data(XX_train)
        learner = PGMLearner()
        # estimate parameters
        result_bn = learner.discrete_mle_estimateparams(skel, data4bn)
        #result_bn.Vdata
        result_predict = calc_BNprob(XX_test)
        BN_test_probs = pd.DataFrame()
        BN_test_probs['ground_truth'] = yy_test
        Test_prob = pd.concat(
            [yy_test.reset_index().Surv, result_predict],
            axis=1, ignore_index=True).rename(
                columns={0: 'ground_truth', 1: 'class_resu'})
        accuracy = (Test_prob[Test_prob.ground_truth == Test_prob.class_resu].shape[0]
                    / (1.0 * Test_prob.shape[0]))
        #print("Accuracy is {}".format(accuracy))
        result[itera] = accuracy
    return result
def setUp(self):
    # instantiate learner
    self.l = PGMLearner()

    # generate graph skeleton
    skel = GraphSkeleton()
    skel.load("unittestdict.txt")
    skel.toporder()

    # generate sample sequence to try to learn from - discrete
    nd = NodeData.load("unittestdict.txt")
    self.samplediscbn = DiscreteBayesianNetwork(nd)
    self.samplediscseq = self.samplediscbn.randomsample(5000)

    # generate sample sequence to try to learn from - linear Gaussian
    nda = NodeData.load("unittestlgdict.txt")
    self.samplelgbn = LGBayesianNetwork(nda)
    self.samplelgseq = self.samplelgbn.randomsample(10000)

    self.skel = skel
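A sketch of a test that could run against this fixture; the node name is assumed from libpgm's bundled unittestdict.txt (the student network), so treat it as illustrative.

def test_discrete_mle_estimateparams(self):
    # relearn parameters from the sampled sequence
    result = self.l.discrete_mle_estimateparams(self.skel, self.samplediscseq)
    self.assertTrue("Intelligence" in result.Vdata)  # node name assumed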
def bayesNetDiscrete(textFile, quant_no, unique):
    cleanText(textFile, 'tempOutput.txt')

    ## imports textFile into pandas
    try:
        df = pd.read_csv('tempOutput.txt', sep='\s+', dtype='float64', header=None)
    except:
        print 'next file'
        return

    df.fillna(0, inplace=True)
    df.convert_objects(convert_numeric=True)

    ## set to either setUnique() or setMax()
    if unique is True:
        grouped = setUnique(df)
    else:
        grouped = setMax(df)

    ## quantiles is qcut(), fixed width divisions is cut()
    grouped = quantize(quant_no, grouped)

    # turns into correct dictionary format for libpgm
    newDict = DFtoLibpgm(grouped)

    # instantiate my learner
    learner = PGMLearner()
    # estimate structure
    try:
        result = learner.discrete_estimatebn(newDict)
    except:
        print 'error'
        #result = learner.discrete_estimatebn([dict([('a',1),('b',2)])])
        return

    # output
    return result
def bayesNetCont(textFile, unique):
    cleanText(textFile, 'tempOutput.txt')

    ## imports textFile into pandas
    try:
        df = pd.read_csv('tempOutput.txt', sep='\s+', dtype='float64', header=None)
    except:
        print 'next file'
        return

    df.fillna(0, inplace=True)
    df.convert_objects(convert_numeric=True)

    ## set to either setUnique() or setMax()
    if unique is True:
        grouped = setUnique(df)
    else:
        grouped = setMax(df)

    # turns into correct dictionary format for libpgm
    newDict = DFtoLibpgm(grouped)

    # instantiate my learner
    learner = PGMLearner()
    # estimate structure (gaussian)
    try:
        result = learner.lg_constraint_estimatestruct(newDict)
    except:
        print 'error'
        return

    # output
    return result
def main():
    # filename
    features_file = './../data/features.csv'

    # read data into list
    handwriting_features = postmaster.readCSVIntoListAsDict(features_file)

    # learn structure
    # instantiate learner
    learner = PGMLearner()
    pvalue = 0.25
    indegree = 1

    # estimate structure
    #result = learner.discrete_constraint_estimatestruct(
    #    handwriting_features, pvalue, indegree)
    result = learner.discrete_estimatebn(handwriting_features)
    #result = learner.discrete_condind(handwriting_features, 'f1', 'f2',
    #    ['f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9'])

    # output
    #print result.chi, result.pval, result.U
    #print json.dumps(result.E, indent=2)
    print json.dumps(result.Vdata, indent=2)
def fun(inputData):
    # Defining the data-formatting method
    def format_data(df):
        result = []
        for row in df.itertuples():
            #print(row.Pclass)
            result.append(
                dict(great=row.great, good=row.good, clean=row.clean,
                     comfortable=row.comfortable, bad=row.bad, old=row.old,
                     Cleanliness=row.Cleanliness, Location=row.Location,
                     Service=row.Service, Rooms=row.Rooms, Value=row.Value,
                     Overall=row.Overall))
        return result

    # load all preprocessed training data
    df = pd.read_csv('features.csv', sep=',')

    # format data so it can be processed correctly by libpgm functions
    node_data = format_data(df)

    skel = GraphSkeleton()
    # load the structure of our net
    skel.load("./our-skel.txt")
    # set the topological order
    skel.toporder()

    # learner which will estimate parameters and, if needed, the net structure
    learner = PGMLearner()

    # estimating parameters for our own model
    res = learner.discrete_mle_estimateparams(skel, node_data)

    # get CPT
    a = TableCPDFactorization(res)

    # compute the query and evidences as dicts
    query = dict(Overall=1)
    # prepare dictionary of values (after the equals signs, put the values
    # read from the GUI)
    evidence = dict(Value=inputData[0], Location=inputData[1],
                    Cleanliness=inputData[2], Service=inputData[3],
                    Rooms=inputData[4], bad=inputData[5], old=inputData[6],
                    good=inputData[7], great=inputData[8],
                    comfortable=inputData[9], clean=inputData[10])
    print(query)
    print(evidence)

    # run the query given evidence
    result = a.condprobve(query, evidence)
    print json.dumps(result.vals, indent=2)

    #res.Vdata["Overall"]["vals"][pos]
    #arr = []
    dizionario = {}
    for i in range(1, 6):
        dizionario[res.Vdata["Overall"]["vals"][i - 1]] = result.vals[i - 1]
        #arr.append(dizionario)
    #print(str(arr))
    return dizionario
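A hypothetical call: fun() expects the eleven evidence values in the order Value, Location, Cleanliness, Service, Rooms, bad, old, good, great, comfortable, clean; the numbers below are made up.

scores = fun([3, 4, 4, 3, 3, 0, 0, 1, 1, 1, 1])
print scores  # maps each Overall rating value to its posterior probability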
def bn_learn(attr, cicli, passed_file):
    path_to_sentiments = 'sentiment_AFINN'
    print "Using AFINN sentiment dictionary"

    if attr == 0:
        print "Considering tweets' number"
    elif attr == 1:
        print "Considering averaged number of positive, negative and neutral tweets"
    elif attr == 2:
        print "Considering averaged value of positive and negative tweets"
    elif attr == 3:
        print "Considering positive and negative tweets' increment"
    elif attr == 4:
        print "Considering bullishness index obtained from the number of tweets per sentiment"
    elif attr == 5:
        print "Considering bullishness index obtained from the tweets' sentiment values"
    print "And considering market trend"

    all_data = []
    files = [path_to_sentiments + "/" + file
             for file in os.listdir(path_to_sentiments)
             if file.endswith('.json')]
    for file in files:
        with open(file) as sentiment_file:
            data = json.load(sentiment_file)
            vdata = {}
            if attr == 0:
                vdata["com"] = data["n_tweets"]
            elif attr == 1:
                vdata["pos"] = data["n_pos_ave"]
                vdata["neg"] = data["n_neg_ave"]
                vdata["neu"] = data["n_neu_ave"]
            elif attr == 2:
                vdata["pos"] = data["pos_val_ave"]
                vdata["neg"] = data["neg_val_ave"]
            elif attr == 3:
                vdata["pos"] = data["pos_inc"]
                vdata["neg"] = data["neg_inc"]
            elif attr == 4:
                vdata["com"] = data["bull_ind"]
            elif attr == 5:
                vdata["com"] = data["bull_ind_val"]
            vdata["market"] = data["market_inc"]
            all_data.append(vdata)

    skel = GraphSkeleton()
    if len(all_data[0]) == 2:
        skel.load("network_struct_1_vertex.json")
        print "Loading structure with 2 nodes"
    elif len(all_data[0]) == 3:
        skel.load("network_struct_2_vertex.json")
        print "Loading structure with 3 nodes"
    elif len(all_data[0]) == 4:
        skel.load("network_struct_3_vertex.json")
        print "Loading structure with 4 nodes"
    skel.toporder()

    learner = PGMLearner()
    result = learner.lg_mle_estimateparams(skel, all_data)
    for key in result.Vdata.keys():
        result.Vdata[key]['type'] = 'lg'

    prob_pos = prob_neg = prob_neu = 0
    for data in all_data:
        if data['market'] == 1:
            prob_pos += 1
        elif data['market'] == 0:
            prob_neu += 1
        else:
            prob_neg += 1
    prob_pos = float(prob_pos) / float(len(all_data))
    prob_neg = float(prob_neg) / float(len(all_data))
    prob_neu = float(prob_neu) / float(len(all_data))

    # replace the learned linear-Gaussian 'market' node with a discrete node
    # holding the empirical class probabilities
    tmp = {}
    tmp['numoutcomes'] = len(all_data)
    tmp['cprob'] = [prob_pos, prob_neg, prob_neu]
    tmp['parents'] = result.Vdata['market']['parents']
    tmp['vals'] = ['positive', 'negative', 'neutral']
    tmp['type'] = 'discrete'
    tmp['children'] = result.Vdata['market']['children']
    result.Vdata['market'] = tmp

    node = Discrete(result.Vdata["market"])
    print "Loading node as Discrete"

    estimated, real = mcmc_json(passed_file, attr, cicli, node)
    return estimated, real
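A hypothetical call with made-up arguments: attribute mode 2 (averaged sentiment values), 1000 MCMC cycles, and an assumed input file name.

estimated, real = bn_learn(attr=2, cicli=1000, passed_file='market_2016_01.json')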
[{'Class': 3, 'Fare': 0, 'Sex': 1, 'Surv': 0},
 {'Class': 1, 'Fare': 1, 'Sex': 0, 'Surv': 1},
 {'Class': 3, 'Fare': 0, 'Sex': 0, 'Surv': 1},
 {'Class': 1, 'Fare': 1, 'Sex': 0, 'Surv': 1},
 ...]

# In[ ]:

nd = NodeData()
skel = GraphSkeleton()
# The structure is defined in the file titanic_skel.json
jsonpath = "titanic_skel.json"
skel.load(jsonpath)

# instantiate the learner
learner = PGMLearner()

# The method estimates the parameters for a discrete Bayesian network with
# a structure given by the graph skeleton, maximizing the probability of
# the given data
result_params = learner.discrete_mle_estimateparams(skel, training_data)

result_params.Vdata['Class']  # to inspect the network

# Check the prediction accuracy

# In[ ]:

#results = calc_accuracy(dff_train, dff_train_target, 100)
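The learned network can also be queried directly; a sketch using libpgm's TableCPDFactorization, with query and evidence values assumed from the training records shown above.

from libpgm.tablecpdfactorization import TableCPDFactorization

tcf = TableCPDFactorization(result_params)
# P(Surv = 1 | Sex = 0); the outcome values are assumed from the sample records
print tcf.specificquery(dict(Surv=[1]), dict(Sex=0))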
import json
import string

from libpgm.graphskeleton import GraphSkeleton
from libpgm.tablecpdfactorization import TableCPDFactorization
from libpgm.pgmlearner import PGMLearner

text = open("../unifiedMLData2.json")
data = text.read()
printable = set(string.printable)
asciiData = filter(lambda x: x in printable, data)
#listofDicts = json.dumps(data)
listofDicts = json.loads(asciiData)

skel = GraphSkeleton()
skel.load("../skeleton.json")

learner = PGMLearner()
result = learner.discrete_mle_estimateparams(skel, listofDicts)
tcf = TableCPDFactorization(result)

# Rating 1 given occupation is student
myquery = dict(rating=[1])
myevidence = dict(occupation='student')
result = tcf.specificquery(query=myquery, evidence=myevidence)
print result
tcf.refresh()

# Rating 2 given occupation is student
myquery = dict(rating=[2])
def learnDiscreteBN_with_structure(df, continous_columns, features_column_names,
                                   label_column='cat', draw_network=False):
    features_df = df.copy()
    features_df = features_df.drop(label_column, axis=1)

    labels_df = DataFrame()
    labels_df[label_column] = df[label_column].copy()

    # bin each continuous column into five equal-width intervals
    for i in continous_columns:
        bins = np.arange((min(features_df[i])), (max(features_df[i])),
                         ((max(features_df[i]) - min(features_df[i])) / 5.0))
        features_df[i] = pandas.np.digitize(features_df[i], bins=bins)

    data = []
    for index, row in features_df.iterrows():
        row_dict = {}
        for i in features_column_names:
            row_dict[i] = row[i]
        row_dict[label_column] = labels_df[label_column][index]
        data.append(row_dict)

    print "Init done"

    # build the skeleton: every feature is a parent of the label node
    learner = PGMLearner()
    graph = GraphSkeleton()
    graph.V = []
    graph.E = []
    graph.V.append(label_column)
    for vertice in features_column_names:
        graph.V.append(vertice)
        graph.E.append([vertice, label_column])

    test = learner.discrete_mle_estimateparams(graphskeleton=graph, data=data)
    print "done learning"

    edges = test.E
    vertices = test.V
    probas = test.Vdata
    # print probas

    dot_string = 'digraph BN{\n'
    dot_string += 'node[fontname="Arial"];\n'

    dataframes = {}
    print "save data"
    for vertice in vertices:
        print "New vertice: " + str(vertice)
        dataframe = DataFrame()
        pp = pprint.PrettyPrinter(indent=4)
        # pp.pprint(probas[vertice])
        dot_string += vertice.replace(" ", "_") + ' [label="' + vertice + '\n' + '" ]; \n'

        if len(probas[vertice]['parents']) == 0:
            # root node: one (outcome, probability) row per value
            dataframe['Outcome'] = None
            dataframe['Probability'] = None
            vertex_dict = {}
            for index_outcome, outcome in enumerate(probas[vertice]['vals']):
                vertex_dict[str(outcome)] = probas[vertice]["cprob"][index_outcome]
            od = collections.OrderedDict(sorted(vertex_dict.items()))
            # print "Vertice: " + str(vertice)
            # print "%-7s|%-11s" % ("Outcome", "Probability")
            # print "-------------------"
            for k, v in od.iteritems():
                # print "%-7s|%-11s" % (str(k), str(round(v, 3)))
                dataframe.loc[len(dataframe)] = [k, v]
            dataframes[vertice] = dataframe
        else:
            # node with parents: one row per (outcome, parent combination)
            # pp.pprint(probas[vertice])
            dataframe['Outcome'] = None
            vertexen = {}
            for index_outcome, outcome in enumerate(probas[vertice]['vals']):
                temp = []
                for parent_index, parent in enumerate(probas[vertice]["parents"]):
                    # print str([str(float(index_outcome))])
                    temp = probas[vertice]["cprob"]
                    dataframe[parent] = None
                vertexen[str(outcome)] = temp
            dataframe['Probability'] = None
            od = collections.OrderedDict(sorted(vertexen.items()))
            # [str(float(i)) for i in ast.literal_eval(key)]
            # str(v[key][int(float(k))-1])
            # print "Vertice: " + str(vertice) + " with parents: " + str(probas[vertice]['parents'])
            # print "Outcome" + "\t\t" + '\t\t'.join(probas[vertice]['parents']) + "\t\tProbability"
            # print "------------" * len(probas[vertice]['parents']) * 3
            # pp.pprint(od.values())
            counter = 0
            # print number_of_cols
            for outcome, cprobs in od.iteritems():
                for key in cprobs.keys():
                    array_frame = []
                    array_frame.append((outcome))
                    print_string = str(outcome) + "\t\t"
                    for parent_value, parent in enumerate([i for i in ast.literal_eval(key)]):
                        # print "parent-value:" + str(parent_value)
                        # print "parent:" + str(parent)
                        array_frame.append(int(float(parent)))
                        # print "lengte array_frame: " + str(len(array_frame))
                        print_string += parent + "\t\t"
                    array_frame.append(cprobs[key][counter])
                    # print "lengte array_frame (2): " + str(len(array_frame))
                    # print cprobs[key][counter]
                    print_string += str(cprobs[key][counter]) + "\t"
                    # for stront in [str(round(float(i), 3)) for i in ast.literal_eval(key)]:
                    #     print_string += stront + "\t\t"
                    # print "print string: " + print_string
                    # print "array_frame:" + str(array_frame)
                    dataframe.loc[len(dataframe)] = array_frame
                counter += 1
            print "Vertice " + str(vertice) + " done"
            dataframes[vertice] = dataframe

    for edge in edges:
        dot_string += edge[0].replace(" ", "_") + ' -> ' + edge[1].replace(" ", "_") + ';\n'
    dot_string += '}'

    # src = Source(dot_string)
    # src.render('../data/BN', view=draw_network)
    # src.render('../data/BN', view=False)
    print "vizualisation done"
    return dataframes
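A hypothetical call to the function above; the DataFrame, its column names, and the continuous subset are made up.

# df is assumed to be a pandas DataFrame with a discrete 'cat' label column
tables = learnDiscreteBN_with_structure(
    df,
    continous_columns=['age', 'income'],            # binned into 5 intervals
    features_column_names=['age', 'income', 'sex'],
    label_column='cat')
print tables['cat']  # CPT of the label node as a DataFrame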