def train_joke_type_selection():
    """Learn a network structure from the stored joke-preference counters.

    Every row of the ``Jokes`` table is loaded, the per-type counters
    (``nerd_joke``, ``weird_joke``, ``cat_meme``, ``dog_meme``, ``dad_joke``)
    are summed across all rows, and each joke-type label is repeated once per
    counted preference.  The labels form a single-column data frame
    (``joke_preference``) that is handed to an exhaustive, BIC-scored
    structure search.

    Returns:
        The best-scoring model reported by ``ExhaustiveSearch.estimate()``.
    """
    # one table
    jokes = Jokes.query.all()

    # (attribute on a Jokes row, label written to the training frame)
    joke_types = (
        ("nerd_joke", "nerd joke"),
        ("weird_joke", "weird joke"),
        ("cat_meme", "cat meme"),
        ("dog_meme", "dog meme"),
        ("dad_joke", "dad joke"),
    )

    joke_preferences = []
    for attribute, label in joke_types:
        total = sum(getattr(joke, attribute) for joke in jokes)
        joke_preferences.extend([label] * total)

    # Build the frame in one step: DataFrame.append was removed in pandas 2.0
    # and copying the frame once per row was quadratic in the number of rows.
    data = pd.DataFrame({"joke_preference": joke_preferences})

    # The interactive `code.interact` console that used to sit here was a
    # debugging leftover; it blocked every call until a human closed it.
    bic = BicScore(data)
    es = ExhaustiveSearch(data, scoring_method=bic)
    return es.estimate()
def _exhaustivesearch(df, scoretype='bic', return_all_dags=False, verbose=3):
    """Run an exhaustive structure search over ``df`` and collect the results.

    Parameters
    ----------
    df : data passed to the scoring method and to ``ExhaustiveSearch``.
    scoretype : name forwarded to ``_SetScoringType`` to pick the score.
    return_all_dags : when true, also store every scored DAG (in reversed
        ``all_scores()`` order) and plot the score curve.
    verbose : accepted for interface compatibility; not used in this body.

    Returns
    -------
    dict with keys ``model`` and ``model_edges`` and, when
    ``return_all_dags`` is set, ``scores`` and ``dag``.
    """
    scorer = _SetScoringType(df, scoretype)
    searcher = ExhaustiveSearch(df, scoring_method=scorer)
    best_dag = searcher.estimate()

    out = {
        'model': best_dag,
        'model_edges': best_dag.edges(),
    }

    if return_all_dags:
        # Walk the (score, dag) pairs in reverse of the order all_scores() gives.
        ranked = list(reversed(searcher.all_scores()))
        out['scores'] = [score for score, _ in ranked]
        out['dag'] = [dag for _, dag in ranked]

        plt.plot(out['scores'])
        plt.show()

    return out
def setUp(self):
    """Create the random and Titanic fixtures plus one estimator per setup."""
    # 5000 rows of random integers in [0, 5) for columns A and B.
    random_ab = np.random.randint(0, 5, size=(5000, 2))
    self.rand_data = pd.DataFrame(random_ab, columns=['A', 'B'])
    # C is an exact copy of B.
    self.rand_data['C'] = self.rand_data['B']

    self.est_rand = ExhaustiveSearch(self.rand_data)

    bdeu_score = BdeuScore(self.rand_data)
    self.est_rand_bdeu = ExhaustiveSearch(self.rand_data, scoring_method=bdeu_score)

    bic_score = BicScore(self.rand_data)
    self.est_rand_bic = ExhaustiveSearch(self.rand_data, scoring_method=bic_score)

    # link to dataset: "https://www.kaggle.com/c/titanic/download/train.csv"
    titanic_csv = 'pgmpy/tests/test_estimators/testdata/titanic_train.csv'
    self.titanic_data = pd.read_csv(titanic_csv)
    self.titanic_data2 = self.titanic_data[["Survived", "Sex", "Pclass"]]
    self.est_titanic = ExhaustiveSearch(self.titanic_data2)
def predict(self, dataset: DatasetInterface) -> List[Relation]:
    """Learn a graph from ``dataset`` and convert it into relations.

    The estimator is picked from ``self.algorithm``: ``ESTIMATOR_PC`` uses
    ``PC``, ``ESTIMATOR_MMHC`` uses ``ExhaustiveSearch``, and any other value
    falls back to ``HillClimbSearch``.  The learned graph and the data are
    handed to ``__build_relations``, whose result is returned.
    """
    data = dataset.get_data()

    if self.algorithm == self.ESTIMATOR_PC:
        graph = PC(data).estimate(show_progress=False)
    elif self.algorithm == self.ESTIMATOR_MMHC:
        # NOTE(review): the MMHC option builds an ExhaustiveSearch and passes
        # show_progress to the constructor rather than to estimate() --
        # confirm both are intended.
        graph = ExhaustiveSearch(data, show_progress=False).estimate()
    else:
        graph = HillClimbSearch(data).estimate(show_progress=False)

    return PgmpyScript.__build_relations(graph, data)
def scoreStructureLearn(data, search='HillClimbSearch', scoring_method='BicScore'):
    """Score-and-search based structure learning.

    Args:
        data: data passed to both the scoring method and the search.
        search: ``'HillClimbSearch'`` selects hill climbing; any other value
            selects ``ExhaustiveSearch``.
        scoring_method: one of ``'BicScore'``, ``'K2Score'`` or
            ``'BdeuScore'`` (the latter is built with
            ``equivalent_sample_size=5``).

    Returns:
        The model returned by the chosen search's ``estimate()``.

    Raises:
        ValueError: if ``scoring_method`` is not one of the supported names.
    """
    if scoring_method == 'BicScore':
        scorer = BicScore(data)
    elif scoring_method == 'K2Score':
        scorer = K2Score(data)
    elif scoring_method == 'BdeuScore':
        scorer = BdeuScore(data, equivalent_sample_size=5)
    else:
        # Previously an unknown name left the scorer unbound and surfaced
        # later as a confusing UnboundLocalError.
        raise ValueError(
            "Unknown scoring_method {!r}; expected 'BicScore', 'K2Score' "
            "or 'BdeuScore'".format(scoring_method)
        )

    if search == 'HillClimbSearch':
        es = HillClimbSearch(data, scoring_method=scorer)
    else:
        es = ExhaustiveSearch(data, scoring_method=scorer)

    return es.estimate()
def structure_prediction(samples, encoder, bayesmodel_true, method='hc', score_type='bic'):
    """Estimate a network structure from encoded samples.

    The samples are decoded with ``encoder.inverse_transform``, placed in a
    data frame whose positional columns are renamed to the nodes of
    ``bayesmodel_true`` (in the order ``nodes()`` yields them), and then fed
    to a structure search scored by the class selected through
    ``score_type`` (``'bic'``, ``'k2'`` or ``'bdeu'``).

    ``method='ex'`` runs ``ExhaustiveSearch``; every other value runs
    ``HillClimbSearch`` started from a copy of ``bayesmodel_true``.

    Returns the model produced by the search's ``estimate()``.
    """
    score_classes = {'bic': BicScore, 'k2': K2Score, 'bdeu': BDeuScore}
    Score = score_classes[score_type]

    decoded = encoder.inverse_transform(samples)
    # Map column position -> node name.
    column_names = dict(enumerate(list(bayesmodel_true.nodes())))
    frame = pd.DataFrame(decoded).rename(columns=column_names)

    if method == 'ex':
        searcher = ExhaustiveSearch(frame, scoring_method=Score(frame))
        return searcher.estimate()

    searcher = HillClimbSearch(frame, scoring_method=Score(frame))
    return searcher.estimate(
        start_dag=bayesmodel_true.copy(),
        show_progress=False,
    )
def setUp(self):
    """Prepare the shared test data and the estimators built on top of it."""
    # Random frame: columns A and B drawn from [0, 5); C mirrors B.
    values = np.random.randint(0, 5, size=(5000, 2))
    rand_data = pd.DataFrame(values, columns=['A', 'B'])
    self.rand_data = rand_data
    rand_data['C'] = rand_data['B']

    # One estimator with the default score, one with BDeu, one with BIC.
    self.est_rand = ExhaustiveSearch(rand_data)
    self.est_rand_bdeu = ExhaustiveSearch(
        rand_data,
        scoring_method=BdeuScore(rand_data),
    )
    self.est_rand_bic = ExhaustiveSearch(
        rand_data,
        scoring_method=BicScore(rand_data),
    )

    # link to dataset: "https://www.kaggle.com/c/titanic/download/train.csv"
    self.titanic_data = pd.read_csv(
        'pgmpy/tests/test_estimators/testdata/titanic_train.csv'
    )
    selected_columns = ["Survived", "Sex", "Pclass"]
    self.titanic_data2 = self.titanic_data[selected_columns]
    self.est_titanic = ExhaustiveSearch(self.titanic_data2)
# for this assignment.
# Columns of `data` that take part in the structure search.
raw_data2 = {
    column: data[column]
    for column in (
        'age',
        'avg_cs',
        'avg_mat',
        'delay',  # Don't comment out this one
        'gender',
    )
}
data2 = pd.DataFrame(data=raw_data2)

import time

t0 = time.time()
# Exhaustive search: score every DAG over the selected columns
# (comment these two lines out to skip it).
searcher = ExhaustiveSearch(data2, scoring_method=K2Score(data2))
search = searcher.all_scores()
print('time:', time.time() - t0)

# Uncomment for printout:
#for score, model in search:
#    print("{0} {1}".format(score, model.edges()))

separator()

# NOTE(review): this run searches over data2 but scores with K2Score(data),
# while the second run scores with K2Score(data2) -- confirm the difference
# is deliberate.
hcs = HillClimbSearch(data2, scoring_method=K2Score(data))
model = hcs.estimate()

hcs2 = HillClimbSearch(data2, scoring_method=K2Score(data2))
model2 = hcs2.estimate()
class TestBaseEstimator(unittest.TestCase):
    """Tests for ``ExhaustiveSearch`` on a random data set and on Titanic data."""

    def setUp(self):
        """Build the data frames and the estimators used by the tests."""
        self.rand_data = pd.DataFrame(
            np.random.randint(0, 5, size=(5000, 2)), columns=list('AB'))
        # C is an exact copy of B.
        self.rand_data['C'] = self.rand_data['B']
        self.est_rand = ExhaustiveSearch(self.rand_data)
        self.est_rand_bdeu = ExhaustiveSearch(
            self.rand_data, scoring_method=BdeuScore(self.rand_data))
        self.est_rand_bic = ExhaustiveSearch(
            self.rand_data, scoring_method=BicScore(self.rand_data))

        # link to dataset: "https://www.kaggle.com/c/titanic/download/train.csv"
        self.titanic_data = pd.read_csv(
            'pgmpy/tests/test_estimators/testdata/titanic_train.csv')
        self.titanic_data2 = self.titanic_data[["Survived", "Sex", "Pclass"]]
        self.est_titanic = ExhaustiveSearch(self.titanic_data2)

    def test_all_dags(self):
        """all_dags() yields 543 DAGs on four nodes and the 25 known DAGs on A, B, C."""
        self.assertEqual(
            len(list(self.est_rand.all_dags(['A', 'B', 'C', 'D']))), 543)
        # self.assertEqual(len(list(self.est_rand.all_dags(nodes=range(5)))), 29281)  # takes ~30s

        abc_dags = {
            tuple(sorted(dag.edges())) for dag in self.est_rand.all_dags()
        }
        abc_dags_ref = {
            (('A', 'B'), ('C', 'A'), ('C', 'B')),
            (('A', 'C'), ('B', 'C')),
            (('B', 'A'), ('B', 'C')),
            (('C', 'B'),),
            (('A', 'C'), ('B', 'A')),
            (('B', 'C'), ('C', 'A')),
            (('A', 'B'), ('B', 'C')),
            (('A', 'C'), ('B', 'A'), ('B', 'C')),
            (('A', 'B'),),
            (('A', 'B'), ('C', 'A')),
            (('B', 'A'), ('C', 'A'), ('C', 'B')),
            (('A', 'C'), ('C', 'B')),
            (('A', 'B'), ('A', 'C'), ('C', 'B')),
            (('B', 'A'), ('C', 'B')),
            (('A', 'B'), ('A', 'C')),
            (('C', 'A'), ('C', 'B')),
            (('A', 'B'), ('A', 'C'), ('B', 'C')),
            (('C', 'A'),),
            (('B', 'A'), ('B', 'C'), ('C', 'A')),
            (('B', 'A'),),
            (('A', 'B'), ('C', 'B')),
            (),
            (('B', 'A'), ('C', 'A')),
            (('A', 'C'),),
            (('B', 'C'),),
        }
        self.assertSetEqual(abc_dags, abc_dags_ref)

    def test_estimate_rand(self):
        """Each estimator should recover exactly one edge between B and C."""
        expected_edge_lists = [[('B', 'C')], [('C', 'B')]]

        est = self.est_rand.estimate()
        self.assertSetEqual(set(est.nodes()), set(['A', 'B', 'C']))
        # list(...) keeps the comparison meaningful when edges() returns a
        # view object instead of a plain list.
        self.assertIn(list(est.edges()), expected_edge_lists)

        # These two checks used to call self.est_rand again, so the BDeu and
        # BIC estimators built in setUp were never exercised.
        est_bdeu = self.est_rand_bdeu.estimate()
        self.assertIn(list(est_bdeu.edges()), expected_edge_lists)

        est_bic = self.est_rand_bic.estimate()
        self.assertIn(list(est_bic.edges()), expected_edge_lists)

    def test_estimate_titanic(self):
        """The best Titanic structure has the three expected edges."""
        e1 = self.est_titanic.estimate()
        self.assertSetEqual(
            set(e1.edges()),
            set([('Survived', 'Pclass'), ('Sex', 'Pclass'),
                 ('Sex', 'Survived')]))

    def test_all_scores(self):
        """all_scores() returns the reference DAGs, in order, with matching scores."""
        # Materialise once: the result is iterated twice below.
        scores = list(self.est_titanic.all_scores())
        scores_ref = [
            (-2072.9132364404695, []),
            (-2069.071694164769, [('Pclass', 'Sex')]),
            (-2069.0144197068785, [('Sex', 'Pclass')]),
            (-2025.869489762676, [('Survived', 'Pclass')]),
            (-2025.8559302273054, [('Pclass', 'Survived')]),
            (-2022.0279474869753, [('Pclass', 'Sex'), ('Survived', 'Pclass')]),
            (-2022.0143879516047, [('Pclass', 'Sex'), ('Pclass', 'Survived')]),
            (-2021.9571134937144, [('Pclass', 'Survived'), ('Sex', 'Pclass')]),
            (-2017.5258065853768, [('Sex', 'Pclass'), ('Survived', 'Pclass')]),
            (-1941.3075053892837, [('Survived', 'Sex')]),
            (-1941.2720031713893, [('Sex', 'Survived')]),
            (-1937.4304608956886, [('Pclass', 'Sex'), ('Sex', 'Survived')]),
            (-1937.4086886556927, [('Sex', 'Pclass'), ('Survived', 'Sex')]),
            (-1937.3731864377983, [('Sex', 'Pclass'), ('Sex', 'Survived')]),
            (-1934.1344850608882, [('Pclass', 'Sex'), ('Survived', 'Sex')]),
            (-1894.2637587114903, [('Survived', 'Pclass'), ('Survived', 'Sex')]),
            (-1894.2501991761198, [('Pclass', 'Survived'), ('Survived', 'Sex')]),
            (-1894.2282564935958, [('Sex', 'Survived'), ('Survived', 'Pclass')]),
            (-1891.0630673606006, [('Pclass', 'Survived'), ('Sex', 'Survived')]),
            (-1887.2215250849,
             [('Pclass', 'Sex'), ('Pclass', 'Survived'), ('Sex', 'Survived')]),
            (-1887.1642506270096,
             [('Pclass', 'Survived'), ('Sex', 'Pclass'), ('Sex', 'Survived')]),
            (-1887.0907383830947,
             [('Pclass', 'Sex'), ('Survived', 'Pclass'), ('Survived', 'Sex')]),
            (-1887.0771788477243,
             [('Pclass', 'Sex'), ('Pclass', 'Survived'), ('Survived', 'Sex')]),
            (-1885.9200755341915,
             [('Sex', 'Pclass'), ('Survived', 'Pclass'), ('Survived', 'Sex')]),
            (-1885.884573316297,
             [('Sex', 'Pclass'), ('Sex', 'Survived'), ('Survived', 'Pclass')]),
        ]

        self.assertEqual(
            [sorted(model.edges()) for score, model in scores],
            [edges for score, edges in scores_ref])

        # Use assertAlmostEqual pointwise to avoid rounding issues.  An
        # explicit loop is required: a bare map(...) is lazy on Python 3 and
        # would never run the assertions.
        for (score, _model), (ref_score, _edges) in zip(scores, scores_ref):
            self.assertAlmostEqual(score, ref_score)

    def tearDown(self):
        """Drop every fixture created in setUp."""
        del self.rand_data
        del self.est_rand
        del self.est_rand_bdeu
        del self.est_rand_bic
        del self.titanic_data
        del self.titanic_data2
        del self.est_titanic
class TestBaseEstimator(unittest.TestCase):
    """Tests for ``ExhaustiveSearch`` on a random data set and on Titanic data."""

    def setUp(self):
        """Build the data frames and the estimators used by the tests."""
        self.rand_data = pd.DataFrame(
            np.random.randint(0, 5, size=(5000, 2)), columns=list('AB'))
        # C is an exact copy of B.
        self.rand_data['C'] = self.rand_data['B']
        self.est_rand = ExhaustiveSearch(self.rand_data)
        self.est_rand_bdeu = ExhaustiveSearch(
            self.rand_data, scoring_method=BdeuScore(self.rand_data))
        self.est_rand_bic = ExhaustiveSearch(
            self.rand_data, scoring_method=BicScore(self.rand_data))

        # link to dataset: "https://www.kaggle.com/c/titanic/download/train.csv"
        self.titanic_data = pd.read_csv(
            'pgmpy/tests/test_estimators/testdata/titanic_train.csv')
        self.titanic_data2 = self.titanic_data[["Survived", "Sex", "Pclass"]]
        self.est_titanic = ExhaustiveSearch(self.titanic_data2)

    def test_all_dags(self):
        """all_dags() yields 543 DAGs on four nodes and the 25 known DAGs on A, B, C."""
        self.assertEqual(
            len(list(self.est_rand.all_dags(['A', 'B', 'C', 'D']))), 543)
        # self.assertEqual(len(list(self.est_rand.all_dags(nodes=range(5)))), 29281)  # takes ~30s

        abc_dags = {
            tuple(sorted(dag.edges())) for dag in self.est_rand.all_dags()
        }
        abc_dags_ref = {
            (('A', 'B'), ('C', 'A'), ('C', 'B')),
            (('A', 'C'), ('B', 'C')),
            (('B', 'A'), ('B', 'C')),
            (('C', 'B'),),
            (('A', 'C'), ('B', 'A')),
            (('B', 'C'), ('C', 'A')),
            (('A', 'B'), ('B', 'C')),
            (('A', 'C'), ('B', 'A'), ('B', 'C')),
            (('A', 'B'),),
            (('A', 'B'), ('C', 'A')),
            (('B', 'A'), ('C', 'A'), ('C', 'B')),
            (('A', 'C'), ('C', 'B')),
            (('A', 'B'), ('A', 'C'), ('C', 'B')),
            (('B', 'A'), ('C', 'B')),
            (('A', 'B'), ('A', 'C')),
            (('C', 'A'), ('C', 'B')),
            (('A', 'B'), ('A', 'C'), ('B', 'C')),
            (('C', 'A'),),
            (('B', 'A'), ('B', 'C'), ('C', 'A')),
            (('B', 'A'),),
            (('A', 'B'), ('C', 'B')),
            (),
            (('B', 'A'), ('C', 'A')),
            (('A', 'C'),),
            (('B', 'C'),),
        }
        self.assertSetEqual(abc_dags, abc_dags_ref)

    def test_estimate_rand(self):
        """Each estimator should recover exactly one edge between B and C."""
        expected_edge_lists = [[('B', 'C')], [('C', 'B')]]

        est = self.est_rand.estimate()
        self.assertSetEqual(set(est.nodes()), set(['A', 'B', 'C']))
        # list(...) keeps the comparison meaningful when edges() returns a
        # view object instead of a plain list.
        self.assertIn(list(est.edges()), expected_edge_lists)

        # These two checks used to call self.est_rand again, so the BDeu and
        # BIC estimators built in setUp were never exercised.
        est_bdeu = self.est_rand_bdeu.estimate()
        self.assertIn(list(est_bdeu.edges()), expected_edge_lists)

        est_bic = self.est_rand_bic.estimate()
        self.assertIn(list(est_bic.edges()), expected_edge_lists)

    def test_estimate_titanic(self):
        """The best Titanic structure has the three expected edges."""
        e1 = self.est_titanic.estimate()
        self.assertSetEqual(
            set(e1.edges()),
            set([('Survived', 'Pclass'), ('Sex', 'Pclass'),
                 ('Sex', 'Survived')]))

    def test_all_scores(self):
        """all_scores() returns the reference DAGs, in order, with matching scores."""
        # Materialise once: the result is iterated twice below.
        scores = list(self.est_titanic.all_scores())
        scores_ref = [
            (-2072.9132364404695, []),
            (-2069.071694164769, [('Pclass', 'Sex')]),
            (-2069.0144197068785, [('Sex', 'Pclass')]),
            (-2025.869489762676, [('Survived', 'Pclass')]),
            (-2025.8559302273054, [('Pclass', 'Survived')]),
            (-2022.0279474869753, [('Pclass', 'Sex'), ('Survived', 'Pclass')]),
            (-2022.0143879516047, [('Pclass', 'Sex'), ('Pclass', 'Survived')]),
            (-2021.9571134937144, [('Pclass', 'Survived'), ('Sex', 'Pclass')]),
            (-2017.5258065853768, [('Sex', 'Pclass'), ('Survived', 'Pclass')]),
            (-1941.3075053892837, [('Survived', 'Sex')]),
            (-1941.2720031713893, [('Sex', 'Survived')]),
            (-1937.4304608956886, [('Pclass', 'Sex'), ('Sex', 'Survived')]),
            (-1937.4086886556927, [('Sex', 'Pclass'), ('Survived', 'Sex')]),
            (-1937.3731864377983, [('Sex', 'Pclass'), ('Sex', 'Survived')]),
            (-1934.1344850608882, [('Pclass', 'Sex'), ('Survived', 'Sex')]),
            (-1894.2637587114903, [('Survived', 'Pclass'), ('Survived', 'Sex')]),
            (-1894.2501991761198, [('Pclass', 'Survived'), ('Survived', 'Sex')]),
            (-1894.2282564935958, [('Sex', 'Survived'), ('Survived', 'Pclass')]),
            (-1891.0630673606006, [('Pclass', 'Survived'), ('Sex', 'Survived')]),
            (-1887.2215250849,
             [('Pclass', 'Sex'), ('Pclass', 'Survived'), ('Sex', 'Survived')]),
            (-1887.1642506270096,
             [('Pclass', 'Survived'), ('Sex', 'Pclass'), ('Sex', 'Survived')]),
            (-1887.0907383830947,
             [('Pclass', 'Sex'), ('Survived', 'Pclass'), ('Survived', 'Sex')]),
            (-1887.0771788477243,
             [('Pclass', 'Sex'), ('Pclass', 'Survived'), ('Survived', 'Sex')]),
            (-1885.9200755341915,
             [('Sex', 'Pclass'), ('Survived', 'Pclass'), ('Survived', 'Sex')]),
            (-1885.884573316297,
             [('Sex', 'Pclass'), ('Sex', 'Survived'), ('Survived', 'Pclass')]),
        ]

        self.assertEqual(
            [sorted(model.edges()) for score, model in scores],
            [edges for score, edges in scores_ref])

        # Use assertAlmostEqual pointwise to avoid rounding issues.  An
        # explicit loop is required: a bare map(...) is lazy on Python 3 and
        # would never run the assertions.
        for (score, _model), (ref_score, _edges) in zip(scores, scores_ref):
            self.assertAlmostEqual(score, ref_score)

    def tearDown(self):
        """Drop every fixture created in setUp."""
        del self.rand_data
        del self.est_rand
        del self.est_rand_bdeu
        del self.est_rand_bic
        del self.titanic_data
        del self.titanic_data2
        del self.est_titanic
from pgmpy.estimators import ExhaustiveSearch, K2Score

if __name__ == '__main__':
    # Earlier experiments, kept for reference:
    # fp = os.path.join('data', 'MTurk_Harvey.csv')
    # df = pd.read_csv(fp)
    # data = np.genfromtxt(fp, delimiter=",", dtype=float, skip_header=1)
    # x = data[:,:-1]
    # y = data[:,-1]

    # data = pd.DataFrame(np.random.randint(0, 5, size=(2500, 3)), columns=list('XYZ'))
    # data['sum'] = data.sum(axis=1)
    # #print(data)
    # est = ConstraintBasedEstimator(data)
    # skel, sep_sets = est.estimate_skeleton()
    # print(skel.edges())

    # s = ExhaustiveSearch(pd.DataFrame(data={'Temperature': [23, 19],'Weather': ['sunny', 'cloudy'],'Humidity': [65, 75]}))
    # print(len(list(s.all_dags())))
    # for dag in s.all_dags():
    #     print(dag.edges())

    # Random columns A and B with values in [0, 5); C is an exact copy of B.
    data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 2)), columns=list('AB'))
    data['C'] = data['B']

    # Print the K2 score and the edge list of every DAG the search scores.
    searcher = ExhaustiveSearch(data, scoring_method=K2Score(data))
    for score, model in searcher.all_scores():
        # print() calls instead of Python 2 print statements, which are a
        # SyntaxError on Python 3.
        print(score)
        print(model.edges())
print(k2Fruit.local_score(variable='tasty', parents=['size', 'fruit']))
print(bicFruit.local_score(variable='tasty', parents=['size', 'fruit']))


# %% markdown [markdown]
# ### Search Strategies
# The search space of DAGs is super-exponential in the number of variables, and the above scoring functions allow for local maxima. The first property makes exhaustive search intractable for all but very small networks; the second prevents efficient local optimization algorithms from always finding the optimal structure. Thus, identifying the ideal structure is often not tractable. Despite this bad news, heuristic search strategies often yield good results.
#
# If only a few nodes are involved (read: fewer than 5), ExhaustiveSearch can be used to compute the score for every DAG and return the best-scoring one:
# #### Exhaustive Search
# **Example 1:** $Z + X + Y$
# %% codecell
from pgmpy.estimators import ExhaustiveSearch
from pgmpy.base.DAG import DAG

es: ExhaustiveSearch = ExhaustiveSearch(data=data, scoring_method=bic)
bestModel: DAG = es.estimate()
bestModel.edges()
# %% codecell
# The best model (structurally estimated):
drawGraph(bestModel, nodeColor=LIGHT_GREEN)
# %% codecell
# Compute the scores of all structurally analyzed DAGs:
print("All DAGs sorted by score:\n")
for score, dag in reversed(es.all_scores()):
    print(f"Score = {score}, Edges: {dag.edges()}")
# %% markdown [markdown]