def train_joke_type_selection():
    """Estimate the best-scoring structure over the stored joke preferences.

    Every row returned by ``Jokes.query.all()`` carries one numeric field per
    joke category.  For each category the field is summed over all rows and
    the category label is repeated that many times, giving a single-column
    data set (``joke_preference``) that is handed to an exhaustive structure
    search scored with ``BicScore``.

    Returns:
        Whatever ``ExhaustiveSearch.estimate()`` returns for that data set.
    """
    # All preferences live in one table.
    jokes = Jokes.query.all()

    # (label written to the data set, attribute holding its count on a row)
    categories = (
        ("nerd joke", "nerd_joke"),
        ("weird joke", "weird_joke"),
        ("cat meme", "cat_meme"),
        ("dog meme", "dog_meme"),
        ("dad joke", "dad_joke"),
    )

    joke_preferences = []
    for label, attribute in categories:
        total = sum(getattr(joke, attribute) for joke in jokes)
        joke_preferences.extend([label] * total)

    # Build the frame in one call: appending row by row is quadratic and
    # DataFrame.append no longer exists in pandas 2.x.
    data = pd.DataFrame({"joke_preference": joke_preferences})

    bic = BicScore(data)
    es = ExhaustiveSearch(data, scoring_method=bic)
    return es.estimate()
Example #2
0
def _exhaustivesearch(df, scoretype='bic', return_all_dags=False, verbose=3):
    """Run an exhaustive structure search on ``df`` and collect the results.

    Parameters
    ----------
    df : data passed to ``_SetScoringType`` and ``ExhaustiveSearch``.
    scoretype : scoring identifier forwarded to ``_SetScoringType``.
    return_all_dags : when true, also collect the score and DAG of every
        candidate (iterated in reverse of ``all_scores()`` order) and plot
        the scores.
    verbose : accepted for interface compatibility; not used here.

    Returns
    -------
    dict with keys ``'model'`` and ``'model_edges'`` and, when
    ``return_all_dags`` is set, ``'scores'`` and ``'dag'``.
    """
    scorer = _SetScoringType(df, scoretype)
    searcher = ExhaustiveSearch(df, scoring_method=scorer)
    top_dag = searcher.estimate()

    out = {'model': top_dag, 'model_edges': top_dag.edges()}
    if not return_all_dags:
        return out

    # Walk all_scores() back to front and split the pairs into two lists.
    ranked = list(reversed(searcher.all_scores()))
    out['scores'] = [score for score, _ in ranked]
    out['dag'] = [dag for _, dag in ranked]

    plt.plot(out['scores'])
    plt.show()
    return out
    def setUp(self):
        """Create the random and Titanic fixtures used by the tests."""
        # Columns 'A' and 'B' are random integers in [0, 5); 'C' copies 'B'.
        draws = np.random.randint(0, 5, size=(5000, 2))
        frame = pd.DataFrame(draws, columns=list('AB'))
        frame['C'] = frame['B']
        self.rand_data = frame

        # One estimator with the default scoring and one per explicit score.
        self.est_rand = ExhaustiveSearch(frame)
        bdeu = BdeuScore(frame)
        self.est_rand_bdeu = ExhaustiveSearch(frame, scoring_method=bdeu)
        bic = BicScore(frame)
        self.est_rand_bic = ExhaustiveSearch(frame, scoring_method=bic)

        # link to dataset: "https://www.kaggle.com/c/titanic/download/train.csv"
        passengers = pd.read_csv('pgmpy/tests/test_estimators/testdata/titanic_train.csv')
        self.titanic_data = passengers
        # Only three columns are kept for the structure search.
        self.titanic_data2 = passengers[["Survived", "Sex", "Pclass"]]
        self.est_titanic = ExhaustiveSearch(self.titanic_data2)
    def predict(self, dataset: DatasetInterface) -> List[Relation]:
        """Estimate a graph from ``dataset`` and convert it to relations.

        The estimator is chosen from ``self.algorithm``: ``PC`` for
        ``ESTIMATOR_PC``, ``ExhaustiveSearch`` for ``ESTIMATOR_MMHC`` and
        ``HillClimbSearch`` for anything else.  The estimated graph and the
        data are then passed to ``PgmpyScript.__build_relations``.
        """
        samples = dataset.get_data()

        # NOTE(review): the MMHC constant selects ExhaustiveSearch, and
        # show_progress goes to its constructor instead of estimate() --
        # confirm both are intended.
        if self.algorithm == self.ESTIMATOR_PC:
            graph = PC(samples).estimate(show_progress=False)
        elif self.algorithm == self.ESTIMATOR_MMHC:
            graph = ExhaustiveSearch(samples, show_progress=False).estimate()
        else:
            graph = HillClimbSearch(samples).estimate(show_progress=False)

        return PgmpyScript.__build_relations(graph, samples)
Example #5
0
def scoreStructureLearn(data,
                        search='HillClimbSearch',
                        scoring_method='BicScore'):
    """Learn a structure from ``data`` with a score-based search.

    Args:
        data: data set handed to the scoring method and the search class.
        search: ``'HillClimbSearch'`` selects hill climbing; any other value
            selects ``ExhaustiveSearch`` (kept for backward compatibility).
        scoring_method: one of ``'BicScore'``, ``'K2Score'`` or
            ``'BdeuScore'`` (the latter uses ``equivalent_sample_size=5``).

    Returns:
        The result of the chosen search's ``estimate()``.

    Raises:
        ValueError: if ``scoring_method`` is not one of the supported names.
    """
    if scoring_method == 'BicScore':
        scorer = BicScore(data)
    elif scoring_method == 'K2Score':
        scorer = K2Score(data)
    elif scoring_method == 'BdeuScore':
        scorer = BdeuScore(data, equivalent_sample_size=5)
    else:
        # Without this branch the scorer would simply be unbound and the
        # failure would surface later as a confusing UnboundLocalError.
        raise ValueError(
            "Unknown scoring_method {!r}; expected 'BicScore', 'K2Score' "
            "or 'BdeuScore'".format(scoring_method))

    if search == 'HillClimbSearch':
        es = HillClimbSearch(data, scoring_method=scorer)
    else:
        es = ExhaustiveSearch(data, scoring_method=scorer)
    return es.estimate()
def structure_prediction(samples,
                         encoder,
                         bayesmodel_true,
                         method='hc',
                         score_type='bic'):
    """Estimate a structure from encoded ``samples``.

    The samples are decoded with ``encoder.inverse_transform`` and their
    positional columns are renamed to the node names of ``bayesmodel_true``
    (in the order ``bayesmodel_true.nodes()`` yields them).

    ``method == 'ex'`` runs ``ExhaustiveSearch``; any other value runs
    ``HillClimbSearch`` starting from a copy of ``bayesmodel_true``.
    ``score_type`` must be ``'bic'``, ``'k2'`` or ``'bdeu'``; another value
    raises ``KeyError``.
    """
    score_classes = {'bic': BicScore, 'k2': K2Score, 'bdeu': BDeuScore}
    Score = score_classes[score_type]

    decoded = encoder.inverse_transform(samples)
    node_names = list(bayesmodel_true.nodes())
    # Map column position -> node name.
    position_to_name = dict(enumerate(node_names))
    frame = pd.DataFrame(decoded).rename(columns=position_to_name)

    scorer = Score(frame)
    if method == 'ex':
        return ExhaustiveSearch(frame, scoring_method=scorer).estimate()

    climber = HillClimbSearch(frame, scoring_method=scorer)
    return climber.estimate(start_dag=bayesmodel_true.copy(),
                            show_progress=False)
    def setUp(self):
        """Prepare the data frames and estimators shared by the tests."""
        # 5000 rows of random integers in [0, 5) for 'A' and 'B';
        # 'C' is a copy of 'B'.
        random_frame = pd.DataFrame(
            np.random.randint(0, 5, size=(5000, 2)), columns=list('AB'))
        random_frame['C'] = random_frame['B']
        self.rand_data = random_frame

        self.est_rand = ExhaustiveSearch(random_frame)
        bdeu_score = BdeuScore(random_frame)
        self.est_rand_bdeu = ExhaustiveSearch(random_frame,
                                              scoring_method=bdeu_score)
        bic_score = BicScore(random_frame)
        self.est_rand_bic = ExhaustiveSearch(random_frame,
                                             scoring_method=bic_score)

        # link to dataset: "https://www.kaggle.com/c/titanic/download/train.csv"
        csv_path = 'pgmpy/tests/test_estimators/testdata/titanic_train.csv'
        titanic = pd.read_csv(csv_path)
        self.titanic_data = titanic
        # The search only uses these three columns.
        self.titanic_data2 = titanic[["Survived", "Sex", "Pclass"]]
        self.est_titanic = ExhaustiveSearch(self.titanic_data2)
# for this assignment.
# Restrict the analysis to five columns of the existing `data` frame.
raw_data2 = {
    'age': data['age'],
    'avg_cs': data['avg_cs'],
    'avg_mat': data['avg_mat'],
    'delay': data['delay'],  # Don't comment out this one
    'gender': data['gender'],
}

data2 = pd.DataFrame(data=raw_data2)

import time

# Time how long scoring every candidate structure takes.
t0 = time.time()
# Exhaustive search over data2, scored with K2 (comment out to skip it)
searcher = ExhaustiveSearch(data2, scoring_method=K2Score(data2))
search = searcher.all_scores()
print('time:', time.time() - t0)

# Uncomment for printout:
#for score, model in search:
#    print("{0}        {1}".format(score, model.edges()))

separator()

# Hill climbing over data2, scored with K2 built from the full `data` frame.
# NOTE(review): the scorer uses `data` while the search uses `data2` --
# confirm this mismatch is deliberate and not a typo for K2Score(data2).
hcs = HillClimbSearch(data2, scoring_method=K2Score(data))
model = hcs.estimate()

# Same search with the scorer built from data2 itself.
hcs2 = HillClimbSearch(data2, scoring_method=K2Score(data2))
model2 = hcs2.estimate()
class TestBaseEstimator(unittest.TestCase):
    """Tests for ``ExhaustiveSearch`` on random data and a Titanic subset."""

    def setUp(self):
        """Build the random and Titanic data sets and their estimators."""
        # 'A' and 'B' are random integers in [0, 5); 'C' is a copy of 'B'.
        self.rand_data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 2)),
                                      columns=list('AB'))
        self.rand_data['C'] = self.rand_data['B']
        self.est_rand = ExhaustiveSearch(self.rand_data)
        self.est_rand_bdeu = ExhaustiveSearch(self.rand_data,
                                              scoring_method=BdeuScore(
                                                  self.rand_data))
        self.est_rand_bic = ExhaustiveSearch(self.rand_data,
                                             scoring_method=BicScore(
                                                 self.rand_data))

        # link to dataset: "https://www.kaggle.com/c/titanic/download/train.csv"
        self.titanic_data = pd.read_csv(
            'pgmpy/tests/test_estimators/testdata/titanic_train.csv')
        self.titanic_data2 = self.titanic_data[["Survived", "Sex", "Pclass"]]
        self.est_titanic = ExhaustiveSearch(self.titanic_data2)

    def test_all_dags(self):
        """all_dags() yields 543 DAGs on 4 nodes and the 25 known ones on 3."""
        self.assertEqual(
            len(list(self.est_rand.all_dags(['A', 'B', 'C', 'D']))), 543)
        # self.assertEqual(len(list(self.est_rand.all_dags(nodes=range(5)))), 29281)  # takes ~30s

        # Represent each DAG by its sorted edge tuple so DAGs can be compared
        # as a set.
        abc_dags = set(
            tuple(sorted(dag.edges())) for dag in self.est_rand.all_dags())
        abc_dags_ref = set([
            (('A', 'B'), ('C', 'A'), ('C', 'B')), (('A', 'C'), ('B', 'C')),
            (('B', 'A'), ('B', 'C')), (('C', 'B'), ), (('A', 'C'), ('B', 'A')),
            (('B', 'C'), ('C', 'A')), (('A', 'B'), ('B', 'C')),
            (('A', 'C'), ('B', 'A'), ('B', 'C')), (('A', 'B'), ),
            (('A', 'B'), ('C', 'A')), (('B', 'A'), ('C', 'A'), ('C', 'B')),
            (('A', 'C'), ('C', 'B')), (('A', 'B'), ('A', 'C'), ('C', 'B')),
            (('B', 'A'), ('C', 'B')), (('A', 'B'), ('A', 'C')),
            (('C', 'A'), ('C', 'B')), (('A', 'B'), ('A', 'C'), ('B', 'C')),
            (('C', 'A'), ), (('B', 'A'), ('B', 'C'), ('C', 'A')),
            (('B', 'A'), ), (('A', 'B'), ('C', 'B')), (),
            (('B', 'A'), ('C', 'A')), (('A', 'C'), ), (('B', 'C'), )
        ])
        self.assertSetEqual(abc_dags, abc_dags_ref)

    def test_estimate_rand(self):
        """Each scoring method should recover only the B--C dependency."""
        # 'C' copies 'B', so either orientation of that single edge is fine.
        accepted = [[('B', 'C')], [('C', 'B')]]

        est = self.est_rand.estimate()
        self.assertSetEqual(set(est.nodes()), set(['A', 'B', 'C']))
        # list(...) keeps the comparison valid when edges() returns a view
        # object instead of a plain list.
        self.assertIn(list(est.edges()), accepted)

        # Use the dedicated estimators; previously est_rand was re-run here,
        # so the BDeu and BIC variants were never exercised.
        est_bdeu = self.est_rand_bdeu.estimate()
        self.assertIn(list(est_bdeu.edges()), accepted)

        est_bic = self.est_rand_bic.estimate()
        self.assertIn(list(est_bic.edges()), accepted)

    def test_estimate_titanic(self):
        """The best Titanic structure has the three expected edges."""
        e1 = self.est_titanic.estimate()
        self.assertSetEqual(
            set(e1.edges()),
            set([('Survived', 'Pclass'), ('Sex', 'Pclass'),
                 ('Sex', 'Survived')]))

    def test_all_scores(self):
        """all_scores() matches the reference ranking of Titanic DAGs."""
        scores = self.est_titanic.all_scores()
        scores_ref = [(-2072.9132364404695, []),
                      (-2069.071694164769, [('Pclass', 'Sex')]),
                      (-2069.0144197068785, [('Sex', 'Pclass')]),
                      (-2025.869489762676, [('Survived', 'Pclass')]),
                      (-2025.8559302273054, [('Pclass', 'Survived')]),
                      (-2022.0279474869753, [('Pclass', 'Sex'),
                                             ('Survived', 'Pclass')]),
                      (-2022.0143879516047, [('Pclass', 'Sex'),
                                             ('Pclass', 'Survived')]),
                      (-2021.9571134937144, [('Pclass', 'Survived'),
                                             ('Sex', 'Pclass')]),
                      (-2017.5258065853768, [('Sex', 'Pclass'),
                                             ('Survived', 'Pclass')]),
                      (-1941.3075053892837, [('Survived', 'Sex')]),
                      (-1941.2720031713893, [('Sex', 'Survived')]),
                      (-1937.4304608956886, [('Pclass', 'Sex'),
                                             ('Sex', 'Survived')]),
                      (-1937.4086886556927, [('Sex', 'Pclass'),
                                             ('Survived', 'Sex')]),
                      (-1937.3731864377983, [('Sex', 'Pclass'),
                                             ('Sex', 'Survived')]),
                      (-1934.1344850608882, [('Pclass', 'Sex'),
                                             ('Survived', 'Sex')]),
                      (-1894.2637587114903, [('Survived', 'Pclass'),
                                             ('Survived', 'Sex')]),
                      (-1894.2501991761198, [('Pclass', 'Survived'),
                                             ('Survived', 'Sex')]),
                      (-1894.2282564935958, [('Sex', 'Survived'),
                                             ('Survived', 'Pclass')]),
                      (-1891.0630673606006, [('Pclass', 'Survived'),
                                             ('Sex', 'Survived')]),
                      (-1887.2215250849, [('Pclass', 'Sex'),
                                          ('Pclass', 'Survived'),
                                          ('Sex', 'Survived')]),
                      (-1887.1642506270096, [('Pclass', 'Survived'),
                                             ('Sex', 'Pclass'),
                                             ('Sex', 'Survived')]),
                      (-1887.0907383830947, [('Pclass', 'Sex'),
                                             ('Survived', 'Pclass'),
                                             ('Survived', 'Sex')]),
                      (-1887.0771788477243, [('Pclass', 'Sex'),
                                             ('Pclass', 'Survived'),
                                             ('Survived', 'Sex')]),
                      (-1885.9200755341915, [('Sex', 'Pclass'),
                                             ('Survived', 'Pclass'),
                                             ('Survived', 'Sex')]),
                      (-1885.884573316297, [('Sex', 'Pclass'),
                                            ('Sex', 'Survived'),
                                            ('Survived', 'Pclass')])]

        self.assertEqual([sorted(model.edges()) for score, model in scores],
                         [edges for score, edges in scores_ref])
        # Compare the scores pointwise with assertAlmostEqual to avoid
        # rounding issues.  An explicit loop is required: a lazy map() is
        # never consumed on Python 3, so its assertions would not run.
        for (score, _model), (score_ref, _edges) in zip(scores, scores_ref):
            self.assertAlmostEqual(score, score_ref)

    def tearDown(self):
        """Drop every fixture created in setUp."""
        del self.rand_data
        del self.est_rand
        del self.est_rand_bdeu
        del self.est_rand_bic
        del self.titanic_data
        del self.titanic_data2
        del self.est_titanic
class TestBaseEstimator(unittest.TestCase):
    """Checks ``ExhaustiveSearch`` against random data and a Titanic subset."""

    def setUp(self):
        """Create the data frames and estimators used by every test."""
        # Random integers in [0, 5) for 'A' and 'B'; 'C' duplicates 'B'.
        self.rand_data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 2)), columns=list('AB'))
        self.rand_data['C'] = self.rand_data['B']
        self.est_rand = ExhaustiveSearch(self.rand_data)
        self.est_rand_bdeu = ExhaustiveSearch(self.rand_data, scoring_method=BdeuScore(self.rand_data))
        self.est_rand_bic = ExhaustiveSearch(self.rand_data, scoring_method=BicScore(self.rand_data))

        # link to dataset: "https://www.kaggle.com/c/titanic/download/train.csv"
        self.titanic_data = pd.read_csv('pgmpy/tests/test_estimators/testdata/titanic_train.csv')
        self.titanic_data2 = self.titanic_data[["Survived", "Sex", "Pclass"]]
        self.est_titanic = ExhaustiveSearch(self.titanic_data2)

    def test_all_dags(self):
        """all_dags() enumerates 543 DAGs on 4 nodes and the 25 known ones on 3."""
        self.assertEqual(len(list(self.est_rand.all_dags(['A', 'B', 'C', 'D']))), 543)
        # self.assertEqual(len(list(self.est_rand.all_dags(nodes=range(5)))), 29281)  # takes ~30s

        # A DAG is identified by its sorted tuple of edges.
        abc_dags = set(tuple(sorted(dag.edges())) for dag in self.est_rand.all_dags())
        abc_dags_ref = set([
            (('A', 'B'), ('C', 'A'), ('C', 'B')), (('A', 'C'), ('B', 'C')),
            (('B', 'A'), ('B', 'C')), (('C', 'B'),), (('A', 'C'), ('B', 'A')),
            (('B', 'C'), ('C', 'A')), (('A', 'B'), ('B', 'C')),
            (('A', 'C'), ('B', 'A'), ('B', 'C')), (('A', 'B'),), (('A', 'B'), ('C', 'A')),
            (('B', 'A'), ('C', 'A'), ('C', 'B')), (('A', 'C'), ('C', 'B')),
            (('A', 'B'), ('A', 'C'), ('C', 'B')), (('B', 'A'), ('C', 'B')),
            (('A', 'B'), ('A', 'C')), (('C', 'A'), ('C', 'B')),
            (('A', 'B'), ('A', 'C'), ('B', 'C')), (('C', 'A'),), (('B', 'A'), ('B', 'C'), ('C', 'A')),
            (('B', 'A'),), (('A', 'B'), ('C', 'B')), (), (('B', 'A'), ('C', 'A')),
            (('A', 'C'),), (('B', 'C'),)])
        self.assertSetEqual(abc_dags, abc_dags_ref)

    def test_estimate_rand(self):
        """Default, BDeu and BIC scoring should each find only the B--C edge."""
        # 'C' is a copy of 'B', so the single edge may point either way.
        accepted = [[('B', 'C')], [('C', 'B')]]

        est = self.est_rand.estimate()
        self.assertSetEqual(set(est.nodes()), set(['A', 'B', 'C']))
        # Materialise edges() so the comparison also works when it returns a
        # view object instead of a list.
        self.assertIn(list(est.edges()), accepted)

        # These two previously re-ran est_rand, leaving the BDeu and BIC
        # estimators untested.
        est_bdeu = self.est_rand_bdeu.estimate()
        self.assertIn(list(est_bdeu.edges()), accepted)

        est_bic = self.est_rand_bic.estimate()
        self.assertIn(list(est_bic.edges()), accepted)

    def test_estimate_titanic(self):
        """The best-scoring Titanic structure has the three expected edges."""
        e1 = self.est_titanic.estimate()
        self.assertSetEqual(set(e1.edges()), set([('Survived', 'Pclass'), ('Sex', 'Pclass'), ('Sex', 'Survived')]))

    def test_all_scores(self):
        """all_scores() reproduces the reference ranking of Titanic DAGs."""
        scores = self.est_titanic.all_scores()
        scores_ref = [(-2072.9132364404695, []),
                      (-2069.071694164769, [('Pclass', 'Sex')]),
                      (-2069.0144197068785, [('Sex', 'Pclass')]),
                      (-2025.869489762676, [('Survived', 'Pclass')]),
                      (-2025.8559302273054, [('Pclass', 'Survived')]),
                      (-2022.0279474869753, [('Pclass', 'Sex'), ('Survived', 'Pclass')]),
                      (-2022.0143879516047, [('Pclass', 'Sex'), ('Pclass', 'Survived')]),
                      (-2021.9571134937144, [('Pclass', 'Survived'), ('Sex', 'Pclass')]),
                      (-2017.5258065853768, [('Sex', 'Pclass'), ('Survived', 'Pclass')]),
                      (-1941.3075053892837, [('Survived', 'Sex')]),
                      (-1941.2720031713893, [('Sex', 'Survived')]),
                      (-1937.4304608956886, [('Pclass', 'Sex'), ('Sex', 'Survived')]),
                      (-1937.4086886556927, [('Sex', 'Pclass'), ('Survived', 'Sex')]),
                      (-1937.3731864377983, [('Sex', 'Pclass'), ('Sex', 'Survived')]),
                      (-1934.1344850608882, [('Pclass', 'Sex'), ('Survived', 'Sex')]),
                      (-1894.2637587114903, [('Survived', 'Pclass'), ('Survived', 'Sex')]),
                      (-1894.2501991761198, [('Pclass', 'Survived'), ('Survived', 'Sex')]),
                      (-1894.2282564935958, [('Sex', 'Survived'), ('Survived', 'Pclass')]),
                      (-1891.0630673606006, [('Pclass', 'Survived'), ('Sex', 'Survived')]),
                      (-1887.2215250849, [('Pclass', 'Sex'), ('Pclass', 'Survived'), ('Sex', 'Survived')]),
                      (-1887.1642506270096, [('Pclass', 'Survived'), ('Sex', 'Pclass'), ('Sex', 'Survived')]),
                      (-1887.0907383830947, [('Pclass', 'Sex'), ('Survived', 'Pclass'), ('Survived', 'Sex')]),
                      (-1887.0771788477243, [('Pclass', 'Sex'), ('Pclass', 'Survived'), ('Survived', 'Sex')]),
                      (-1885.9200755341915, [('Sex', 'Pclass'), ('Survived', 'Pclass'), ('Survived', 'Sex')]),
                      (-1885.884573316297, [('Sex', 'Pclass'), ('Sex', 'Survived'), ('Survived', 'Pclass')])]

        self.assertEqual([sorted(model.edges()) for score, model in scores],
                         [edges for score, edges in scores_ref])
        # Compare scores pointwise with assertAlmostEqual to avoid rounding
        # issues.  A lazy map() is never consumed on Python 3, so the
        # assertions must run in an explicit loop.
        for (score, _model), (score_ref, _edges) in zip(scores, scores_ref):
            self.assertAlmostEqual(score, score_ref)

    def tearDown(self):
        """Remove all fixtures created in setUp."""
        del self.rand_data
        del self.est_rand
        del self.est_rand_bdeu
        del self.est_rand_bic
        del self.titanic_data
        del self.titanic_data2
        del self.est_titanic
Example #11
0
from pgmpy.estimators import ExhaustiveSearch, K2Score

if __name__ == '__main__':
    # Earlier experiments, kept for reference:
    #
    #     fp = os.path.join('data', 'MTurk_Harvey.csv')
    #     df = pd.read_csv(fp)
    #     data = np.genfromtxt(fp, delimiter=",", dtype=float, skip_header=1)
    #     x = data[:,:-1]
    #     y = data[:,-1]
    #
    #     data = pd.DataFrame(np.random.randint(0, 5, size=(2500, 3)), columns=list('XYZ'))
    #     data['sum'] = data.sum(axis=1)
    #     #print(data)
    #
    #     est = ConstraintBasedEstimator(data)
    #     skel, sep_sets = est.estimate_skeleton()
    #     print(skel.edges())
    #
    #     s = ExhaustiveSearch(pd.DataFrame(data={'Temperature': [23, 19],'Weather': ['sunny', 'cloudy'],'Humidity': [65, 75]}))
    #     print(len(list(s.all_dags())))
    #     for dag in s.all_dags():
    #         print(dag.edges())

    # Random integers in [0, 5) for 'A' and 'B'; 'C' is a copy of 'B'.
    data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 2)), columns=list('AB'))
    data['C'] = data['B']

    # Score every candidate DAG with K2 and print each score with its edges.
    searcher = ExhaustiveSearch(data, scoring_method=K2Score(data))
    for score, model in searcher.all_scores():
        # print() with one argument behaves the same on Python 2 and 3;
        # the former print statements were a SyntaxError on Python 3.
        print(score)
        print(model.edges())
        

# Print the local score of 'tasty' given the parents 'size' and 'fruit' for
# both scorers (presumably K2 and BIC, going by the variable names).
print(k2Fruit.local_score(variable='tasty', parents=['size', 'fruit']))
print(bicFruit.local_score(variable='tasty', parents=['size', 'fruit']))

# %% markdown [markdown]
# ### Search Strategies
# The search space of DAGs is super-exponential in the number of variables, and the above scoring functions allow for local maxima. The first property makes exhaustive search intractable for all but very small networks; the second prevents efficient local optimization algorithms from always finding the optimal structure. Thus, identifying the ideal structure is often not tractable. Despite this bad news, heuristic search strategies often yield good results.
#
# If only a few nodes are involved (read: fewer than 5), ExhaustiveSearch can be used to compute the score for every DAG and return the best-scoring one:

# #### Exhaustive Search
# **Example 1:** $Z + X + Y$
# %% codecell
from pgmpy.estimators import ExhaustiveSearch
from pgmpy.base.DAG import DAG

# Search over `data` using the `bic` scorer created earlier and keep the
# best-scoring DAG.
es: ExhaustiveSearch = ExhaustiveSearch(data=data, scoring_method=bic)
bestModel: DAG = es.estimate()

# Bare expression: shown as the cell output when run as a notebook cell.
bestModel.edges()
# %% codecell
# The best model (structurally estimated):
drawGraph(bestModel, nodeColor=LIGHT_GREEN)

# %% codecell
# Computing scores for all structurally analyzed DAGs:

print("All DAGs sorted by score:\n")

# all_scores() is iterated back to front; presumably it is sorted ascending,
# so the best-scoring DAG is printed first -- TODO confirm.
for score, dag in reversed(es.all_scores()):
    print(f"Score = {score},   Edges: {dag.edges()}")
# %% markdown [markdown]