def main():
    """Train a latent-cluster model on the iris data set and plot the
    conditional distribution (with variance) of each continuous measurement
    given the iris class.
    """
    logger = logging.getLogger()
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)

    # Attach the Bayes Server JVM bridge to this thread before any model work.
    bayesianpy.jni.attach(logger)

    db_folder = bayesianpy.utils.get_path_to_parent_dir(__file__)
    iris = pd.read_csv(os.path.join(db_folder, "data/iris.csv"),
                       index_col=False)

    # Latent cluster variable (4 states) linked to one joint multivariate
    # Gaussian node over the four continuous columns.
    network = bayesianpy.network.create_network()
    cluster = builder.create_cluster_variable(network, 4)
    node = builder.create_multivariate_continuous_node(
        network,
        iris.drop('iris_class', axis=1).columns.tolist(), "joint")
    builder.create_link(network, cluster, node)

    class_variable = builder.create_discrete_variable(
        network, iris, 'iris_class', iris['iris_class'].unique())
    builder.create_link(network, cluster, class_variable)

    head_variables = [
        'sepal_length', 'sepal_width', 'petal_length', 'petal_width'
    ]

    with bayesianpy.data.DataSet(iris, db_folder, logger) as dataset:
        model = bayesianpy.model.NetworkModel(network, logger)
        model.train(dataset)

        # One conditional query P(variable | iris_class) per head variable.
        queries = [
            bayesianpy.model.QueryConditionalJointProbability(
                head_variables=[v], tail_variables=['iris_class'])
            for v in head_variables
        ]

        (engine, _, _) = bayesianpy.model.InferenceEngine(network).create()
        query = bayesianpy.model.SingleQuery(network, engine, logger)
        results = query.query(queries, aslist=True)
        jd = bayesianpy.visual.JointDistribution()
        fig = plt.figure(figsize=(10, 10))

        # aslist=True already yields a list, so no extra list() wrapper is
        # needed. Size the grid from the query count instead of hard-coding
        # 2x2, so adding a head variable still gets a subplot slot.
        cols = 2
        rows = (len(queries) + cols - 1) // cols
        for i, r in enumerate(results):
            ax = fig.add_subplot(rows, cols, i + 1)
            jd.plot_distribution_with_variance(ax, iris,
                                               queries[i].get_head_variables(),
                                               r)

        plt.show()
def main():
    """Train a latent-cluster model on the iris data set and plot pairwise
    covariance plots of the continuous variables, conditioned first on
    (iris_class, Cluster) and then on Cluster alone.
    """
    logger = logging.getLogger()
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)

    # Attach the Bayes Server JVM bridge to this thread before any model work.
    bayesianpy.jni.attach(logger)

    db_folder = bayesianpy.utils.get_path_to_parent_dir(__file__)
    iris = pd.read_csv(os.path.join(db_folder, "data/iris.csv"),
                       index_col=False)

    network = bayesianpy.network.create_network()
    cluster = builder.create_cluster_variable(network, 4)
    node = builder.create_multivariate_continuous_node(
        network,
        iris.drop('iris_class', axis=1).columns.tolist(), "joint")
    builder.create_link(network, cluster, node)

    class_variable = builder.create_discrete_variable(
        network, iris, 'iris_class', iris['iris_class'].unique())
    builder.create_link(network, cluster, class_variable)

    jd = bayesianpy.visual.JointDistribution()

    def plot(head_variables, results):
        """Draw one covariance subplot per unordered pair of head variables."""
        fig = plt.figure(figsize=(10, 10))
        n = len(head_variables) - 1
        # Number of unordered pairs: n + (n-1) + ... + 1 == n*(n+1)/2.
        # Integer division: the original used '/', producing a float that was
        # then passed to add_subplot as a row count, which matplotlib rejects.
        total = n * (n + 1) // 2
        # Two columns; round the row count up so an odd pair count still fits.
        rows = (total + 1) // 2

        k = 1
        for i, hv in enumerate(head_variables):
            for j in range(i + 1, len(head_variables)):
                ax = fig.add_subplot(rows, 2, k)
                jd.plot_distribution_with_covariance(
                    ax, iris, (head_variables[i], head_variables[j]), results)

                k += 1
        plt.show()

    with bayesianpy.data.DataSet(iris, db_folder, logger) as dataset:
        model = bayesianpy.model.NetworkModel(network, logger)
        model.train(dataset)

        head_variables = [
            'sepal_length', 'sepal_width', 'petal_length', 'petal_width'
        ]

        # Joint over all four measurements conditioned on class and cluster.
        query_type_class = bayesianpy.model.QueryConditionalJointProbability(
            head_variables=head_variables,
            tail_variables=['iris_class', 'Cluster'])

        (engine, _, _) = bayesianpy.model.InferenceEngine(network).create()
        query = bayesianpy.model.Query(network, engine, logger)
        results_class = query.execute([query_type_class])

        plot(head_variables, results_class)

        # Same joint, conditioned on the latent cluster only.
        query_type_cluster = bayesianpy.model.QueryConditionalJointProbability(
            head_variables=head_variables, tail_variables=['Cluster'])

        results_cluster = query.execute([query_type_cluster])

        plot(head_variables, results_cluster)
def main():
    """Anomaly-detection example: train a cluster model on only two iris
    classes, then score every sample's log-likelihood under that model and
    visualise it — Iris-setosa points should score low (anomalous).
    """
    logger = logging.getLogger()
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)

    # Attach the Bayes Server JVM bridge to this thread before any model work.
    bayesianpy.jni.attach(logger)

    db_folder = bayesianpy.utils.get_path_to_parent_dir(__file__)
    iris = pd.read_csv(os.path.join(db_folder, "data/iris.csv"),
                       index_col=False)

    # Manually build the network, leaving out the 'iris_class' variable.
    # (The original passed iris.columns.tolist(), which still included
    # 'iris_class' even though the training DataSet below drops it — the
    # network now matches the data it is trained on.)
    network = bayesianpy.network.create_network()
    cluster = builder.create_cluster_variable(network, 4)
    node = builder.create_multivariate_continuous_node(
        network,
        iris.drop('iris_class', axis=1).columns.tolist(),
        "joint")
    builder.create_link(network, cluster, node)

    with bayesianpy.data.DataSet(iris.drop('iris_class', axis=1), db_folder,
                                 logger) as dataset:

        # build the 'normal' model on two of the classes
        model = bayesianpy.model.NetworkModel(network, logger)

        subset = dataset.subset(
            iris[(iris.iris_class == "Iris-versicolor") |
                 (iris.iris_class == "Iris-virginica")].index.tolist())

        model.train(subset)

        variables = [
            'sepal_length', 'sepal_width', 'petal_length', 'petal_width'
        ]

        # Query the trained model on all the data, including the
        # Iris-setosa class. The lower a sample's log-likelihood, the less
        # likely the model is to have generated that data point.
        results = model.batch_query(dataset,
                                    [bayesianpy.model.QueryModelStatistics()])
        # NOTE(review): plt.cm.get_cmap is deprecated in recent matplotlib
        # (removed in 3.9) — confirm the pinned matplotlib version.
        cmap = plt.cm.get_cmap('Blues_r')
        fig = plt.figure(figsize=(10, 10))
        k = 1
        # One scatter plot per unordered pair of variables, coloured by
        # the per-sample log-likelihood.
        for i, v in enumerate(variables):
            for j in range(i + 1, len(variables)):
                v1 = variables[j]
                ax = fig.add_subplot(3, 2, k)
                ax.set_title("{} vs {}".format(v, v1))
                h = ax.scatter(x=iris[v].tolist(),
                               y=iris[v1].tolist(),
                               c=results['loglikelihood'].tolist(),
                               vmin=results.loglikelihood.min(),
                               vmax=results.loglikelihood.max(),
                               cmap=cmap)
                k += 1

        # Shared colour bar for all six subplots.
        fig.subplots_adjust(right=0.8)
        cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
        fig.colorbar(h, cax=cbar_ax)
        plt.show()
# Example #4
def main():
    """Train a 3-cluster model on a train split of the iris data, weight each
    cluster by its classification accuracy on the test split, then sample new
    data points from the model and score them by accuracy-weighted cluster
    membership, plotting the scored samples over the original data.
    """
    logger = logging.getLogger()
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)

    # Attach the Bayes Server JVM bridge to this thread before any model work.
    bayesianpy.jni.attach(logger)

    db_folder = bayesianpy.utils.get_path_to_parent_dir(__file__)
    iris = pd.read_csv(os.path.join(db_folder, "data/iris.csv"),
                       index_col=False)

    network = bayesianpy.network.create_network()
    num_clusters = 3
    cluster = builder.create_cluster_variable(network, num_clusters)
    node = builder.create_multivariate_continuous_node(
        network,
        iris.drop('iris_class', axis=1).columns.tolist(), "joint")
    builder.create_link(network, cluster, node)

    class_variable = builder.create_discrete_variable(
        network, iris, 'iris_class', iris['iris_class'].unique())
    builder.create_link(network, cluster, class_variable)

    train, test = train_test_split(iris, test_size=0.7)

    # Train the model and query the most likely states and probability of
    # each latent state.
    with bayesianpy.data.DataSet(iris, db_folder, logger) as dataset:
        model = bayesianpy.model.NetworkModel(network, logger)
        model.train(dataset.subset(train.index.tolist()))

        test_subset = dataset.subset(test.index.tolist())

        results = model.batch_query(
            test_subset,
            # creates columns Cluster$$Cluster0, Cluster$$Cluster1,
            # Cluster$$Cluster2, as
            # suffix is set to an empty string.
            [
                bayesianpy.model.QueryStateProbability("Cluster", suffix=""),
                # creates column 'iris_class_maxlikelihood'
                bayesianpy.model.QueryMostLikelyState("iris_class"),
                # creates column 'Cluster_maxlikelihood'
                bayesianpy.model.QueryMostLikelyState("Cluster")
            ])

    cluster_accuracy = {}
    # Get a list of cluster accuracies, using the Bayes Server confusion
    # matrix class, weighted by the Cluster accuracy.
    with bayesianpy.data.DataSet(results, db_folder, logger) as resultset:
        for c in range(num_clusters):
            matrix = bayesianpy.jni.bayesServerAnalysis()\
                .ConfusionMatrix.create(resultset.create_data_reader_command(), "iris_class",
                                        "iris_class_maxlikelihood", "Cluster$$Cluster{}".format(c))
            cluster_accuracy.update(
                {'Cluster{}'.format(c): matrix.getAccuracy()})

    # Generate samples from the trained model, to give us some additional
    # testing data.
    samples = bayesianpy.model.Sampling(network).sample(num_samples=20).drop(
        ["Cluster", "iris_class"], axis=1)
    reader = bayesianpy.data.DataFrameReader(samples)
    inference = bayesianpy.model.InferenceEngine(network).create_engine()
    evidence = bayesianpy.model.Evidence(network, inference)
    query = bayesianpy.model.SingleQuery(network, inference, logger)
    query_type = [bayesianpy.model.QueryStateProbability('Cluster', suffix="")]

    # Query the expected Cluster membership, and generate a wrapper for
    # comparing the values, weighted by cluster membership.
    while reader.read():
        result = query.query(query_type,
                             evidence=evidence.apply(reader.to_dict()))
        cv_results = []
        for i, (key, value) in enumerate(result.items()):
            n = bayesianpy.network.Discrete.fromstring(key)
            weighting = cluster_accuracy[n.state]
            cv_results.append(bayesianpy.jni.bayesServerAnalysis().
                              DefaultCrossValidationTestResult(
                                  jp.JDouble(weighting),
                                  jp.JObject(value, jp.java.lang.Object),
                                  jp.java.lang.Double(jp.JDouble(value))))

        score = bayesianpy.jni.bayesServerAnalysis().CrossValidation.combine(
            jp.java.util.Arrays.asList(cv_results),
            bayesianpy.jni.bayesServerAnalysis().CrossValidationCombineMethod.
            WEIGHTED_AVERAGE)

        # Append the score on to the existing dataframe. DataFrame.set_value
        # was removed in pandas 1.0; .loc setting-with-enlargement creates
        # the 'score' column on first assignment and is the supported API.
        samples.loc[reader.get_index(), 'score'] = score

    variables = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

    # NOTE(review): plt.cm.get_cmap is deprecated in recent matplotlib
    # (removed in 3.9) — confirm the pinned matplotlib version.
    cmap = plt.cm.get_cmap('Blues')
    fig = plt.figure(figsize=(10, 10))
    k = 1
    # One scatter per unordered variable pair: faint original data underneath,
    # sampled points coloured by their weighted-membership score on top.
    for i, v in enumerate(variables):
        for j in range(i + 1, len(variables)):
            v1 = variables[j]
            ax = fig.add_subplot(3, 2, k)
            ax.set_title("{} vs {}".format(v, v1))
            ax.scatter(x=iris[v].tolist(),
                       y=iris[v1].tolist(),
                       facecolors='none',
                       alpha=0.1)
            h = ax.scatter(x=samples[v].tolist(),
                           y=samples[v1].tolist(),
                           c=samples['score'].tolist(),
                           vmin=samples.score.min(),
                           vmax=samples.score.max(),
                           cmap=cmap)
            k += 1

    # Shared colour bar for all six subplots.
    fig.subplots_adjust(right=0.8)
    cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
    fig.colorbar(h, cax=cbar_ax)
    plt.show()