Exemple #1
0
import os

import numpy

from mlutils.datasets import loadMLC

#
# output location of the memory-mapped matrix shared with the printed commands
OUTPUT_DIR = 'exp/mutualinfo/'
os.makedirs(OUTPUT_DIR, exist_ok=True)
mem_map_path = os.path.join(OUTPUT_DIR, 'mem.map')

#
# only the number of features is needed from the dataset
splits, feature_names, feature_types, domains = loadMLC(
    "autism", data_dir="datasets/autism/proc/unique")
train, valid, test = splits

nfeatures = len(feature_types)

#
# create (or overwrite) the nfeatures x nfeatures file-backed matrix
mem_map = numpy.memmap(mem_map_path,
                       mode='w+',
                       dtype='float',
                       shape=(nfeatures, nfeatures))

#
# print one command line per unordered feature pair (i < j)
command_template = "python3 experiments/mutualinformation/mitest.py '%s' %s %s"
for i in range(nfeatures):
    for j in range(i + 1, nfeatures):
        print(command_template % (mem_map_path, i, j))
Exemple #2
0
                                              dtype=numpy.float64)

#
# take the first entry of the original fold splits; every partition is
# unpacked into its two components (named x and y)
orig_train, orig_valid, orig_test = orig_fold_splits[0]
((orig_train_x, orig_train_y),
 (orig_valid_x, orig_valid_y),
 (orig_test_x, orig_test_y)) = orig_train, orig_valid, orig_test

#
# load augmented mnist in MLC format
dataset_name = args.dataset
logging.info(
    'Looking for dataset {} ...in dir {}'.format(dataset_name, args.data_dir))

mlc_splits, feature_names, feature_types, domains = loadMLC(
    dataset_name, base_path='', data_dir=args.data_dir)
train, valid, test = mlc_splits

logging.info(
    'Loaded\n\ttrain:\t{}\n\tvalid:\t{}\n\ttest:\t{}'.format(
        train.shape, valid.shape, test.shape))

#
# load the model, timing only the deserialization call
# NOTE(review): presumably this unpickles the file at args.spn -- only point
# it at trusted files
load_start_t = perf_counter()
spn = SPN.from_pickle(args.spn)
load_end_t = perf_counter()
logging.info(
    'spn loaded from pickle in {} secs'.format(load_end_t - load_start_t))

logging.info('\n\nstructure stats:')
n_nodes = spn.n_nodes()
def test_piecewise_marginals():
    """Fit a piecewise-linear node on single 'australian' features and print diagnostics.

    For every selected feature column: estimate bins from the training
    column, build a PiecewiseLinearPDFNode from the piecewise linear
    approximation, evaluate it on the three splits and print the mean of the
    returned values (labelled "LL" in the output), followed by the values for
    the rows whose feature exceeds a fixed cutoff.

    Nothing is asserted; the printed output is meant to be inspected by hand.
    """
    from mlutils.datasets import loadMLC
    from tfspn.piecewise import piecewise_linear_approximation, estimate_bins, estimate_domains
    from tfspn.tfspn import SumNode, ProductNode, PoissonNode, GaussianNode, BernoulliNode, \
        PiecewiseLinearPDFNode

    #
    # loading australian
    splits, fnames, ftypes, domains = loadMLC('australian')
    train, test, valid = splits

    for part in (train, test, valid):
        print(part.shape)

    for name, ftype, domain in zip(fnames, ftypes, domains):
        print(name, ftype, domain[:2], domain[-2:])

    #
    # some continuous features
    # A2, A3, A7, A10 would be ids [1, 2, 6, 9]; only the first one is used
    c_feature_ids = [1]

    smoothing = 1
    # rows above this feature value are inspected separately below
    cutoff = 76.75

    for feat_id in c_feature_ids:

        train_data, valid_data, test_data = (part[:, feat_id]
                                             for part in (train, valid, test))
        feat_type = ftypes[feat_id]
        feat_domain = domains[feat_id]

        #
        # creating a piecewise node
        print('looking at feature', feat_id, feat_type, feat_domain)
        print(train_data.min(), train_data.max())
        print('domains', feat_domain)
        bins = estimate_bins(train_data, feat_type, [feat_domain])
        print('bins from domains', bins)

        x_range, y_range = piecewise_linear_approximation(train_data,
                                                          bins=bins,
                                                          family=feat_type,
                                                          alpha=smoothing)
        node_name = "PiecewiseLinearNode_{}".format(feat_id)
        node = PiecewiseLinearPDFNode(node_name, feat_id, fnames[feat_id],
                                      feat_domain, x_range, y_range)
        print(node)

        #
        # compute likelihoods
        train_lls, valid_lls, test_lls = (node.eval(part)
                                          for part in (train, valid, test))
        for label, lls in (('TRAIN LL:', train_lls),
                           ('VALID LL:', valid_lls),
                           ('TEST LL:', test_lls)):
            print(label, lls.mean())

        #
        # values for the rows above the cutoff
        v_ids = valid_data > cutoff
        print(sum(v_ids), valid_lls[v_ids])

        t_ids = test_data > cutoff
        print(sum(t_ids), test_lls[t_ids])
        print(test_lls)
        print(test_lls[~t_ids].mean())
Exemple #4
0
# PRODUCT_FIRST = [False, True]

# root directory under which one sub-directory per binning method is used
OUTPUT_DIR = './exp/learnspn/'

#
# for every (dataset, binning method) pair: load the data and prepare the
# directory and path of its 'mem.map' file
# NOTE(review): the visible part only builds the path and creates the
# directory; the memmap itself is not opened here

for dataset in DATASETS:

    for bins in BINNING_METHODS:

        # restarted at 0 for every (dataset, binning method) pair
        exp_id = 0

        # the binning method name is passed to loadMLC as the data directory
        (train, valid,
         test), _f_names, _f_types, _f_domains = loadMLC(dataset,
                                                         base_path=DATA_DIR,
                                                         data_dir=bins)
        # input and output directories are both keyed by the binning method name
        data_dir = os.path.join(DATA_DIR, bins)
        output_dir = os.path.join(OUTPUT_DIR, bins)

        # mem_map_base_path = os.path.join(BASE_PATH, dataset, bins)
        # resulting layout: <OUTPUT_DIR>/<bins>/<dataset>/mem.map
        mem_map_base_path = os.path.join(output_dir, dataset)
        mem_map_path = os.path.join(mem_map_base_path, 'mem.map')
        # exist_ok=True: an already existing directory is not an error
        os.makedirs(mem_map_base_path, exist_ok=True)

        configurations = itertools.product(
            ROW_SPLIT_METHODS, COL_SPLIT_METHODS, MIN_INST_SLICES, ALPHAS,
            LEAVES, PRIOR_WEIGHTS
            # ISOTONIC,
            # N_BOOTSTRAPS,
            # AVERAGE_BOOTSTRAPS,
Exemple #5
0
# dsname = "breast"
# dsname = "cars"
# dsname = "cleve"
# dsname = "crx"
# dsname = "diabetes"
# dsname = "german-org"
# dsname = "glass"
# dsname = "glass2"
# dsname = "heart"
# dsname = "iris"

# NOTE(review): nothing is appended to this list in the visible part of the script
result = []
# other dataset names that were (or could be) part of the list below:
# , "balance-scale", "breast", "cars", "cleve", "crx", "diabetes", "german-org", "glass", "glass2", "heart", "iris"]:
for dsname in ["auto"]:

    # NOTE(review): data_dir is hard-coded to the 'auto' directory, so it
    # presumably has to change if more names are added to the list above --
    # confirm against loadMLC
    (train, test, valid), feature_names, feature_types, domains = loadMLC(
        dsname, data_dir='datasets/MLC/proc-db/proc/auto/')

    #
    # train = train[:,(1,9)]
    # test = test[:,(1,9)]
    # valid = valid[:,(1,9)]
    # feature_names = [feature_names[1], feature_names[9]]
    # feature_types = [feature_types[1], feature_types[9]]
    # domains = [domains[1], domains[9]]
    #
    # import matplotlib.pyplot as plt
    #
    # fig = plt.figure()
    # ax = fig.add_subplot(111)
    #
    # plt.hist(train[:,0], bins=100, histtype='stepfilled', normed=True, color='b', label='Gaussian')