import os

import numpy

from mlutils.datasets import loadMLC

OUTPUT_DIR = 'exp/mutualinfo/'

os.makedirs(OUTPUT_DIR, exist_ok=True)
mem_map_path = os.path.join(OUTPUT_DIR, 'mem.map')

((train, valid, test),
 feature_names,
 feature_types,
 domains) = loadMLC("autism", data_dir="datasets/autism/proc/unique")

nfeatures = len(feature_types)

#
# allocate the shared memmap the worker processes will write into
mem_map = numpy.memmap(mem_map_path, dtype='float', mode='w+',
                       shape=(nfeatures, nfeatures))

#
# emit one command per unordered feature pair (upper triangle only)
for i in range(nfeatures):
    for j in range(nfeatures):
        if j <= i:
            continue
        print("python3 experiments/mutualinformation/mitest.py '%s' %s %s" %
              (mem_map_path, i, j))
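#
# the script above only emits shell commands: each mitest.py invocation is
# expected to re-open the shared memmap and fill in a single cell. The real
# worker script is not shown here; what follows is a minimal sketch of the
# pattern, using scikit-learn's mutual_info_score as a stand-in estimator
# (it assumes discrete features and is not necessarily what mitest.py does)
import sys

import numpy
from sklearn.metrics import mutual_info_score

from mlutils.datasets import loadMLC

mem_map_path = sys.argv[1]
i, j = int(sys.argv[2]), int(sys.argv[3])

(train, _valid, _test), _names, feature_types, _domains = loadMLC(
    "autism", data_dir="datasets/autism/proc/unique")
nfeatures = len(feature_types)

# mode='r+' maps the existing file without truncating it, so workers
# writing to disjoint cells do not clobber each other's results
mem_map = numpy.memmap(mem_map_path, dtype='float', mode='r+',
                       shape=(nfeatures, nfeatures))
mem_map[i, j] = mutual_info_score(train[:, i], train[:, j])
mem_map.flush()  # force the value to disk before the process exits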
                                 dtype=numpy.float64)

orig_train, orig_valid, orig_test = orig_fold_splits[0]
orig_train_x, orig_train_y = orig_train
orig_valid_x, orig_valid_y = orig_valid
orig_test_x, orig_test_y = orig_test

#
# load augmented mnist in MLC format
dataset_name = args.dataset
logging.info('Looking for dataset {} ...in dir {}'.format(dataset_name,
                                                          args.data_dir))
(train, valid, test), feature_names, feature_types, domains = loadMLC(
    dataset_name, base_path='', data_dir=args.data_dir)
logging.info('Loaded\n\ttrain:\t{}\n\tvalid:\t{}\n\ttest:\t{}'.format(
    train.shape, valid.shape, test.shape))

load_start_t = perf_counter()
# spn = SPN.FromFile(args.spn)
spn = SPN.from_pickle(args.spn)
# spn = None
load_end_t = perf_counter()
# print(spn)
logging.info('spn loaded from pickle in {} secs'.format(load_end_t -
                                                        load_start_t))

logging.info('\n\nstructure stats:')
n_nodes = spn.n_nodes()
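#
# SPN.from_pickle above comes from the tfspn codebase and its internals are
# not shown here. A minimal sketch of the save/load pattern it presumably
# wraps, using only the standard pickle module (the function names below are
# illustrative, not tfspn's actual API)
import pickle


def save_model(model, path):
    # serialize the whole object graph in one shot
    with open(path, 'wb') as f:
        pickle.dump(model, f, protocol=pickle.HIGHEST_PROTOCOL)


def load_model(path):
    # unpickling re-imports the defining modules (e.g. tfspn.tfspn),
    # so they must be importable in the loading environment
    with open(path, 'rb') as f:
        return pickle.load(f)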
def test_piecewise_marginals():
    from mlutils.datasets import loadMLC
    from tfspn.piecewise import piecewise_linear_approximation, estimate_bins, \
        estimate_domains
    from tfspn.tfspn import SumNode, ProductNode, PoissonNode, GaussianNode, \
        BernoulliNode, PiecewiseLinearPDFNode

    #
    # loading australian (loadMLC returns the splits as train, valid, test)
    (train, valid, test), fnames, ftypes, domains = loadMLC('australian')
    print(train.shape)
    print(test.shape)
    print(valid.shape)

    for fn, ft, fd in zip(fnames, ftypes, domains):
        print(fn, ft, fd[:2], fd[-2:])

    #
    # some continuous features: A2, A3, A7, A10
    # c_feature_ids = [1, 2, 6, 9]
    c_feature_ids = [1]

    n_bins = 100
    for i in c_feature_ids:
        train_data = train[:, i]
        valid_data = valid[:, i]
        test_data = test[:, i]

        # pyplot.hist(train_data, bins=n_bins, alpha=0.4, label='train', normed=True)
        # pyplot.hist(valid_data, bins=n_bins, alpha=0.4, label='valid', normed=True)
        # pyplot.hist(test_data, bins=n_bins, alpha=0.4, label='test', normed=True)
        # pyplot.legend(loc='upper right')
        # pyplot.show()

        #
        # creating a piecewise node
        print('looking at feature', i, ftypes[i], domains[i])
        # bins = estimate_bins(train_data, ftypes[i], domains[i])
        print(train_data.min(), train_data.max())
        # print('computed bins', bins)

        smoothing = 1
        print('domains', domains[i])
        bins = estimate_bins(train_data, ftypes[i], [domains[i]])
        print('bins from domains', bins)

        x_range, y_range = piecewise_linear_approximation(train_data,
                                                          bins=bins,
                                                          family=ftypes[i],
                                                          alpha=smoothing,
                                                          # isotonic=True,
                                                          # n_bootstraps=nb,
                                                          # average_bootstraps=False
                                                          )

        # print("PiecewiseLinearPDFNode")
        node = PiecewiseLinearPDFNode("PiecewiseLinearNode_{}".format(i),
                                      i, fnames[i], domains[i],
                                      x_range, y_range)
        print(node)

        #
        # compute likelihoods on all splits
        train_lls = node.eval(train)
        valid_lls = node.eval(valid)
        test_lls = node.eval(test)

        print('TRAIN LL:', train_lls.mean())
        print('VALID LL:', valid_lls.mean())
        print('TEST LL:', test_lls.mean())

        #
        # inspect instances far out in the right tail of the train support
        v_ids = valid_data > 76.75
        print(sum(v_ids), valid_lls[v_ids])

        t_ids = test_data > 76.75
        print(sum(t_ids), test_lls[t_ids])
        print(test_lls)
        print(test_lls[~t_ids].mean())
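#
# piecewise_linear_approximation and estimate_bins are tfspn internals; the
# idea they implement is turning a smoothed histogram into a continuous,
# piecewise-linear density. A self-contained numpy sketch of that idea
# follows (bin placement and smoothing here are simplifications, not
# tfspn's actual scheme)
import numpy


def histogram_to_piecewise_pdf(data, n_bins=100, alpha=1.0):
    # histogram with additive (Laplace) smoothing alpha, renormalized so
    # the underlying histogram integrates to one (the linear interpolant
    # built on top of it is only approximately normalized)
    counts, edges = numpy.histogram(data, bins=n_bins)
    counts = counts + alpha
    widths = numpy.diff(edges)
    densities = counts / (counts.sum() * widths)
    # knots at bin centers: neighboring bins get linearly bridged
    x_range = (edges[:-1] + edges[1:]) / 2.0
    return x_range, densities


def piecewise_log_pdf(x, x_range, y_range):
    # linear interpolation between knots; zero density outside the support
    y = numpy.interp(x, x_range, y_range, left=0.0, right=0.0)
    return numpy.log(y + 1e-300)  # guard against log(0)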
# PRODUCT_FIRST = [False, True]

OUTPUT_DIR = './exp/learnspn/'

#
# opening the memmap
for dataset in DATASETS:
    for bins in BINNING_METHODS:
        exp_id = 0

        (train, valid, test), _f_names, _f_types, _f_domains = loadMLC(
            dataset, base_path=DATA_DIR, data_dir=bins)

        data_dir = os.path.join(DATA_DIR, bins)
        output_dir = os.path.join(OUTPUT_DIR, bins)

        # mem_map_base_path = os.path.join(BASE_PATH, dataset, bins)
        mem_map_base_path = os.path.join(output_dir, dataset)
        mem_map_path = os.path.join(mem_map_base_path, 'mem.map')
        os.makedirs(mem_map_base_path, exist_ok=True)

        configurations = itertools.product(ROW_SPLIT_METHODS,
                                           COL_SPLIT_METHODS,
                                           MIN_INST_SLICES,
                                           ALPHAS,
                                           LEAVES,
                                           PRIOR_WEIGHTS,
                                           # ISOTONIC,
                                           # N_BOOTSTRAPS,
                                           # AVERAGE_BOOTSTRAPS,
                                           )
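        #
        # each tuple from configurations is one grid-search job; the repo's
        # actual runner is not shown, so this consumer is only illustrative
        # (the dict keys are labels for readability, not real CLI flags)
        for (row_split, col_split, min_inst_slices,
             alpha, leaf, prior_weight) in configurations:
            config = {'dataset': dataset,
                      'bins': bins,
                      'row_split': row_split,
                      'col_split': col_split,
                      'min_inst_slices': min_inst_slices,
                      'alpha': alpha,
                      'leaf': leaf,
                      'prior_weight': prior_weight,
                      'output': os.path.join(mem_map_base_path, str(exp_id))}
            print(config)
            exp_id += 1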
# dsname = "breast"
# dsname = "cars"
# dsname = "cleve"
# dsname = "crx"
# dsname = "diabetes"
# dsname = "german-org"
# dsname = "glass"
# dsname = "glass2"
# dsname = "heart"
# dsname = "iris"

result = []

# , "balance-scale", "breast", "cars", "cleve", "crx", "diabetes", "german-org", "glass", "glass2", "heart", "iris"]:
for dsname in ["auto"]:
    # loadMLC returns the splits as (train, valid, test)
    (train, valid, test), feature_names, feature_types, domains = loadMLC(
        dsname, data_dir='datasets/MLC/proc-db/proc/auto/')

    #
    # train = train[:, (1, 9)]
    # test = test[:, (1, 9)]
    # valid = valid[:, (1, 9)]
    # feature_names = [feature_names[1], feature_names[9]]
    # feature_types = [feature_types[1], feature_types[9]]
    # domains = [domains[1], domains[9]]

    # import matplotlib.pyplot as plt
    #
    # fig = plt.figure()
    # ax = fig.add_subplot(111)
    #
    # plt.hist(train[:, 0], bins=100, histtype='stepfilled', normed=True, color='b', label='Gaussian')
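    #
    # note: matplotlib has removed the normed= keyword used in the commented
    # lines above; density=True is the current equivalent. A version of the
    # same histogram check that runs on current matplotlib:
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots()
    ax.hist(train[:, 0], bins=100, histtype='stepfilled',
            density=True, color='b', label='Gaussian')
    ax.legend(loc='upper right')
    plt.show()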