cluster_penalty=cluster_penalty, n_cluster_splits=args.n_row_clusters, n_iters=args.n_iters, n_restarts=args.n_restarts, sklearn_args=sklearn_args, cltree_leaves=cltree_leaves, rand_gen=numpy_rand_gen) learn_start_t = perf_counter() # # build an spn on the training set #spn = learner.fit_structure(data=train, # feature_sizes=features) spn = learner.fit_structure_bagging(data=train, feature_sizes=features, n_components=10) learn_end_t = perf_counter() print('Structure learned in', learn_end_t - learn_start_t, 'secs') # # print(spn) # # gathering statistics n_edges = spn.n_edges() n_levels = spn.n_layers() n_weights = spn.n_weights() n_leaves = spn.n_leaves()
def learn_model(self, cltree_leaves, args, comp, bgg):
    """Grid-search LearnSPN hyper-parameters and return the best model.

    For every combination of ``g_factor``, ``cluster_penalty`` and
    ``min_instances_slice`` a structure is learned on ``self.train``; each
    structure is then smoothed with every candidate ``alpha`` and scored by
    its average log-likelihood on the training set. The combination with
    the highest training average log-likelihood wins (first one wins on
    ties).

    Args:
        cltree_leaves: Forwarded to ``LearnSPN``. Per the original note,
            ``True`` learns the AC model and ``False`` the AL model.
        args: Optional sequence ``(g_factor, min_inst_slice, alpha)``
            fixing a single configuration. When falsy, the default grid
            (g_factor in 5/10/15, min_inst_slice in 10/50/100, alpha in
            0.1/0.5/1.0/2.0) is searched.
        comp: Number of components passed to ``fit_structure_bagging``;
            only used when ``bgg`` is truthy.
        bgg: When truthy the structure is learned with
            ``fit_structure_bagging``, otherwise with ``fit_structure``.

    Returns:
        A tuple ``(best_spn, g_factor, min_inst_slice, alpha)`` where
        ``best_spn`` is smoothed with the returned ``alpha``.

    Raises:
        ValueError: If no candidate achieved a training log-likelihood
            greater than ``NEG_INF`` (e.g. every score was NaN), so there
            is no model to return.
    """
    print('-------MODELS CONSTRUCTION-----------')

    # Fixed learning configuration.
    verbose = 1
    n_row_clusters = 2
    cluster_method = 'GMM'
    seed = 1337
    n_iters = 100
    n_restarts = 4
    cluster_penalties = [1.0]
    # No extra scikit-learn arguments are configured for the clustering.
    sklearn_args = {}

    if not args:
        g_factors = [5, 10, 15]
        min_inst_slices = [10, 50, 100]
        alphas = [0.1, 0.5, 1.0, 2.0]
    else:
        g_factors = [args[0]]
        min_inst_slices = [args[1]]
        alphas = [args[2]]

    # Setting verbosity level.
    if verbose == 1:
        logging.basicConfig(level=logging.INFO)
    elif verbose == 2:
        logging.basicConfig(level=logging.DEBUG)

    # A single seeded generator is shared by all learners so that the
    # whole grid search is reproducible.
    numpy_rand_gen = numpy.random.RandomState(seed)

    train = self.train
    n_train = train.shape[0]

    # Only the feature sizes are needed by the structure learner.
    _, features = dataset.data_2_freqs(train)

    best_train_avg_ll = NEG_INF
    best_spn = None
    best_params = {}

    for g_factor in g_factors:
        for cluster_penalty in cluster_penalties:
            for min_inst_slice in min_inst_slices:
                print('model')

                learner = LearnSPN(
                    g_factor=g_factor,
                    min_instances_slice=min_inst_slice,
                    row_cluster_method=cluster_method,
                    cluster_penalty=cluster_penalty,
                    n_cluster_splits=n_row_clusters,
                    n_iters=n_iters,
                    n_restarts=n_restarts,
                    sklearn_args=sklearn_args,
                    cltree_leaves=cltree_leaves,
                    rand_gen=numpy_rand_gen)

                # Build an SPN on the training set.
                if bgg:
                    spn = learner.fit_structure_bagging(
                        data=train,
                        feature_sizes=features,
                        n_components=comp)
                else:
                    spn = learner.fit_structure(
                        data=train, feature_sizes=features)

                # Smoothing can be done after the SPN has been built, so
                # one structure is re-smoothed in place for every alpha.
                for alpha in alphas:
                    spn.smooth_leaves(alpha)

                    # Average log-likelihood on the training set.
                    train_ll = 0.0
                    for instance in train:
                        (pred_ll, ) = spn.eval(instance)
                        train_ll += pred_ll
                    train_avg_ll = train_ll / n_train

                    if train_avg_ll > best_train_avg_ll:
                        best_train_avg_ll = train_avg_ll
                        # Keep the model object itself rather than a list
                        # index, so the winner can never get out of sync
                        # with its recorded hyper-parameters.
                        best_spn = spn
                        best_params = {
                            'alpha': alpha,
                            'min_inst_slice': min_inst_slice,
                            'g_factor': g_factor,
                            'cluster_penalty': cluster_penalty,
                            'train_ll': train_avg_ll,
                        }

    if best_spn is None:
        raise ValueError(
            'Grid search did not select any model: no configuration '
            'reached a training log-likelihood above NEG_INF')

    # Every structure ends the inner loop smoothed with the LAST alpha
    # tried. Re-apply the winning alpha so that the returned model matches
    # the returned hyper-parameters. This relies on smooth_leaves being
    # re-applicable with a new alpha, which the search loop above already
    # assumes.
    if best_params['alpha'] != alphas[-1]:
        best_spn.smooth_leaves(best_params['alpha'])

    return (best_spn, best_params['g_factor'],
            best_params['min_inst_slice'], best_params['alpha'])