valid_preds = evaluate_on_dataset(spn, valid)
assert valid_preds.shape[0] == valid.shape[0]
valid_avg_ll = numpy.mean(valid_preds)
logging.info('\t{}'.format(valid_avg_ll))

if test is not None:
    logging.info('Evaluating on test set')
    test_preds = evaluate_on_dataset(spn, test)
    assert test_preds.shape[0] == test.shape[0]
    test_avg_ll = numpy.mean(test_preds)
    logging.info('\t{}'.format(test_avg_ll))

#
# writing to file
stats = stats_format([train_avg_ll, valid_avg_ll, test_avg_ll],
                     '\t',
                     digits=5)
out_log.write(stats + '\n')
out_log.flush()

#
# also serializing the split predictions
train_lls_path = os.path.join(out_path, TRAIN_PREDS_EXT)
numpy.savetxt(train_lls_path, train_preds, delimiter='\n')

if valid is not None:
    valid_lls_path = os.path.join(out_path, VALID_PREDS_EXT)
    numpy.savetxt(valid_lls_path, valid_preds, delimiter='\n')

if test is not None:
    test_lls_path = os.path.join(out_path, TEST_PREDS_EXT)
    numpy.savetxt(test_lls_path, test_preds, delimiter='\n')
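
#
# Hedged sketch (not the repository's actual helper): a minimal stats_format
# consistent with how it is called above, joining the given values into one
# separator-delimited string and rounding floats to `digits` decimals.
def _stats_format_sketch(stats_list, separator, digits=5):
    fmt = '{{:.{}f}}'.format(digits)
    return separator.join(fmt.format(s) if isinstance(s, float) else str(s)
                          for s in stats_list)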
#
# checking for improvements on validation
if valid_avg_pll > best_valid_avg_pll:
    best_valid_avg_pll = valid_avg_pll
    best_model = rbm
    best_params['n-hidden'] = n_hidden
    best_params['learning-rate'] = l_rate
    best_params['batch-size'] = batch_size
    best_params['n-iters'] = n_iters
    best_test_plls = test_plls

    #
    # saving the model
    if args.save_model:
        prefix_str = stats_format([n_hidden, l_rate, batch_size, n_iters],
                                  '_',
                                  digits=5)
        model_path = os.path.join(out_path,
                                  'best.{0}.{1}'.format(dataset_name,
                                                        MODEL_EXT))
        with open(model_path, 'wb') as model_file:
            pickle.dump(rbm, model_file)
            logging.info('Dumped RBM to {}'.format(model_path))

#
# writing to file a line for the grid
stats = stats_format([n_hidden, l_rate, batch_size, n_iters,
                      train_avg_pll, valid_avg_pll, test_avg_pll],
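
#
# Hedged usage sketch: reloading an RBM dumped as above with pickle.load.
# The helper name is hypothetical; only the pickle round-trip is assumed.
import pickle


def _load_best_model_sketch(model_path):
    with open(model_path, 'rb') as model_file:
        return pickle.load(model_file)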
best_state['min-inst-slice'] = min_inst_slice
best_state['g-factor'] = g_factor
best_state['cluster-penalty'] = cluster_penalty
best_state['train_ll'] = train_avg_ll
best_state['valid_ll'] = valid_avg_ll
best_state['test_ll'] = test_avg_ll
best_test_lls = test_lls

#
# writing to file a line for the grid
stats = stats_format([g_factor, cluster_penalty, min_inst_slice, alpha,
                      n_edges, n_levels, n_weights, n_leaves,
                      train_avg_ll, valid_avg_ll, test_avg_ll],
                     '\t',
                     digits=5)
out_log.write(stats + '\n')
out_log.flush()

#
# writing as last line the best params
out_log.write("{0}".format(best_state))
out_log.flush()

#
# saving the best test_lls
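
#
# Hedged sketch of the step announced by the comment above: the best
# per-instance test LLs can be persisted with numpy.savetxt, one value per
# line, mirroring how the split predictions are serialized. The file name
# 'best.test.lls' is hypothetical.
import os

import numpy


def _save_best_test_lls_sketch(out_path, best_test_lls):
    lls_path = os.path.join(out_path, 'best.test.lls')
    numpy.savetxt(lls_path, best_test_lls, delimiter='\n')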
best_state['alpha'] = alpha
best_state['min-inst-slice'] = min_inst_slice
best_state['g-factor'] = g_factor
best_state['cluster-penalty'] = cluster_penalty
best_state['train_ll'] = train_avg_ll
best_state['index'] = index

index += 1

#
# writing to file a line for the grid
stats = stats_format([g_factor, cluster_penalty, min_inst_slice, alpha,
                      n_edges, n_levels, n_weights, n_leaves,
                      train_avg_ll, valid_avg_ll, test_avg_ll],
                     '\t',
                     digits=5)
out_log.write(stats + '\n')
out_log.flush()

#
# writing as last line the best params
out_log.write("{0}".format(best_state))
out_log.flush()

#
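
#
# Hedged sketch: since the grid log ends with the repr of best_state, the
# chosen configuration can be read back with ast.literal_eval. The helper
# name is hypothetical; it assumes the dict contains only literal values.
import ast


def _read_best_state_sketch(log_path):
    with open(log_path, 'r') as log_file:
        last_line = log_file.readlines()[-1]
    return ast.literal_eval(last_line)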
for i in range(len(fold_splits)):

    train_score = train_a_lls[i] if train_a_lls else NEG_INF
    valid_score = valid_a_lls[i] if valid_a_lls else NEG_INF
    test_score = test_a_lls[i] if test_a_lls else NEG_INF

    #
    # writing to file a line for the grid
    stats = stats_format([g_factor, cluster_penalty, min_inst_slice, alpha,
                          entropy_threshold,
                          percentage_rand_features,
                          percentage_instances,
                          i,
                          fold_params[i]['n_edges'],
                          fold_params[i]['n_levels'],
                          fold_params[i]['n_weights'],
                          fold_params[i]['n_params'],
                          fold_params[i]['n_leaves'],
                          fold_params[i]['n_sums'],
                          fold_params[i]['n_prods'],
                          fold_params[i]['n_unpruned_sums'],
                          fold_params[i]['n_unpruned_prods'],
                          fold_params[i]['n_scopes'],
                          fold_params[i]['time'],
                          fold_params[i]['prod_time'],
                          fold_params[i]['sum_time'],
                          fold_params[i]['tot_prod_time'],
                          fold_params[i]['tot_sum_time'],
                          train_alpha_times[alpha][i],
                          valid_alpha_times[alpha][i],
                          test_alpha_times[alpha][i],
                          train_score,
                          valid_score,
                          test_score],
                         '\t',
                         digits=5)
    out_log.write(stats + '\n')
    out_log.flush()

#
# when doing cross-validation there is no held-out validation split,
# so the average test-fold LL is used as the model selection score
if args.cv is not None:
    valid_avg_ll = test_avg_ll

if valid_avg_ll > best_avg_ll:
split_plls = rbm.score_samples(split)
eval_e_t = perf_counter()

split_avg_pll = numpy.mean(split_plls)
logging.info('\t{} avg PLL: {} ({})'.format(SPLIT_NAMES[i],
                                            split_avg_pll,
                                            eval_e_t - eval_s_t))
fold_scores[f, i] = split_avg_pll

#
# writing to file a line for the grid
stats = stats_format([n_hidden, l_rate, batch_size, n_iters, f,
                      fold_scores[f, 0],
                      fold_scores[f, 1],
                      fold_scores[f, 2]],
                     '\t',
                     digits=5)
out_log.write(stats + '\n')
out_log.flush()

# eval_s_t = perf_counter()
# train_plls = rbm.score_samples(train)
# eval_e_t = perf_counter()
# train_avg_pll = numpy.mean(train_plls)
# logging.info('\tTrain avg PLL: {} ({})'.format(train_avg_pll,
#                                                eval_e_t - eval_s_t))
#
#
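
#
# Hedged, self-contained sketch of the per-split PLL evaluation pattern above.
# It assumes rbm exposes score_samples() returning per-instance pseudo
# log-likelihoods, as sklearn.neural_network.BernoulliRBM does; the fragment
# itself does not show how rbm is constructed.
from time import perf_counter

import numpy


def _avg_pll_sketch(rbm, split):
    eval_s_t = perf_counter()
    split_plls = rbm.score_samples(split)
    eval_e_t = perf_counter()
    return numpy.mean(split_plls), eval_e_t - eval_s_t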
test_avg_ll = numpy.mean(test_a_lls)

for i in range(len(fold_splits)):

    train_score = train_a_lls[i] if train_a_lls else NEG_INF
    valid_score = valid_a_lls[i] if valid_a_lls else NEG_INF
    test_score = test_a_lls[i] if test_a_lls else NEG_INF

    #
    # writing to file a line for the grid
    stats = stats_format([g_factor, cluster_penalty, min_inst_slice, alpha,
                          i,
                          fold_params[i]['n_edges'],
                          fold_params[i]['n_levels'],
                          fold_params[i]['n_weights'],
                          fold_params[i]['n_leaves'],
                          fold_params[i]['n_sums'],
                          fold_params[i]['n_prods'],
                          fold_params[i]['n_scopes'],
                          fold_params[i]['time'],
                          train_alpha_times[alpha][i],
                          valid_alpha_times[alpha][i],
                          test_alpha_times[alpha][i],
                          train_score,
                          valid_score,
                          test_score],
                         '\t',
                         digits=5)
    out_log.write(stats + '\n')
    out_log.flush()

if args.cv is not None:
    valid_avg_ll = test_avg_ll

if valid_avg_ll > best_avg_ll:
    best_avg_ll = valid_avg_ll
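
#
# Hedged sketch of the selection criterion applied above: under --cv there is
# no held-out validation split, so the average test-fold LL stands in for the
# validation score. The helper name and signature are hypothetical.
def _selection_score_sketch(valid_avg_ll, test_avg_ll, use_cv):
    return test_avg_ll if use_cv else valid_avg_ll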