Example #1
        if valid is not None:
            logging.info('Evaluating on valid set')
            valid_preds = evaluate_on_dataset(spn, valid)
            assert valid_preds.shape[0] == valid.shape[0]
            valid_avg_ll = numpy.mean(valid_preds)
            logging.info('\t{}'.format(valid_avg_ll))

        if test is not None:
            logging.info('Evaluating on test set')
            test_preds = evaluate_on_dataset(spn, test)
            assert test_preds.shape[0] == test.shape[0]
            test_avg_ll = numpy.mean(test_preds)
            logging.info('\t{}'.format(test_avg_ll))

        #
        # writing to file
        stats = stats_format([train_avg_ll, valid_avg_ll, test_avg_ll],
                             '\t',
                             digits=5)
        out_log.write(stats + '\n')
        out_log.flush()

        #
        # also serializing the split predictions
        train_lls_path = os.path.join(out_path, TRAIN_PREDS_EXT)
        numpy.savetxt(train_lls_path, train_preds, delimiter='\n')

        if valid is not None:
            valid_lls_path = os.path.join(out_path, VALID_PREDS_EXT)
            numpy.savetxt(valid_lls_path, valid_preds, delimiter='\n')

        if test is not None:
            test_lls_path = os.path.join(out_path, TEST_PREDS_EXT)
            numpy.savetxt(test_lls_path, test_preds, delimiter='\n')
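Every example on this page builds its log lines with a project-local stats_format helper, whose source is not shown here. A minimal sketch consistent with how it is called above (a list of values, a separator string, and a digits keyword fixing float precision) could look like the following; this is an assumption about its behavior, not the actual implementation:

# Hypothetical sketch of stats_format, inferred from the call sites above.
def stats_format(values, separator, digits=5):
    formatted = []
    for value in values:
        if isinstance(value, float):
            # format floats with a fixed number of decimal digits
            formatted.append('{0:.{1}f}'.format(value, digits))
        else:
            formatted.append(str(value))
    return separator.join(formatted)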
Example #2
                        #
                        # checking for improvements on validation
                        if valid_avg_pll > best_valid_avg_pll:
                            best_valid_avg_pll = valid_avg_pll
                            best_model = rbm
                            best_params['n-hidden'] = n_hidden
                            best_params['learning-rate'] = l_rate
                            best_params['batch-size'] = batch_size
                            best_params['n-iters'] = n_iters
                            best_test_plls = test_plls

                            #
                            # saving the model
                            if args.save_model:
                                prefix_str = stats_format(
                                    [n_hidden, l_rate, batch_size, n_iters],
                                    '_',
                                    digits=5)
                                model_path = os.path.join(
                                    out_path, 'best.{0}.{1}'.format(
                                        dataset_name, MODEL_EXT))
                                with open(model_path, 'wb') as model_file:
                                    pickle.dump(rbm, model_file)
                                    logging.info(
                                        'Dumped RBM to {}'.format(model_path))

                        #
                        # writing to file a line for the grid
                        stats = stats_format([
                            n_hidden, l_rate, batch_size, n_iters,
                            train_avg_pll, valid_avg_pll, test_avg_pll
                        ],
                                             '\t',
                                             digits=5)
                        out_log.write(stats + '\n')
                        out_log.flush()
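Example #2 serializes the best RBM with pickle.dump. For completeness, the saved model can later be restored with the standard pickle.load counterpart; the path and variable names below are illustrative, not taken from the original script:

import pickle

# Reload the best model that was dumped above (model_path is illustrative).
with open(model_path, 'rb') as model_file:
    best_rbm = pickle.load(model_file)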
Example #3
                        best_state['min-inst-slice'] = min_inst_slice
                        best_state['g-factor'] = g_factor
                        best_state['cluster-penalty'] = cluster_penalty
                        best_state['train_ll'] = train_avg_ll
                        best_state['valid_ll'] = valid_avg_ll
                        best_state['test_ll'] = test_avg_ll
                        best_test_lls = test_lls

                    #
                    # writing to file a line for the grid
                    stats = stats_format([g_factor,
                                          cluster_penalty,
                                          min_inst_slice,
                                          alpha,
                                          n_edges, n_levels,
                                          n_weights, n_leaves,
                                          train_avg_ll,
                                          valid_avg_ll,
                                          test_avg_ll],
                                         '\t',
                                         digits=5)
                    out_log.write(stats + '\n')
                    out_log.flush()

    #
    # writing as last line the best params
    out_log.write("{0}".format(best_state))
    out_log.flush()

    #
    # saving the best test_lls
Example #4
                                best_state['alpha'] = alpha
                                best_state['min-inst-slice'] = min_inst_slice
                                best_state['g-factor'] = g_factor
                                best_state['cluster-penalty'] = cluster_penalty
                                best_state['train_ll'] = train_avg_ll
                                best_state['index'] = index

                            index += 1
                            #
                            # writing to file a line for the grid
                            stats = stats_format([g_factor,
                                                  cluster_penalty,
                                                  min_inst_slice,
                                                  alpha,
                                                  n_edges, n_levels,
                                                  n_weights, n_leaves,
                                                  train_avg_ll,
                                                  valid_avg_ll,
                                                  test_avg_ll],
                                                 '\t',
                                                 digits=5)
                            out_log.write(stats + '\n')
                            out_log.flush()

            #
            # writing as last line the best params
            out_log.write("{0}".format(best_state))
            out_log.flush()

Example #5
        for i in range(len(fold_splits)):
            train_score = train_a_lls[i] if train_a_lls else NEG_INF
            valid_score = valid_a_lls[i] if valid_a_lls else NEG_INF
            test_score = test_a_lls[i] if test_a_lls else NEG_INF
            #
            # writing to file a line for the grid
            stats = stats_format([
                g_factor, cluster_penalty, min_inst_slice, alpha,
                entropy_threshold, percentage_rand_features,
                percentage_instances, i, fold_params[i]['n_edges'],
                fold_params[i]['n_levels'], fold_params[i]['n_weights'],
                fold_params[i]['n_params'], fold_params[i]['n_leaves'],
                fold_params[i]['n_sums'], fold_params[i]['n_prods'],
                fold_params[i]['n_unpruned_sums'],
                fold_params[i]['n_unpruned_prods'], fold_params[i]['n_scopes'],
                fold_params[i]['time'], fold_params[i]['prod_time'],
                fold_params[i]['sum_time'], fold_params[i]['tot_prod_time'],
                fold_params[i]['tot_sum_time'], train_alpha_times[alpha][i],
                valid_alpha_times[alpha][i], test_alpha_times[alpha][i],
                train_score, valid_score, test_score
            ],
                                 '\t',
                                 digits=5)
            out_log.write(stats + '\n')
            out_log.flush()

        if args.cv is not None:
            valid_avg_ll = test_avg_ll

        if valid_avg_ll > best_avg_ll:
Example #6
                                    eval_s_t = perf_counter()
                                    split_plls = rbm.score_samples(split)
                                    eval_e_t = perf_counter()
                                    split_avg_pll = numpy.mean(split_plls)
                                    logging.info('\t{} avg PLL: {} ({})'.format(SPLIT_NAMES[i],
                                                                                split_avg_pll,
                                                                                eval_e_t - eval_s_t))

                                    fold_scores[f, i] = split_avg_pll

                            #
                            # writing to file a line for the grid
                            stats = stats_format([n_hidden,
                                                  l_rate,
                                                  batch_size,
                                                  n_iters,
                                                  f,
                                                  fold_scores[f, 0],
                                                  fold_scores[f, 1],
                                                  fold_scores[f, 2]],
                                                 '\t',
                                                 digits=5)
                            out_log.write(stats + '\n')
                            out_log.flush()

                            # eval_s_t = perf_counter()
                            # train_plls = rbm.score_samples(train)
                            # eval_e_t = perf_counter()
                            # train_avg_pll = numpy.mean(train_plls)
                            # logging.info('\tTrain avg PLL: {} ({})'.format(train_avg_pll,
                            #                                                eval_e_t - eval_s_t))

                            # #
Example #7
                    test_avg_ll = numpy.mean(test_a_lls)

                    for i in range(len(fold_splits)):
                        train_score = train_a_lls[i] if train_a_lls else NEG_INF
                        valid_score = valid_a_lls[i] if valid_a_lls else NEG_INF
                        test_score = test_a_lls[i] if test_a_lls else NEG_INF
                        #
                        # writing to file a line for the grid
                        stats = stats_format([
                            g_factor, cluster_penalty, min_inst_slice, alpha,
                            i, fold_params[i]['n_edges'],
                            fold_params[i]['n_levels'],
                            fold_params[i]['n_weights'],
                            fold_params[i]['n_leaves'],
                            fold_params[i]['n_sums'],
                            fold_params[i]['n_prods'],
                            fold_params[i]['n_scopes'], fold_params[i]['time'],
                            train_alpha_times[alpha][i],
                            valid_alpha_times[alpha][i],
                            test_alpha_times[alpha][i], train_score,
                            valid_score, test_score
                        ],
                                             '\t',
                                             digits=5)
                        out_log.write(stats + '\n')
                        out_log.flush()

                    if args.cv is not None:
                        valid_avg_ll = test_avg_ll

                    if valid_avg_ll > best_avg_ll:
                        best_avg_ll = valid_avg_ll