Example #1
def perform_experiment(data_file, output_file, exp):

    if exp.make_predictions:
        X, y, D, Xtest, ytest = gpml.load_mat(data_file)
        prediction_file = os.path.join(
            exp.results_dir,
            os.path.splitext(os.path.split(data_file)[-1])[0] +
            "_predictions.mat")
    else:
        X, y, D = gpml.load_mat(data_file)

    perform_kernel_search(X, y, D, data_file, output_file, exp)
    best_model = parse_results(output_file)

    if exp.make_predictions:
        print '\nMaking predictions\n'
        predictions = jc.make_predictions(X,
                                          y,
                                          Xtest,
                                          ytest,
                                          best_model,
                                          local_computation=True,
                                          max_jobs=exp.max_jobs,
                                          verbose=exp.verbose,
                                          random_seed=exp.random_seed)
        scipy.io.savemat(prediction_file, predictions, appendmat=False)

    os.system('reset')  # Stop terminal from going invisible.
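For context, a minimal sketch of how this entry point might be driven, assuming an experiment object exposing the fields the function reads (make_predictions, results_dir, max_jobs, verbose, random_seed); the paths and values below are illustrative, not from the source:

# Hypothetical driver; only the field names are taken from the code above.
from collections import namedtuple

Experiment = namedtuple('Experiment', ['make_predictions', 'results_dir',
                                       'max_jobs', 'verbose', 'random_seed'])
exp = Experiment(make_predictions=True, results_dir='../results/',
                 max_jobs=500, verbose=True, random_seed=0)
perform_experiment('../data/my_dataset.mat',
                   '../results/my_dataset_result.txt', exp)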
Example #2
def perform_experiment(data_file, output_file, exp):

    if exp.make_predictions:
        X, y, D, Xtest, ytest = gpml.load_mat(data_file, y_dim=1)
        prediction_file = os.path.join(
            exp.results_dir,
            os.path.splitext(os.path.split(data_file)[-1])[0] +
            "_predictions.mat")
    else:
        X, y, D = gpml.load_mat(data_file, y_dim=1)

    perform_kernel_search(X, y, D, data_file, output_file, exp)
    best_scored_kernel = parse_results(output_file)

    if exp.make_predictions:
        predictions = jc.make_predictions(
            X,
            y,
            Xtest,
            ytest,
            best_scored_kernel,
            local_computation=exp.local_computation,
            max_jobs=exp.max_jobs,
            verbose=exp.verbose,
            zero_mean=exp.zero_mean,
            random_seed=exp.random_seed)
        scipy.io.savemat(prediction_file, predictions, appendmat=False)
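Compared with Example #1, this variant loads the data with y_dim=1, takes local_computation from the experiment object instead of hardcoding it to True, and passes a zero_mean flag through to jc.make_predictions.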
Example #3
def make_all_1d_figures(folder, save_folder='../figures/decomposition/', max_level=None, prefix='', rescale=True, data_folder=None):
    """Crawls the results directory, and makes decomposition plots for each file.
    
    prefix is an optional string prepended to the output directory
    """
    #### Quick fix to axis scaling
    #### TODO - Ultimately this and the shunt below should be removed / made elegant
    if rescale:
        data_sets = list(exp.gen_all_datasets("../data/1d_data_rescaled/"))
    else:
        if data_folder is None:
            data_sets = list(exp.gen_all_datasets("../data/1d_data/"))
        else:
            data_sets = list(exp.gen_all_datasets(data_folder))
    for r, file in data_sets:
        results_file = os.path.join(folder, file + "_result.txt")
        # Is the experiment complete?
        if os.path.isfile(results_file):
            # Find best kernel and produce plots
            datafile = os.path.join(r, file + ".mat")
            X, y, D = gpml.load_mat(datafile)
            if rescale:
                # Load unscaled data to remove scaling later
                unscaled_file = os.path.join('../data/1d_data/', re.sub('-s$', '', file) + '.mat')
                data = gpml.load_mat(unscaled_file)
                (X_unscaled, y_unscaled) = (data[0], data[1])
                (X_mean, X_scale) = (X_unscaled.mean(), X_unscaled.std())
                (y_mean, y_scale) = (y_unscaled.mean(), y_unscaled.std())
            else:
                (X_mean, X_scale, y_mean, y_scale) = (0, 1, 0, 1)
                
            # A shunt to deal with a legacy issue.
            if datafile == '../data/1d_data/01-airline-months.mat':
                # Scaling should turn months starting at zero into years starting at 1949
                print "Special rescaling for airline months data"
                X_mean = X_mean + 1949
                X_scale = 1.0/12.0
                                
            best_kernel = exp.parse_results(os.path.join(folder, file + "_result.txt"), max_level=max_level)
            stripped_kernel = fk.strip_masks(best_kernel.k_opt)
            if max_level is not None:
                fig_folder = os.path.join(save_folder, (prefix + file + '_max_level_%d' % max_level))
            else:
                fig_folder = os.path.join(save_folder, (prefix + file))
            if not os.path.exists(fig_folder):
                os.makedirs(fig_folder)
            gpml.plot_decomposition(stripped_kernel, X, y, os.path.join(fig_folder, file), best_kernel.noise, X_mean, X_scale, y_mean, y_scale)
        else:
            print "Cannnot find file %s" % results_file
Example #4
def repeat_predictions(filename):
    """
    A convenience function to re-run the predictions from an experiment
    """ 

    # The experiment file contains a Python expression for the experiment
    # object; eval reconstructs it, so only use this on trusted files.
    expstring = open(filename, 'r').read()
    exp = eval(expstring)
    print experiment_fields_to_str(exp)

    if not exp.make_predictions:
        print 'This experiment does not make predictions'
        return None
    
    data_sets = list(gen_all_datasets(exp.data_dir))

    for r, file in data_sets:
        # Check if this experiment has already been done.
        output_file = os.path.join(exp.results_dir, file + "_result.txt")
        if os.path.isfile(output_file):
            print 'Predictions for %s' % file
            data_file = os.path.join(r, file + ".mat")

            X, y, D, Xtest, ytest = gpml.load_mat(data_file)
            prediction_file = os.path.join(exp.results_dir, os.path.splitext(os.path.split(data_file)[-1])[0] + "_predictions.mat")
            best_model = parse_results(output_file)
            predictions = jc.make_predictions(X, y, Xtest, ytest, best_model, local_computation=True,
                                              max_jobs=exp.max_jobs, verbose=exp.verbose, random_seed=exp.random_seed)
            scipy.io.savemat(prediction_file, predictions, appendmat=False)

            print "Finished file %s" % file
        else:
            print 'Results not found for %s' % file
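A hedged usage sketch: the function expects a file whose contents eval to an experiment object, so the filename below is purely illustrative.

# Hypothetical call; the file must contain a Python expression for an
# experiment object with the fields used above.
repeat_predictions('../experiments/my_experiment.py')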
Example #5
def debug_laplace():
    # Load data set
    X, y, D, Xtest, ytest = gpml.load_mat(
        '../data/kfold_data/r_concrete_500_fold_10_of_10.mat', y_dim=1)
    # Load the suspicious kernel
    sk = fk.repr_string_to_kernel(
        'ScoredKernel(k_opt=ProductKernel([ MaskKernel(ndim=8, active_dimension=0, base_kernel=CubicKernel(offset=1.757755, output_variance=7.084045)), MaskKernel(ndim=8, active_dimension=7, base_kernel=SqExpPeriodicKernel(lengthscale=-2.701080, period=-0.380918, output_variance=-0.071214)) ]), nll=6348.096611, laplace_nle=-184450132.068237, bic_nle=12720.630212, noise=[-1.77276072])'
    )
    # Create some code to evaluate it
    if X.ndim == 1: X = X[:, nax]
    if y.ndim == 1: y = y[:, nax]
    ndata = y.shape[0]

    # Create data file
    data_file = cblparallel.create_temp_file('.mat')
    scipy.io.savemat(data_file, {'X': X, 'y': y})  # Save regression data

    # Move to fear
    cblparallel.copy_to_remote(data_file)
    scripts = [
        gpml.OPTIMIZE_KERNEL_CODE % {
            'datafile':
            data_file.split('/')[-1],
            'writefile':
            '%(output_file)s',  # N.B. cblparallel manages output files
            'gpml_path':
            cblparallel.gpml_path(local_computation=False),
            'kernel_family':
            sk.k_opt.gpml_kernel_expression(),
            'kernel_params':
            '[ %s ]' % ' '.join(str(p) for p in sk.k_opt.param_vector()),
            'noise':
            str(sk.noise),
            'iters':
            str(300)
        }
    ]
    #### Need to be careful with % signs
    #### For the moment, cblparallel expects no single % signs - FIXME
    scripts[0] = re.sub('% ', '%% ', scripts[0])

    # Test tweaks: the substitution below is currently a no-op placeholder
    # (it replaces 'delta = 1e-6' with itself), left in for experimentation.
    scripts[0] = re.sub('delta = 1e-6', 'delta = 1e-6', scripts[0])
    #scripts[0] = re.sub('hyp.lik = [-1.77276072]', 'hyp.lik = [-0.77276072]', scripts[0])

    output_file = cblparallel.run_batch_on_fear(scripts,
                                                language='matlab',
                                                max_jobs=600)[0]

    # Read in results
    output = gpml.read_outputs(output_file)
    result = ScoredKernel.from_matlab_output(output, sk.k_opt.family(), ndata)
    print result
    print output.hessian

    os.remove(output_file)
    # Remove temporary data file (perhaps on the cluster server)
    cblparallel.remove_temp_file(data_file, local_computation=False)
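The re.sub('% ', '%% ', ...) escaping exists because the rendered script goes through %-formatting a second time (note the literal '%(output_file)s' left in place for cblparallel), so MATLAB comment markers must be doubled. A minimal demonstration of the mechanism, with made-up strings:

# A literal '%' in a string that goes through %-formatting must be doubled.
script = "x = 1; %% a MATLAB comment\nsave('%(output_file)s', 'x');"
print script % {'output_file': 'results.mat'}
# x = 1; % a MATLAB comment
# save('results.mat', 'x');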
Example #6
def perform_experiment(data_file, output_file, exp):
    
    if exp.make_predictions:        
        X, y, D, Xtest, ytest = gpml.load_mat(data_file, y_dim=1)
        prediction_file = os.path.join(exp.results_dir, os.path.splitext(os.path.split(data_file)[-1])[0] + "_predictions.mat")
    else:
        X, y, D = gpml.load_mat(data_file, y_dim=1)
        
    perform_kernel_search(X, y, D, data_file, output_file, exp)
    best_scored_kernel = parse_results(output_file)
    
    if exp.make_predictions:
        predictions = jc.make_predictions(X, y, Xtest, ytest, best_scored_kernel, local_computation=exp.local_computation,
                                          max_jobs=exp.max_jobs, verbose=exp.verbose, zero_mean=exp.zero_mean, random_seed=exp.random_seed)
        scipy.io.savemat(prediction_file, predictions, appendmat=False)
        
    os.system('reset')  # Stop terminal from going invisible.
Example #7
def calculate_model_fits(data_file, output_file, exp):
         
    prediction_file = os.path.join(exp.results_dir, os.path.splitext(os.path.split(data_file)[-1])[0] + "_predictions.mat")
    X, y, D = gpml.load_mat(data_file, y_dim=1)
    Xtest = X
    ytest = y
        
    best_scored_kernel = parse_results(output_file)
    
    predictions = jc.make_predictions(X, y, Xtest, ytest, best_scored_kernel, local_computation=exp.local_computation,
                                      max_jobs=exp.max_jobs, verbose=exp.verbose, zero_mean=exp.zero_mean, random_seed=exp.random_seed)
    scipy.io.savemat(prediction_file, predictions, appendmat=False)
        
    os.system('reset')  # Stop terminal from going invisible.
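Note that this function sets Xtest = X and ytest = y, so the saved "predictions" measure fit on the training data rather than held-out performance; that is what distinguishes it from the perform_experiment variants above.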
Example #8
def debug_laplace():
    # Load data set
    X, y, D, Xtest, ytest = gpml.load_mat('../data/kfold_data/r_concrete_500_fold_10_of_10.mat', y_dim=1)
    # Load the suspicious kernel
    sk = fk.repr_string_to_kernel('ScoredKernel(k_opt=ProductKernel([ MaskKernel(ndim=8, active_dimension=0, base_kernel=CubicKernel(offset=1.757755, output_variance=7.084045)), MaskKernel(ndim=8, active_dimension=7, base_kernel=SqExpPeriodicKernel(lengthscale=-2.701080, period=-0.380918, output_variance=-0.071214)) ]), nll=6348.096611, laplace_nle=-184450132.068237, bic_nle=12720.630212, noise=[-1.77276072])')
    # Create some code to evaluate it
    if X.ndim == 1: X = X[:, nax]
    if y.ndim == 1: y = y[:, nax]
    ndata = y.shape[0]  
    
    
    # Create data file
    data_file = cblparallel.create_temp_file('.mat')
    scipy.io.savemat(data_file, {'X': X, 'y': y}) # Save regression data
    
    # Move to fear
    cblparallel.copy_to_remote(data_file)
    scripts = [gpml.OPTIMIZE_KERNEL_CODE % {'datafile': data_file.split('/')[-1],
                                              'writefile': '%(output_file)s', # N.B. cblparallel manages output files
                                              'gpml_path': cblparallel.gpml_path(local_computation=False),
                                              'kernel_family': sk.k_opt.gpml_kernel_expression(),
                                              'kernel_params': '[ %s ]' % ' '.join(str(p) for p in sk.k_opt.param_vector()),
                                              'noise': str(sk.noise),
                                              'iters': str(300)}]
    #### Need to be careful with % signs
    #### For the moment, cblparallel expects no single % signs - FIXME
    scripts[0] = re.sub('% ', '%% ', scripts[0])
    
    # Test tweaks: the substitution below is currently a no-op placeholder
    # (it replaces 'delta = 1e-6' with itself), left in for experimentation.
    scripts[0] = re.sub('delta = 1e-6', 'delta = 1e-6', scripts[0])
    #scripts[0] = re.sub('hyp.lik = [-1.77276072]', 'hyp.lik = [-0.77276072]', scripts[0])
    
    output_file = cblparallel.run_batch_on_fear(scripts, language='matlab', max_jobs=600)[0]  
    
    # Read in results
    output = gpml.read_outputs(output_file)
    result = ScoredKernel.from_matlab_output(output, sk.k_opt.family(), ndata)
    print result
    print output.hessian
    
    os.remove(output_file)
    # Remove temporary data file (perhaps on the cluster server)
    cblparallel.remove_temp_file(data_file, local_computation=False)
Example #9
def repeat_predictions(filename):
    """
    A convenience function to re-run the predictions from an experiment
    """

    expstring = open(filename, 'r').read()
    exp = eval(expstring)
    print experiment_fields_to_str(exp)

    if not exp.make_predictions:
        print 'This experiment does not make predictions'
        return None

    data_sets = list(gen_all_datasets(exp.data_dir))

    for r, file in data_sets:
        # Check if this experiment has already been done.
        output_file = os.path.join(exp.results_dir, file + "_result.txt")
        if os.path.isfile(output_file):
            print 'Predictions for %s' % file
            data_file = os.path.join(r, file + ".mat")

            X, y, D, Xtest, ytest = gpml.load_mat(data_file)
            prediction_file = os.path.join(
                exp.results_dir,
                os.path.splitext(os.path.split(data_file)[-1])[0] +
                "_predictions.mat")
            best_model = parse_results(output_file)
            predictions = jc.make_predictions(X,
                                              y,
                                              Xtest,
                                              ytest,
                                              best_model,
                                              local_computation=True,
                                              max_jobs=exp.max_jobs,
                                              verbose=exp.verbose,
                                              random_seed=exp.random_seed)
            scipy.io.savemat(prediction_file, predictions, appendmat=False)

            print "Finished file %s" % file
        else:
            print 'Results not found for %s' % file
Example #10
def perform_experiment(data_file, output_file, exp):
    
    if exp.make_predictions:        
        X, y, D, Xtest, ytest = gpml.load_mat(data_file)
        prediction_file = os.path.join(exp.results_dir, os.path.splitext(os.path.split(data_file)[-1])[0] + "_predictions.mat")
    else:
        X, y, D = gpml.my_load_mat(data_file)

    # Time the kernel search
    import time
    start_time = time.time()
    perform_kernel_search(X, y, D, data_file, output_file, exp)
    elapsed_time = time.time() - start_time
    print('Elapsed time: {}'.format(elapsed_time))

    best_model = parse_results(output_file)
    
    if exp.make_predictions:
        print '\nMaking predictions\n'
        predictions = jc.make_predictions(X, y, Xtest, ytest, best_model, local_computation=True,
                                          max_jobs=exp.max_jobs, verbose=exp.verbose, random_seed=exp.random_seed)
        scipy.io.savemat(prediction_file, predictions, appendmat=False)
        
    os.system('reset')  # Stop terminal from going invisible.   
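The inline timing above could also be written as a small reusable context manager; a sketch, not from the source:

import contextlib
import time

@contextlib.contextmanager
def timed(label):
    # Print how long the enclosed block took.
    start = time.time()
    yield
    print('{0}: {1:.2f}s'.format(label, time.time() - start))

# Usage:
# with timed('kernel search'):
#     perform_kernel_search(X, y, D, data_file, output_file, exp)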
Example #11
def make_all_1d_figures(folders, save_folder='../figures/decomposition/', prefix='', rescale=False, data_folder=None, skip_kernel_evaluation=False, unit='year', all_depths=False):
    """Crawls the results directory, and makes decomposition plots for each file.
    
    prefix is an optional string prepended to the output directory
    """    
    
    if not isinstance(folders, list):
        folders = [folders] # Backward compatibility with specifying one folder
    #### Quick fix to axis scaling
    #### TODO - Ultimately this and the shunt below should be removed / made elegant
    if rescale:
        data_sets = list(exp.gen_all_datasets("../data/1d_data_rescaled/"))
    else:
        if data_folder is None:
            data_sets = list(exp.gen_all_datasets("../data/1d_data/"))
        else:
            data_sets = list(exp.gen_all_datasets(data_folder))
    for r, file in data_sets:
        results_files = []
        for folder in folders:
            results_file = os.path.join(folder, file + "_result.txt")
            if os.path.isfile(results_file):
                results_files.append(results_file)
        # Is the experiment complete?
        if len(results_files) > 0:
            # Find best kernel and produce plots
            datafile = os.path.join(r, file + ".mat")
            data = gpml.load_mat(datafile)
            X = data[0]
            y = data[1]
            D = data[2]
            assert D == 1
            if rescale:
                # Load unscaled data to remove scaling later
                unscaled_file = os.path.join('../data/1d_data/', re.sub('-s$', '', file) + '.mat')
                data = gpml.load_mat(unscaled_file)
                (X_unscaled, y_unscaled) = (data[0], data[1])
                (X_mean, X_scale) = (X_unscaled.mean(), X_unscaled.std())
                (y_mean, y_scale) = (y_unscaled.mean(), y_unscaled.std())
            else:
                (X_mean, X_scale, y_mean, y_scale) = (0, 1, 0, 1)
                                
            if all_depths:
                # A quick version for now TODO - write correct code
                models = [exp.parse_results(results_files, max_level=depth) for depth in range(10)]
                suffices = ['-depth-%d' % (depth+1) for depth in range(len(models))]
            else:
                models = [exp.parse_results(results_files)]
                suffices = ['']

            for (model, suffix) in zip(models, suffices):
                model = model.simplified().canonical()
                kernel_components = model.kernel.break_into_summands()
                kernel_components = ff.SumKernel(kernel_components).simplified().canonical().operands
                print model.pretty_print()
                fig_folder = os.path.join(save_folder, (prefix + file + suffix))
                if not os.path.exists(fig_folder):
                    os.makedirs(fig_folder)
                # First ask GPML to order the components
                print 'Determining order of components'
                (component_order, mae_data) = gpml.order_by_mae(model, kernel_components, X, y, D, os.path.join(fig_folder, file + suffix), skip_kernel_evaluation=skip_kernel_evaluation)
                print 'Plotting decomposition and computing basic stats'
                component_data = gpml.component_stats(model, kernel_components, X, y, D, os.path.join(fig_folder, file + suffix), component_order, skip_kernel_evaluation=skip_kernel_evaluation)
                print 'Computing model checking stats'
                checking_stats = gpml.checking_stats(model, kernel_components, X, y, D, os.path.join(fig_folder, file + suffix), component_order, make_plots=True, skip_kernel_evaluation=skip_kernel_evaluation)
                # Now that the kernels have been evaluated we can translate the relevant ones
                evaluation_data = mae_data
                evaluation_data.update(component_data)
                evaluation_data.update(checking_stats)
                evaluation_data['vars'] = evaluation_data['vars'].ravel()
                evaluation_data['cum_vars'] = evaluation_data['cum_vars'].ravel()
                evaluation_data['cum_resid_vars'] = evaluation_data['cum_resid_vars'].ravel()
                evaluation_data['MAEs'] = evaluation_data['MAEs'].ravel()
                evaluation_data['MAE_reductions'] = evaluation_data['MAE_reductions'].ravel()
                evaluation_data['monotonic'] = evaluation_data['monotonic'].ravel()
                evaluation_data['acf_min_p'] = evaluation_data['acf_min_p'].ravel()
                evaluation_data['acf_min_loc_p'] = evaluation_data['acf_min_loc_p'].ravel()
                evaluation_data['pxx_max_p'] = evaluation_data['pxx_max_p'].ravel()
                evaluation_data['pxx_max_loc_p'] = evaluation_data['pxx_max_loc_p'].ravel()
                evaluation_data['qq_d_max_p'] = evaluation_data['qq_d_max_p'].ravel()
                evaluation_data['qq_d_min_p'] = evaluation_data['qq_d_min_p'].ravel()
                i = 1
                short_descriptions = []
                while os.path.isfile(os.path.join(fig_folder, '%s_%d.fig' % (file + suffix, i))):
                    # Describe this component
                    (summary, sentences, extrap_sentences) = translation.translate_additive_component(kernel_components[component_order[i-1]], X, evaluation_data['monotonic'][i-1], evaluation_data['gradients'][i-1], unit)
                    short_descriptions.append(summary)
                    paragraph = '.\n'.join(sentences) + '.'
                    extrap_paragraph = '.\n'.join(extrap_sentences) + '.'
                    with open(os.path.join(fig_folder, '%s_%d_description.tex' % (file + suffix, i)), 'w') as description_file:
                        description_file.write(paragraph)
                    with open(os.path.join(fig_folder, '%s_%d_extrap_description.tex' % (file + suffix, i)), 'w') as description_file:
                        description_file.write(extrap_paragraph)
                    with open(os.path.join(fig_folder, '%s_%d_short_description.tex' % (file + suffix, i)), 'w') as description_file:
                        description_file.write(summary + '.')
                    i += 1
                # Produce the summary LaTeX document
                print 'Producing LaTeX document'
                latex_summary = translation.produce_summary_document(file + suffix, i-1, evaluation_data, short_descriptions)
                with open(os.path.join(save_folder, '%s.tex' % (file + suffix)), 'w') as latex_file:
                    latex_file.write(latex_summary)
                print 'Saving to ' + (os.path.join(save_folder, '%s.tex' % (file + suffix)))
        else:
            print "Cannnot find results for %s" % file
Example #12
def make_all_1d_figures(folders,
                        save_folder='../figures/decomposition/',
                        prefix='',
                        rescale=False,
                        data_folder=None,
                        skip_kernel_evaluation=False,
                        unit='year',
                        all_depths=False):
    """Crawls the results directory, and makes decomposition plots for each file.
    
    prefix is an optional string prepended to the output directory
    """

    if not isinstance(folders, list):
        folders = [folders]  # Backward compatibility with specifying one folder
    #### Quick fix to axis scaling
    #### TODO - Ultimately this and the shunt below should be removed / made elegant
    if rescale:
        data_sets = list(exp.gen_all_datasets("../data/1d_data_rescaled/"))
    else:
        if data_folder is None:
            data_sets = list(exp.gen_all_datasets("../data/1d_data/"))
        else:
            data_sets = list(exp.gen_all_datasets(data_folder))
    for r, file in data_sets:
        results_files = []
        for folder in folders:
            results_file = os.path.join(folder, file + "_result.txt")
            if os.path.isfile(results_file):
                results_files.append(results_file)
        # Is the experiment complete?
        if len(results_files) > 0:
            # Find best kernel and produce plots
            datafile = os.path.join(r, file + ".mat")
            data = gpml.my_load_mat(datafile)
            X = data[0]
            y = data[1]
            M = y.shape[1]  # Number of output columns in the multi-output file
            D = data[2]
            iiii = 1  # Index of the output column to plot
            y = y[:, iiii]
            assert D == 1
            if rescale:
                # Load unscaled data to remove scaling later
                unscaled_file = os.path.join('../data/1d_data/',
                                             re.sub('-s$', '', file) + '.mat')
                data = gpml.load_mat(unscaled_file)
                (X_unscaled, y_unscaled) = (data[0], data[1])
                (X_mean, X_scale) = (X_unscaled.mean(), X_unscaled.std())
                (y_mean, y_scale) = (y_unscaled.mean(), y_unscaled.std())
            else:
                (X_mean, X_scale, y_mean, y_scale) = (0, 1, 0, 1)

            if all_depths:
                # A quick version for now TODO - write correct code
                models = [
                    exp.parse_results(results_files, max_level=depth)[0]
                    for depth in range(10)
                ]
                suffices = [
                    '-depth-%d' % (depth + 1) for depth in range(len(models))
                ]
            else:
                models = [exp.parse_results(results_files)[0]]
                try:
                    suffices = ['-' + str(data[5][iiii]).replace(" ", "")]
                    # Uncomment instead when testing a named dataset (house, stock, and so on):
                    #suffices = ['-'+str(data[5][iiii][0][0]).replace(" ","")]
                except:
                    suffices = ['-' + str(iiii)]
                #suffices = ['-'+str(iiii)]
                best_depth = exp.parse_results(results_files)[1]
                params_filename = '/home/heechan/gpss-research-srkl' + results_files[
                    0][2:] + 'lvl_' + str(best_depth) + '_0.mat1.mat'
                scale_params = scipy.io.loadmat(params_filename)['scale']
                scl1 = scale_params[0][iiii][0][0]
                scl2 = scale_params[0][iiii][1][0]

            for (model, suffix) in zip(models, suffices):
                model = model.simplified().canonical()
                model.kernel = model.kernel * ff.ConstKernel(
                    sf=scl2) + ff.ConstKernel(sf=scl1)
                kernel_components = model.kernel.break_into_summands()
                kernel_components = ff.SumKernel(
                    kernel_components).simplified().canonical().operands
                print model.pretty_print()
                fig_folder = os.path.join(save_folder,
                                          (prefix + file + suffix))
                if not os.path.exists(fig_folder):
                    os.makedirs(fig_folder)
                # First ask GPML to order the components
                print 'Determining order of components'
                (component_order, mae_data) = gpml.order_by_mae(
                    model,
                    kernel_components,
                    X,
                    y,
                    D,
                    os.path.join(fig_folder, file + suffix),
                    skip_kernel_evaluation=skip_kernel_evaluation)
                print 'Plotting decomposition and computing basic stats'
                component_data = gpml.component_stats(
                    model,
                    kernel_components,
                    X,
                    y,
                    D,
                    os.path.join(fig_folder, file + suffix),
                    component_order,
                    skip_kernel_evaluation=skip_kernel_evaluation)
                print 'Computing model checking stats'
                checking_stats = gpml.checking_stats(
                    model,
                    kernel_components,
                    X,
                    y,
                    D,
                    os.path.join(fig_folder, file + suffix),
                    component_order,
                    make_plots=True,
                    skip_kernel_evaluation=skip_kernel_evaluation)
                # Now that the kernels have been evaluated we can translate the relevant ones
                evaluation_data = mae_data
                evaluation_data.update(component_data)
                evaluation_data.update(checking_stats)
                evaluation_data['vars'] = evaluation_data['vars'].ravel()
                evaluation_data['cum_vars'] = evaluation_data[
                    'cum_vars'].ravel()
                evaluation_data['cum_resid_vars'] = evaluation_data[
                    'cum_resid_vars'].ravel()
                evaluation_data['MAEs'] = evaluation_data['MAEs'].ravel()
                evaluation_data['MAE_reductions'] = evaluation_data[
                    'MAE_reductions'].ravel()
                evaluation_data['monotonic'] = evaluation_data[
                    'monotonic'].ravel()
                evaluation_data['acf_min_p'] = evaluation_data[
                    'acf_min_p'].ravel()
                evaluation_data['acf_min_loc_p'] = evaluation_data[
                    'acf_min_loc_p'].ravel()
                evaluation_data['pxx_max_p'] = evaluation_data[
                    'pxx_max_p'].ravel()
                evaluation_data['pxx_max_loc_p'] = evaluation_data[
                    'pxx_max_loc_p'].ravel()
                evaluation_data['qq_d_max_p'] = evaluation_data[
                    'qq_d_max_p'].ravel()
                evaluation_data['qq_d_min_p'] = evaluation_data[
                    'qq_d_min_p'].ravel()
                i = 1
                short_descriptions = []
                while os.path.isfile(
                        os.path.join(fig_folder,
                                     '%s_%d.fig' % (file + suffix, i))):
                    # Describe this component
                    (summary, sentences, extrap_sentences
                     ) = translation.translate_additive_component(
                         kernel_components[component_order[i - 1]], X,
                         evaluation_data['monotonic'][i - 1],
                         evaluation_data['gradients'][i - 1], unit)
                    short_descriptions.append(summary)
                    paragraph = '.\n'.join(sentences) + '.'
                    extrap_paragraph = '.\n'.join(extrap_sentences) + '.'
                    with open(
                            os.path.join(
                                fig_folder,
                                '%s_%d_description.tex' % (file + suffix, i)),
                            'w') as description_file:
                        description_file.write(paragraph)
                    with open(
                            os.path.join(
                                fig_folder, '%s_%d_extrap_description.tex' %
                                (file + suffix, i)), 'w') as description_file:
                        description_file.write(extrap_paragraph)
                    with open(
                            os.path.join(
                                fig_folder, '%s_%d_short_description.tex' %
                                (file + suffix, i)), 'w') as description_file:
                        description_file.write(summary + '.')
                    i += 1
                # Produce the summary LaTeX document
                print 'Producing LaTeX document'
                latex_summary = translation.produce_summary_document(
                    file + suffix, i - 1, evaluation_data, short_descriptions)
                with open(
                        os.path.join(save_folder, '%s.tex' % (file + suffix)),
                        'w') as latex_file:
                    latex_file.write(latex_summary)
                print 'Saving to ' + (os.path.join(save_folder, '%s.tex' %
                                                   (file + suffix)))
        else:
            print "Cannnot find results for %s" % file
Example #13
def make_all_1d_figures(folder,
                        save_folder='../figures/decomposition/',
                        max_level=None,
                        prefix='',
                        rescale=True,
                        data_folder=None):
    """Crawls the results directory, and makes decomposition plots for each file.
    
    prefix is an optional string prepended to the output directory
    """
    #### Quick fix to axis scaling
    #### TODO - Ultimately this and the shunt below should be removed / made elegant
    if rescale:
        data_sets = list(exp.gen_all_datasets("../data/1d_data_rescaled/"))
    else:
        if data_folder is None:
            data_sets = list(exp.gen_all_datasets("../data/1d_data/"))
        else:
            data_sets = list(exp.gen_all_datasets(data_folder))
    for r, file in data_sets:
        results_file = os.path.join(folder, file + "_result.txt")
        # Is the experiment complete?
        if os.path.isfile(results_file):
            # Find best kernel and produce plots
            datafile = os.path.join(r, file + ".mat")
            X, y, D = gpml.load_mat(datafile)
            if rescale:
                # Load unscaled data to remove scaling later
                unscaled_file = os.path.join('../data/1d_data/',
                                             re.sub('-s$', '', file) + '.mat')
                data = gpml.load_mat(unscaled_file)
                (X_unscaled, y_unscaled) = (data[0], data[1])
                (X_mean, X_scale) = (X_unscaled.mean(), X_unscaled.std())
                (y_mean, y_scale) = (y_unscaled.mean(), y_unscaled.std())
            else:
                (X_mean, X_scale, y_mean, y_scale) = (0, 1, 0, 1)

            # A shunt to deal with a legacy issue.
            if datafile == '../data/1d_data/01-airline-months.mat':
                # Scaling should turn months starting at zero into years starting at 1949
                print "Special rescaling for airline months data"
                X_mean = X_mean + 1949
                X_scale = 1.0 / 12.0

            best_kernel = exp.parse_results(os.path.join(
                folder, file + "_result.txt"),
                                            max_level=max_level)
            stripped_kernel = fk.strip_masks(best_kernel.k_opt)
            if max_level is not None:
                fig_folder = os.path.join(
                    save_folder, (prefix + file + '_max_level_%d' % max_level))
            else:
                fig_folder = os.path.join(save_folder, (prefix + file))
            if not os.path.exists(fig_folder):
                os.makedirs(fig_folder)
            gpml.plot_decomposition(stripped_kernel, X, y,
                                    os.path.join(fig_folder,
                                                 file), best_kernel.noise,
                                    X_mean, X_scale, y_mean, y_scale)
        else:
            print "Cannnot find file %s" % results_file