def make_predictions(X, y, Xtest, ytest, best_scored_kernel, local_computation=False, max_jobs=500, verbose=True, zero_mean=False, random_seed=0):
    '''
    Evaluates a kernel on held out data by running a generated MATLAB script
    Input:
     - X                  - A matrix (data_points x dimensions) of input locations
     - y                  - A matrix (data_points x 1) of output values
     - Xtest              - Held out X data
     - ytest              - Held out y data
     - best_scored_kernel - A Scored Kernel object to be evaluated on the held out data
                            (its k_opt kernel and its noise attribute are used)
     - local_computation  - If True run with cblparallel.run_batch_locally, otherwise copy
                            the data file to the remote machine and use run_batch_on_fear
     - max_jobs           - Passed to run_batch_on_fear (not used for local runs)
     - verbose            - Print progress and pass the flag on to cblparallel
     - zero_mean          - Use gpml.PREDICT_AND_SAVE_CODE_ZERO_MEAN instead of
                            gpml.PREDICT_AND_SAVE_CODE as the script template
     - random_seed        - Substituted into the script template as 'seed'
    Return:
     - A dictionary of results from the MATLAB script containing:
       - loglik - an array of log likelihoods of test data
       - predictions - an array of mean predictions for the held out data
       - actuals - ytest
       - model - I'm not sure FIXME
       - timestamp - A time stamp of some sort
    '''
    # Make data into matrices in case they're unidimensional.
    if X.ndim == 1:
        X = X[:, nax]
    if y.ndim == 1:
        y = y[:, nax]
    # Save temporary data file in standard temporary directory
    data_file = cblparallel.create_temp_file('.mat')
    scipy.io.savemat(data_file, {'X': X, 'y': y, 'Xtest': Xtest, 'ytest': ytest})
    # Copy onto cluster server if necessary
    if not local_computation:
        if verbose:
            print('Moving data file to fear')
        cblparallel.copy_to_remote(data_file)
    # The data file now exists wherever the final clean-up expects it, so make
    # sure it is removed even if script generation or the job itself fails.
    try:
        # Create prediction code
        kernel = best_scored_kernel.k_opt
        parameters = {'datafile': data_file.split('/')[-1],
                      'writefile': '%(output_file)s',  # N.B. filled in later by cblparallel
                      'gpml_path': cblparallel.gpml_path(local_computation),
                      'kernel_family': kernel.gpml_kernel_expression(),
                      'kernel_params': '[ %s ]' % ' '.join(str(p) for p in kernel.param_vector()),
                      'noise': str(best_scored_kernel.noise),
                      'iters': str(30),
                      'seed': str(random_seed)}
        if zero_mean:
            template = gpml.PREDICT_AND_SAVE_CODE_ZERO_MEAN
        else:
            template = gpml.PREDICT_AND_SAVE_CODE
        code = template % parameters
        code = re.sub('% ', '%% ', code)  # HACK - cblparallel currently does not like % signs
        # Evaluate code - potentially on cluster
        if local_computation:
            temp_results_file = cblparallel.run_batch_locally([code], language='matlab', max_cpu=1.1, max_mem=1.1, verbose=verbose)[0]
        else:
            temp_results_file = cblparallel.run_batch_on_fear([code], language='matlab', max_jobs=max_jobs, verbose=verbose)[0]
        try:
            results = scipy.io.loadmat(temp_results_file)
        finally:
            # Remove the results file (perhaps on the cluster server) even if it could not be read
            cblparallel.remove_temp_file(temp_results_file, local_computation)
    finally:
        cblparallel.remove_temp_file(data_file, local_computation)
    # Return dictionary of MATLAB results
    return results
def make_predictions(
    X, y, Xtest, ytest, model, local_computation=False, max_jobs=500, verbose=True, random_seed=0, no_noise=False
):
    """
    Evaluates a model on held out data by running a generated MATLAB script
    Input:
     - X                 - Input locations; a 1-d array is turned into a single column
     - y                 - Output values; a 1-d array is turned into a single column
     - Xtest             - Held out X data (saved as given)
     - ytest             - Held out y data (saved as given)
     - model             - Object with mean, kernel and likelihood attributes, each providing
                           get_gpml_expression(dimensions=...) and param_vector; the
                           likelihood also provides gpml_inference_method
     - local_computation - If True run with cblparallel.run_batch_locally, otherwise copy
                           the data file to the remote machine and use run_batch_on_fear
     - max_jobs          - Passed to run_batch_on_fear (not used for local runs)
     - verbose           - Print progress and pass the flag on to cblparallel
     - random_seed       - Substituted into the script template as 'seed'
     - no_noise          - Accepted but not used by this function
    Return:
     - The dictionary that scipy.io.loadmat reads from the script's output file
    """
    # Make data into matrices in case they're unidimensional.
    if X.ndim == 1:
        X = X[:, nax]
    if y.ndim == 1:
        y = y[:, nax]
    # Save temporary data file in standard temporary directory
    data_file = cblparallel.create_temp_file(".mat")
    scipy.io.savemat(data_file, {"X": X, "y": y, "Xtest": Xtest, "ytest": ytest})
    # Copy onto cluster server if necessary
    if not local_computation:
        if verbose:
            print("Moving data file to fear")
        cblparallel.copy_to_remote(data_file)
    # The data file now exists wherever the final clean-up expects it, so make
    # sure it is removed even if script generation or the job itself fails.
    try:
        # Create prediction code
        dimensions = X.shape[1]
        parameters = {
            "datafile": data_file.split("/")[-1],
            "writefile": "%(output_file)s",  # N.B. filled in later by cblparallel
            "gpml_path": cblparallel.gpml_path(local_computation),
            "mean_syntax": model.mean.get_gpml_expression(dimensions=dimensions),
            "mean_params": "[ %s ]" % " ".join(str(p) for p in model.mean.param_vector),
            "kernel_syntax": model.kernel.get_gpml_expression(dimensions=dimensions),
            "kernel_params": "[ %s ]" % " ".join(str(p) for p in model.kernel.param_vector),
            "lik_syntax": model.likelihood.get_gpml_expression(dimensions=dimensions),
            "lik_params": "[ %s ]" % " ".join(str(p) for p in model.likelihood.param_vector),
            "inference": model.likelihood.gpml_inference_method,
            "iters": str(30),
            "seed": str(random_seed),
        }
        code = gpml.PREDICT_AND_SAVE_CODE % parameters
        code = re.sub("% ", "%% ", code)  # HACK - cblparallel currently does not like % signs
        # Evaluate code - potentially on cluster
        if local_computation:
            temp_results_file = cblparallel.run_batch_locally(
                [code], language="matlab", max_cpu=1.1, max_mem=1.1, verbose=verbose
            )[0]
        else:
            temp_results_file = cblparallel.run_batch_on_fear(
                [code], language="matlab", max_jobs=max_jobs, verbose=verbose
            )[0]
        try:
            results = scipy.io.loadmat(temp_results_file)
        finally:
            # Remove the results file (perhaps on the cluster server) even if it could not be read
            cblparallel.remove_temp_file(temp_results_file, local_computation)
    finally:
        cblparallel.remove_temp_file(data_file, local_computation)
    # Return dictionary of MATLAB results
    return results
def local_matlab_test(n=10):
    '''
    Runs n copies of the matlab_code script locally and averages their outputs
    Input:
     - n - Number of copies of the script to run
    Return:
     - np.mean over all values read from all output files (no axis is given,
       so the result is a single number)
    '''
    # Prepare code (matlab_code is defined outside this function)
    scripts = [matlab_code] * n
    # Run batch in parallel
    output_files = cblparallel.run_batch_locally(scripts, language='matlab')
    # Now do something with the output
    estimators = []

    for output_file in output_files:
        try:
            # genfromtxt opens the path itself, so no separate file handle is needed
            estimator = np.genfromtxt(output_file, delimiter=',')
        finally:
            # Remove the output file even if it could not be parsed
            os.remove(output_file)
        estimators.append(estimator)

    ens_pred = np.mean(estimators)
    return ens_pred
def covariance_distance(kernels, X, local_computation=True, verbose=True):
    '''
    Evaluate a distance matrix of kernels, in terms of their covariance matrix evaluated on training inputs
    Input:
     - kernels           - A list of fk.ScoredKernel (must not be empty; the k_opt kernel of each is used)
     - X                 - A matrix (data_points x dimensions) of input locations
     - local_computation - Boolean indicating if computation should be performed on cluster or on local machine
     - verbose           - Print progress and pass the flag on to cblparallel
    Return:
     - A matrix of similarities between the input kernels
       (the 'sim_matrix' entry of the MATLAB output file)
    '''
    assert(len(kernels) > 0) #### FIXME - This sort of check should happen earlier
    # Make data into matrices in case they're unidimensional.
    if X.ndim == 1:
        X = X[:, nax]
    # Save temporary data file in standard temporary directory
    data_file = cblparallel.create_temp_file('.mat')
    scipy.io.savemat(data_file, {'X': X})
    # Copy onto cluster server if necessary
    if not local_computation:
        if verbose:
            print('Moving data file to fear')
        cblparallel.copy_to_remote(data_file)
    # The data file now exists wherever the final clean-up expects it, so make
    # sure it is removed even if script generation or the job itself fails.
    try:
        # Construct testing code: header, one section per kernel, footer
        pieces = [gpml.DISTANCE_CODE_HEADER % {'datafile': data_file.split('/')[-1],
                                               'gpml_path': cblparallel.gpml_path(local_computation)}]
        for (i, kernel) in enumerate([k.k_opt for k in kernels]):
            pieces.append(gpml.DISTANCE_CODE_COV % {'iter': i + 1,  # sections are numbered from 1
                                                    'kernel_family': kernel.gpml_kernel_expression(),
                                                    'kernel_params': '[ %s ]' % ' '.join(str(p) for p in kernel.param_vector())})
        pieces.append(gpml.DISTANCE_CODE_FOOTER_HIGH_MEM % {'writefile': '%(output_file)s'})  # N.B. cblparallel manages output files
        code = ''.join(pieces)
        code = re.sub('% ', '%% ', code)  # HACK - cblparallel not fond of % signs at the moment
        # Run code - either locally or on cluster - returning location of output file
        if local_computation:
            output_file = cblparallel.run_batch_locally([code], language='matlab', max_cpu=1.1, max_mem=1.1, job_check_sleep=30, verbose=verbose, single_thread=False)[0]
        else:
            output_file = cblparallel.run_batch_on_fear([code], language='matlab', max_jobs=500, verbose=verbose)[0]
        try:
            # Read in results from experiment
            gpml_result = scipy.io.loadmat(output_file)
            distance = gpml_result['sim_matrix']
        finally:
            # Remove the output file (perhaps on the cluster server) even if it could not be read
            cblparallel.remove_temp_file(output_file, local_computation)
    finally:
        cblparallel.remove_temp_file(data_file, local_computation)
    # Return distance matrix
    return distance
def local_forest_test(n=10,n_trees=10):
    '''
    Runs n copies of the reduced_tree_code script locally on Friedman #1 data
    and scores the averaged predictions
    Input:
     - n       - Number of scripts to run; script i gets random_state i * n_trees
     - n_trees - Substituted into each script as 'n_trees'
    Return:
     - RMSE_y(y_test, ens_pred), where ens_pred is the element-wise mean of the
       predictions read from the scripts' output files
    '''
    # Data - the first 200 samples are used for training, the remaining 1000 for testing
    X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
    X_train, X_test = X[:200], X[200:]
    y_train, y_test = y[:200], y[200:]
    # Write data file locally
    data_file = mkstemp_safe(cblparallel.config.HOME_TEMP_PATH, '.p')
    # NOTE(review): text mode is kept so the file stays consistent with however
    # reduced_tree_code reads it back - confirm before switching to binary mode
    with open(data_file, 'w') as f:
        pickle.dump((X_train, y_train, X_test), f)
    try:
        # Prepare code; output_file / flag_file placeholders are left for cblparallel to fill in
        scripts = [reduced_tree_code % {'data_file' : data_file,
                                'n_trees' : n_trees,
                                'random_state' : i * n_trees,
                                'output_file' : '%(output_file)s',
                                'flag_file' : '%(flag_file)s'} for i in range(n)]
        # Run batch in parallel
        output_files = cblparallel.run_batch_locally(scripts)
    finally:
        # Kill local data file, even if the batch run failed
        os.remove(data_file)

    # Now do something with the output
    predictions = []

    for output_file in output_files:
        try:
            # genfromtxt opens the path itself, so no separate file handle is needed
            prediction = np.genfromtxt(output_file, delimiter=',')
        finally:
            # Remove the output file even if it could not be parsed
            os.remove(output_file)
        predictions.append(prediction)

    ens_pred = np.mean(predictions, axis=0)
    return RMSE_y(y_test, ens_pred)
Exemple #6
0
def make_predictions(X, y, Xtest, ytest, model, local_computation=False, max_jobs=500, verbose=True, random_seed=0, no_noise=False):
    '''
    Evaluates a model on held out data by running a generated MATLAB script
    Input:
     - X                 - Input locations; a 1-d array is turned into a single column
     - y                 - Output values; a 1-d array is turned into a single column
     - Xtest             - Held out X data (saved as given)
     - ytest             - Held out y data (saved as given)
     - model             - Object with mean, kernel and likelihood attributes, each providing
                           get_gpml_expression(dimensions=...) and param_vector; the
                           likelihood also provides gpml_inference_method
     - local_computation - If True run with cblparallel.run_batch_locally, otherwise copy
                           the data file to the remote machine and use run_batch_on_fear
     - max_jobs          - Passed to run_batch_on_fear (not used for local runs)
     - verbose           - Print progress and pass the flag on to cblparallel
     - random_seed       - Substituted into the script template as 'seed'
     - no_noise          - Accepted but not used by this function
    Return:
     - The dictionary that scipy.io.loadmat reads from the script's output file
    '''
    # Make data into matrices in case they're unidimensional.
    if X.ndim == 1:
        X = X[:, nax]
    if y.ndim == 1:
        y = y[:, nax]
    # Save temporary data file in standard temporary directory
    data_file = cblparallel.create_temp_file('.mat')
    scipy.io.savemat(data_file, {'X': X, 'y': y, 'Xtest': Xtest, 'ytest': ytest})
    # Copy onto cluster server if necessary
    if not local_computation:
        if verbose:
            print('Moving data file to fear')
        cblparallel.copy_to_remote(data_file)
    # The data file now exists wherever the final clean-up expects it, so make
    # sure it is removed even if script generation or the job itself fails.
    try:
        # Create prediction code
        dimensions = X.shape[1]
        parameters = {'datafile': data_file.split('/')[-1],
                      'writefile': '%(output_file)s',  # N.B. filled in later by cblparallel
                      'gpml_path': cblparallel.gpml_path(local_computation),
                      'mean_syntax': model.mean.get_gpml_expression(dimensions=dimensions),
                      'mean_params': '[ %s ]' % ' '.join(str(p) for p in model.mean.param_vector),
                      'kernel_syntax': model.kernel.get_gpml_expression(dimensions=dimensions),
                      'kernel_params': '[ %s ]' % ' '.join(str(p) for p in model.kernel.param_vector),
                      'lik_syntax': model.likelihood.get_gpml_expression(dimensions=dimensions),
                      'lik_params': '[ %s ]' % ' '.join(str(p) for p in model.likelihood.param_vector),
                      'inference': model.likelihood.gpml_inference_method,
                      'iters': str(30),
                      'seed': str(random_seed)}
        code = gpml.PREDICT_AND_SAVE_CODE % parameters
        code = re.sub('% ', '%% ', code)  # HACK - cblparallel currently does not like % signs
        # Evaluate code - potentially on cluster
        if local_computation:
            temp_results_file = cblparallel.run_batch_locally([code], language='matlab', max_cpu=1.1, max_mem=1.1, verbose=verbose)[0]
        else:
            temp_results_file = cblparallel.run_batch_on_fear([code], language='matlab', max_jobs=max_jobs, verbose=verbose)[0]
        try:
            results = scipy.io.loadmat(temp_results_file)
        finally:
            # Remove the results file (perhaps on the cluster server) even if it could not be read
            cblparallel.remove_temp_file(temp_results_file, local_computation)
    finally:
        cblparallel.remove_temp_file(data_file, local_computation)
    # Return dictionary of MATLAB results
    return results
def evaluate_models(models,
                    X,
                    y,
                    verbose=True,
                    iters=300,
                    local_computation=False,
                    zip_files=False,
                    max_jobs=500,
                    random_seed=0,
                    subset=False,
                    subset_size=250,
                    full_iters=0,
                    bundle_size=1):
    '''
    Generates one MATLAB optimisation script per model, runs them and reads back the results
    Input:
     - models            - A list of objects with mean, kernel and likelihood attributes
     - X                 - Input locations; a 1-d array is turned into a single column
     - y                 - Output values; a 1-d array is turned into a single column
     - verbose           - Print progress and pass the flag on to cblparallel
     - iters             - Substituted into the script template as 'iters'
     - local_computation - If True run with cblparallel.run_batch_locally, otherwise copy
                           the data file to the remote machine and use run_batch_on_fear
     - zip_files, max_jobs, bundle_size - Passed to run_batch_on_fear (not used for local runs)
     - random_seed       - Accepted but not used here; every script gets its own seed
                           drawn from np.random.randint(2**31)
     - subset            - Substituted into the script as the text 'true' or 'false'
     - subset_size       - Substituted into the script template as 'subset_size'
     - full_iters        - Substituted into the script template as 'full_iters'
    Return:
     - A list with one entry per model, each built by GPModel.from_matlab_output
       from the matching output file
    '''

    # Make data into matrices in case they're unidimensional.
    if X.ndim == 1:
        X = X[:, nax]
    if y.ndim == 1:
        y = y[:, nax]
    ndata = y.shape[0]

    # Create data file
    if verbose:
        print('Creating data file locally')
    data_file = cblparallel.create_temp_file('.mat')

    scipy.io.savemat(data_file, {'X': X, 'y': y})

    # Move to fear if necessary
    if not local_computation:
        if verbose:
            print('Moving data file to fear')
        cblparallel.copy_to_remote(data_file)

    # The data file now exists wherever the final clean-up expects it, so make
    # sure it is removed even if script generation or the batch run fails.
    try:
        # Create a list of MATLAB scripts to assess and optimise parameters for each kernel
        if verbose:
            print('Creating scripts')
        dimensions = X.shape[1]
        scripts = []
        for model in models:
            parameters = {
                'datafile':
                data_file.split('/')[-1],
                'writefile':
                '%(output_file)s',  # N.B. cblparallel manages output files
                'gpml_path':
                cblparallel.gpml_path(local_computation),
                'mean_syntax':
                model.mean.get_gpml_expression(dimensions=dimensions),
                'mean_params':
                '[ %s ]' % ' '.join(str(p) for p in model.mean.param_vector),
                'kernel_syntax':
                model.kernel.get_gpml_expression(dimensions=dimensions),
                'kernel_params':
                '[ %s ]' % ' '.join(str(p) for p in model.kernel.param_vector),
                'lik_syntax':
                model.likelihood.get_gpml_expression(dimensions=dimensions),
                'lik_params':
                '[ %s ]' % ' '.join(str(p) for p in model.likelihood.param_vector),
                'inference':
                model.likelihood.gpml_inference_method,
                'iters':
                str(iters),
                'seed':
                str(np.random.randint(2**31)),
                'subset':
                'true' if subset else 'false',
                'subset_size':
                str(subset_size),
                'full_iters':
                str(full_iters)
            }

            #### Need to be careful with % signs
            #### For the moment, cblparallel expects no single % signs - FIXME
            scripts.append(
                re.sub('% ', '%% ', gpml.OPTIMIZE_KERNEL_CODE % parameters))

        # Send to cblparallel and save output_files
        if verbose:
            print('Sending scripts to cblparallel')
        if local_computation:
            output_files = cblparallel.run_batch_locally(scripts,
                                                         language='matlab',
                                                         max_cpu=1.1,
                                                         job_check_sleep=5,
                                                         submit_sleep=0.1,
                                                         max_running_jobs=10,
                                                         verbose=verbose)
        else:
            output_files = cblparallel.run_batch_on_fear(scripts,
                                                         language='matlab',
                                                         max_jobs=max_jobs,
                                                         verbose=verbose,
                                                         zip_files=zip_files,
                                                         bundle_size=bundle_size)

        try:
            # Read in results
            results = [None] * len(models)
            for (i, output_file) in enumerate(output_files):
                if verbose:
                    print('Reading output file %d of %d' % (i + 1, len(models)))
                results[i] = GPModel.from_matlab_output(
                    gpml.read_outputs(output_file), models[i], ndata)
        finally:
            # Tidy up local output files, even if one of them could not be read
            for (i, output_file) in enumerate(output_files):
                if verbose:
                    print('Removing output file %d of %d' % (i + 1, len(models)))
                os.remove(output_file)
    finally:
        # Remove temporary data file (perhaps on the cluster server)
        cblparallel.remove_temp_file(data_file, local_computation)

    # Return results, one per model
    return results
def evaluate_models(models, X, y, verbose=True, iters=300, local_computation=False, zip_files=False, max_jobs=500, random_seed=0, subset=False, subset_size=250, full_iters=0, bundle_size=1):
    '''
    Generates one MATLAB optimisation script per model, runs them and reads back the results
    Input:
     - models            - A list of objects with mean, kernel and likelihood attributes
     - X                 - Input locations; a 1-d array is turned into a single column
     - y                 - Output values; a 1-d array is turned into a single column
     - verbose           - Print progress and pass the flag on to cblparallel
     - iters             - Substituted into the script template as 'iters'
     - local_computation - If True run with cblparallel.run_batch_locally, otherwise copy
                           the data file to the remote machine and use run_batch_on_fear
     - zip_files, max_jobs, bundle_size - Passed to run_batch_on_fear (not used for local runs)
     - random_seed       - Accepted but not used here; every script gets its own seed
                           drawn from np.random.randint(2**31)
     - subset            - Substituted into the script as the text 'true' or 'false'
     - subset_size       - Substituted into the script template as 'subset_size'
     - full_iters        - Substituted into the script template as 'full_iters'
    Return:
     - A list with one entry per model, each built by GPModel.from_matlab_output
       from the matching output file
    '''

    # Make data into matrices in case they're unidimensional.
    if X.ndim == 1:
        X = X[:, nax]
    if y.ndim == 1:
        y = y[:, nax]
    ndata = y.shape[0]

    # Create data file
    if verbose:
        print('Creating data file locally')
    data_file = cblparallel.create_temp_file('.mat')

    scipy.io.savemat(data_file, {'X': X, 'y': y})

    # Move to fear if necessary
    if not local_computation:
        if verbose:
            print('Moving data file to fear')
        cblparallel.copy_to_remote(data_file)

    # The data file now exists wherever the final clean-up expects it, so make
    # sure it is removed even if script generation or the batch run fails.
    try:
        # Create a list of MATLAB scripts to assess and optimise parameters for each kernel
        if verbose:
            print('Creating scripts')
        dimensions = X.shape[1]
        scripts = []
        for model in models:
            parameters = {'datafile': data_file.split('/')[-1],
                          'writefile': '%(output_file)s', # N.B. cblparallel manages output files
                          'gpml_path': cblparallel.gpml_path(local_computation),
                          'mean_syntax': model.mean.get_gpml_expression(dimensions=dimensions),
                          'mean_params': '[ %s ]' % ' '.join(str(p) for p in model.mean.param_vector),
                          'kernel_syntax': model.kernel.get_gpml_expression(dimensions=dimensions),
                          'kernel_params': '[ %s ]' % ' '.join(str(p) for p in model.kernel.param_vector),
                          'lik_syntax': model.likelihood.get_gpml_expression(dimensions=dimensions),
                          'lik_params': '[ %s ]' % ' '.join(str(p) for p in model.likelihood.param_vector),
                          'inference': model.likelihood.gpml_inference_method,
                          'iters': str(iters),
                          'seed': str(np.random.randint(2**31)),
                          'subset': 'true' if subset else 'false',
                          'subset_size' : str(subset_size),
                          'full_iters' : str(full_iters)}

            #### Need to be careful with % signs
            #### For the moment, cblparallel expects no single % signs - FIXME
            scripts.append(re.sub('% ', '%% ', gpml.OPTIMIZE_KERNEL_CODE % parameters))

        # Send to cblparallel and save output_files
        if verbose:
            print('Sending scripts to cblparallel')
        if local_computation:
            output_files = cblparallel.run_batch_locally(scripts, language='matlab', max_cpu=1.1, job_check_sleep=5, submit_sleep=0.1, max_running_jobs=10, verbose=verbose)
        else:
            output_files = cblparallel.run_batch_on_fear(scripts, language='matlab', max_jobs=max_jobs, verbose=verbose, zip_files=zip_files, bundle_size=bundle_size)

        try:
            # Read in results
            results = [None] * len(models)
            for (i, output_file) in enumerate(output_files):
                if verbose:
                    print('Reading output file %d of %d' % (i + 1, len(models)))
                results[i] = GPModel.from_matlab_output(gpml.read_outputs(output_file), models[i], ndata)
        finally:
            # Tidy up local output files, even if one of them could not be read
            for (i, output_file) in enumerate(output_files):
                if verbose:
                    print('Removing output file %d of %d' % (i + 1, len(models)))
                os.remove(output_file)
    finally:
        # Remove temporary data file (perhaps on the cluster server)
        cblparallel.remove_temp_file(data_file, local_computation)

    # Return results, one per model
    return results
def evaluate_models(
    models,
    X,
    y,
    verbose=True,
    iters=300,
    local_computation=False,
    zip_files=False,
    max_jobs=500,
    random_seed=0,
    subset=False,
    subset_size=250,
    full_iters=0,
    bundle_size=1,
):
    """
    Generates one MATLAB optimisation script per model, runs them and reads back the results
    Input:
     - models            - A list of objects with mean, kernel and likelihood attributes
     - X                 - Input locations; a 1-d array is turned into a single column
     - y                 - Output values; a 1-d array is turned into a single column
     - verbose           - Print progress and pass the flag on to cblparallel
     - iters             - Substituted into the script template as 'iters'
     - local_computation - If True run with cblparallel.run_batch_locally, otherwise copy
                           the data file to the remote machine and use run_batch_on_fear
     - zip_files, max_jobs, bundle_size - Passed to run_batch_on_fear (not used for local runs)
     - random_seed       - Accepted but not used here; every script gets its own seed
                           drawn from np.random.randint(2 ** 31)
     - subset            - Substituted into the script as the text 'true' or 'false'
     - subset_size       - Substituted into the script template as 'subset_size'
     - full_iters        - Substituted into the script template as 'full_iters'
    Return:
     - A list with one entry per model, each built by GPModel.from_matlab_output
       from the matching output file
    """

    # Make data into matrices in case they're unidimensional.
    if X.ndim == 1:
        X = X[:, nax]
    if y.ndim == 1:
        y = y[:, nax]
    ndata = y.shape[0]

    # Create data file
    if verbose:
        print("Creating data file locally")
    data_file = cblparallel.create_temp_file(".mat")

    scipy.io.savemat(data_file, {"X": X, "y": y})

    # Move to fear if necessary
    if not local_computation:
        if verbose:
            print("Moving data file to fear")
        cblparallel.copy_to_remote(data_file)

    # The data file now exists wherever the final clean-up expects it, so make
    # sure it is removed even if script generation or the batch run fails.
    try:
        # Create a list of MATLAB scripts to assess and optimise parameters for each kernel
        if verbose:
            print("Creating scripts")
        dimensions = X.shape[1]
        scripts = []
        for model in models:
            parameters = {
                "datafile": data_file.split("/")[-1],
                "writefile": "%(output_file)s",  # N.B. cblparallel manages output files
                "gpml_path": cblparallel.gpml_path(local_computation),
                "mean_syntax": model.mean.get_gpml_expression(dimensions=dimensions),
                "mean_params": "[ %s ]" % " ".join(str(p) for p in model.mean.param_vector),
                "kernel_syntax": model.kernel.get_gpml_expression(dimensions=dimensions),
                "kernel_params": "[ %s ]" % " ".join(str(p) for p in model.kernel.param_vector),
                "lik_syntax": model.likelihood.get_gpml_expression(dimensions=dimensions),
                "lik_params": "[ %s ]" % " ".join(str(p) for p in model.likelihood.param_vector),
                "inference": model.likelihood.gpml_inference_method,
                "iters": str(iters),
                "seed": str(np.random.randint(2 ** 31)),
                "subset": "true" if subset else "false",
                "subset_size": str(subset_size),
                "full_iters": str(full_iters),
            }

            #### Need to be careful with % signs
            #### For the moment, cblparallel expects no single % signs - FIXME
            scripts.append(re.sub("% ", "%% ", gpml.OPTIMIZE_KERNEL_CODE % parameters))

        # Send to cblparallel and save output_files
        if verbose:
            print("Sending scripts to cblparallel")
        if local_computation:
            output_files = cblparallel.run_batch_locally(
                scripts,
                language="matlab",
                max_cpu=1.1,
                job_check_sleep=5,
                submit_sleep=0.1,
                max_running_jobs=10,
                verbose=verbose,
            )
        else:
            output_files = cblparallel.run_batch_on_fear(
                scripts, language="matlab", max_jobs=max_jobs, verbose=verbose, zip_files=zip_files, bundle_size=bundle_size
            )

        try:
            # Read in results
            results = [None] * len(models)
            for (i, output_file) in enumerate(output_files):
                if verbose:
                    print("Reading output file %d of %d" % (i + 1, len(models)))
                results[i] = GPModel.from_matlab_output(gpml.read_outputs(output_file), models[i], ndata)
        finally:
            # Tidy up local output files, even if one of them could not be read
            for (i, output_file) in enumerate(output_files):
                if verbose:
                    print("Removing output file %d of %d" % (i + 1, len(models)))
                os.remove(output_file)
    finally:
        # Remove temporary data file (perhaps on the cluster server)
        cblparallel.remove_temp_file(data_file, local_computation)

    # Return results, one per model
    return results
Exemple #10
0
def covariance_distance(kernels, X, local_computation=True, verbose=True):
    '''
    Evaluate a distance matrix of kernels, in terms of their covariance matrix evaluated on training inputs
    Input:
     - kernels           - A list of fk.ScoredKernel (must not be empty; the k_opt kernel of each is used)
     - X                 - A matrix (data_points x dimensions) of input locations
     - local_computation - Boolean indicating if computation should be performed on cluster or on local machine
     - verbose           - Print progress and pass the flag on to cblparallel
    Return:
     - A matrix of similarities between the input kernels
       (the 'sim_matrix' entry of the MATLAB output file)
    '''
    assert (len(kernels) > 0
            )  #### FIXME - This sort of check should happen earlier
    # Make data into matrices in case they're unidimensional.
    if X.ndim == 1:
        X = X[:, nax]
    # Save temporary data file in standard temporary directory
    data_file = cblparallel.create_temp_file('.mat')
    scipy.io.savemat(data_file, {'X': X})
    # Copy onto cluster server if necessary
    if not local_computation:
        if verbose:
            print('Moving data file to fear')
        cblparallel.copy_to_remote(data_file)
    # The data file now exists wherever the final clean-up expects it, so make
    # sure it is removed even if script generation or the job itself fails.
    try:
        # Construct testing code: header, one section per kernel, footer
        pieces = [
            gpml.DISTANCE_CODE_HEADER % {
                'datafile': data_file.split('/')[-1],
                'gpml_path': cblparallel.gpml_path(local_computation)
            }
        ]
        for (i, kernel) in enumerate([k.k_opt for k in kernels]):
            pieces.append(gpml.DISTANCE_CODE_COV % {
                'iter':
                i + 1,  # sections are numbered from 1
                'kernel_family':
                kernel.gpml_kernel_expression(),
                'kernel_params':
                '[ %s ]' % ' '.join(str(p) for p in kernel.param_vector())
            })
        pieces.append(gpml.DISTANCE_CODE_FOOTER_HIGH_MEM % {
            'writefile': '%(output_file)s'
        })  # N.B. cblparallel manages output files
        code = ''.join(pieces)
        code = re.sub('% ', '%% ',
                      code)  # HACK - cblparallel not fond of % signs at the moment
        # Run code - either locally or on cluster - returning location of output file
        if local_computation:
            output_file = cblparallel.run_batch_locally([code],
                                                        language='matlab',
                                                        max_cpu=1.1,
                                                        max_mem=1.1,
                                                        job_check_sleep=30,
                                                        verbose=verbose,
                                                        single_thread=False)[0]
        else:
            output_file = cblparallel.run_batch_on_fear([code],
                                                        language='matlab',
                                                        max_jobs=500,
                                                        verbose=verbose)[0]
        try:
            # Read in results from experiment
            gpml_result = scipy.io.loadmat(output_file)
            distance = gpml_result['sim_matrix']
        finally:
            # Remove the output file (perhaps on the cluster server) even if it could not be read
            cblparallel.remove_temp_file(output_file, local_computation)
    finally:
        cblparallel.remove_temp_file(data_file, local_computation)
    # Return distance matrix
    return distance
Exemple #11
0
def make_predictions(X,
                     y,
                     Xtest,
                     ytest,
                     best_scored_kernel,
                     local_computation=False,
                     max_jobs=500,
                     verbose=True,
                     zero_mean=False,
                     random_seed=0):
    '''
    Evaluates a kernel on held out data by running a generated MATLAB script
    Input:
     - X                  - A matrix (data_points x dimensions) of input locations
     - y                  - A matrix (data_points x 1) of output values
     - Xtest              - Held out X data
     - ytest              - Held out y data
     - best_scored_kernel - A Scored Kernel object to be evaluated on the held out data
                            (its k_opt kernel and its noise attribute are used)
     - local_computation  - If True run with cblparallel.run_batch_locally, otherwise copy
                            the data file to the remote machine and use run_batch_on_fear
     - max_jobs           - Passed to run_batch_on_fear (not used for local runs)
     - verbose            - Print progress and pass the flag on to cblparallel
     - zero_mean          - Use gpml.PREDICT_AND_SAVE_CODE_ZERO_MEAN instead of
                            gpml.PREDICT_AND_SAVE_CODE as the script template
     - random_seed        - Substituted into the script template as 'seed'
    Return:
     - A dictionary of results from the MATLAB script containing:
       - loglik - an array of log likelihoods of test data
       - predictions - an array of mean predictions for the held out data
       - actuals - ytest
       - model - I'm not sure FIXME
       - timestamp - A time stamp of some sort
    '''
    # Make data into matrices in case they're unidimensional.
    if X.ndim == 1:
        X = X[:, nax]
    if y.ndim == 1:
        y = y[:, nax]
    # Save temporary data file in standard temporary directory
    data_file = cblparallel.create_temp_file('.mat')
    scipy.io.savemat(data_file, {
        'X': X,
        'y': y,
        'Xtest': Xtest,
        'ytest': ytest
    })
    # Copy onto cluster server if necessary
    if not local_computation:
        if verbose:
            print('Moving data file to fear')
        cblparallel.copy_to_remote(data_file)
    # The data file now exists wherever the final clean-up expects it, so make
    # sure it is removed even if script generation or the job itself fails.
    try:
        # Create prediction code
        kernel = best_scored_kernel.k_opt
        parameters = {
            'datafile':
            data_file.split('/')[-1],
            'writefile':
            '%(output_file)s',  # N.B. filled in later by cblparallel
            'gpml_path':
            cblparallel.gpml_path(local_computation),
            'kernel_family':
            kernel.gpml_kernel_expression(),
            'kernel_params':
            '[ %s ]' % ' '.join(str(p) for p in kernel.param_vector()),
            'noise':
            str(best_scored_kernel.noise),
            'iters':
            str(30),
            'seed':
            str(random_seed)
        }
        if zero_mean:
            template = gpml.PREDICT_AND_SAVE_CODE_ZERO_MEAN
        else:
            template = gpml.PREDICT_AND_SAVE_CODE
        code = template % parameters
        code = re.sub('% ', '%% ',
                      code)  # HACK - cblparallel currently does not like % signs
        # Evaluate code - potentially on cluster
        if local_computation:
            temp_results_file = cblparallel.run_batch_locally([code],
                                                              language='matlab',
                                                              max_cpu=1.1,
                                                              max_mem=1.1,
                                                              verbose=verbose)[0]
        else:
            temp_results_file = cblparallel.run_batch_on_fear([code],
                                                              language='matlab',
                                                              max_jobs=max_jobs,
                                                              verbose=verbose)[0]
        try:
            results = scipy.io.loadmat(temp_results_file)
        finally:
            # Remove the results file (perhaps on the cluster server) even if it could not be read
            cblparallel.remove_temp_file(temp_results_file, local_computation)
    finally:
        cblparallel.remove_temp_file(data_file, local_computation)
    # Return dictionary of MATLAB results
    return results
Exemple #12
0
def evaluate_kernels(kernels,
                     X,
                     y,
                     verbose=True,
                     noise=None,
                     iters=300,
                     local_computation=False,
                     zip_files=False,
                     max_jobs=500,
                     zero_mean=False,
                     random_seed=0):
    '''
    Sets up the kernel optimisation and nll calculation experiments, returns the results as scored kernels
    Input:
     - kernels           - A list of kernels (i.e. not scored kernels)
     - X                 - A matrix (data_points x dimensions) of input locations
     - y                 - A matrix (data_points x 1) of output values
     - verbose           - If True, print progress messages (also forwarded to cblparallel)
     - noise             - Noise value substituted into the scripts; defaults to log(var(y) / 10)
     - iters             - Iteration count substituted into the scripts
     - local_computation - If True, run the scripts locally, otherwise copy the data file
                           to the remote server and run the scripts there
     - zip_files         - Forwarded to cblparallel.run_batch_on_fear (remote runs only)
     - max_jobs          - Forwarded to cblparallel.run_batch_on_fear (remote runs only)
     - zero_mean         - If True, use gpml.OPTIMIZE_KERNEL_CODE_ZERO_MEAN rather than
                           gpml.OPTIMIZE_KERNEL_CODE as the script template
     - random_seed       - Seed value substituted into the scripts
    Return:
     - A list of ScoredKernel objects
    '''

    # Make data into matrices in case they're unidimensional.
    if X.ndim == 1: X = X[:, nax]
    if y.ndim == 1: y = y[:, nax]
    ndata = y.shape[0]

    # Set default noise using a heuristic.
    if noise is None:
        noise = np.log(np.var(y) / 10)

    # Create data file
    # N.B. Single-argument print(...) behaves exactly like the print statement
    # on Python 2, so the messages below are unchanged.
    if verbose:
        print('Creating data file locally')
    data_file = cblparallel.create_temp_file('.mat')
    scipy.io.savemat(data_file, {'X': X, 'y': y})  # Save regression data

    # Move to fear if necessary
    if not local_computation:
        if verbose:
            print('Moving data file to fear')
        cblparallel.copy_to_remote(data_file)

    # Create a list of MATLAB scripts to assess and optimise parameters for each kernel
    if verbose:
        print('Creating scripts')
    scripts = [None] * len(kernels)
    for (i, kernel) in enumerate(kernels):
        param_vector = kernel.param_vector()
        # Partition the parameter vector once: list-valued entries are reported
        # separately (with their positions) from the remaining entries.
        list_positions = [pos for (pos, p) in enumerate(param_vector)
                          if isinstance(p, list)]
        list_params = [param_vector[pos] for pos in list_positions]
        plain_params = [p for p in param_vector if not isinstance(p, list)]
        parameters = {
            'datafile': data_file.split('/')[-1],
            'writefile': '%(output_file)s',  # N.B. cblparallel manages output files
            'gpml_path': cblparallel.gpml_path(local_computation),
            'kernel_family': kernel.gpml_kernel_expression(),
            'kernel_params': '[ %s ]' % ' '.join(str(p) for p in plain_params),
            'eff_dimensions': '[ %s ]' % ';'.join(str(p) for p in list_params),
            'dim_positions': '[ %s ]' % ' '.join(str(pos) for pos in list_positions),
            'noise': str(noise),
            'iters': str(iters),
            'seed': str(random_seed),
        }
        # Diagnostic output - only shown when the caller asked for verbosity
        if verbose:
            print('%s %s %s' % (parameters['kernel_params'],
                                parameters['eff_dimensions'],
                                parameters['dim_positions']))
        if zero_mean:
            scripts[i] = gpml.OPTIMIZE_KERNEL_CODE_ZERO_MEAN % parameters
        else:
            scripts[i] = gpml.OPTIMIZE_KERNEL_CODE % parameters
        #### Need to be careful with % signs
        #### For the moment, cblparallel expects no single % signs - FIXME
        scripts[i] = re.sub('% ', '%% ', scripts[i])

    # Send to cblparallel and save output_files
    if verbose:
        print('Sending scripts to cblparallel')
    if local_computation:
        output_files = cblparallel.run_batch_locally(scripts,
                                                     language='matlab',
                                                     max_cpu=1.1,
                                                     job_check_sleep=5,
                                                     submit_sleep=0.1,
                                                     max_running_jobs=10,
                                                     verbose=verbose)
    else:
        output_files = cblparallel.run_batch_on_fear(scripts,
                                                     language='matlab',
                                                     max_jobs=max_jobs,
                                                     verbose=verbose,
                                                     zip_files=zip_files)

    # Read in results
    results = [None] * len(kernels)
    for (i, output_file) in enumerate(output_files):
        if verbose:
            print('Reading output file %d of %d: %s' % (i + 1, len(kernels),
                                                        output_file))
        results[i] = ScoredKernel.from_matlab_output(
            gpml.read_outputs(output_file), kernels[i].family(), ndata)

    # Tidy up local output files
    for (i, output_file) in enumerate(output_files):
        if verbose:
            print('Removing output file %d of %d' % (i + 1, len(kernels)))
        os.remove(output_file)
    # Remove temporary data file (perhaps on the cluster server)
    cblparallel.remove_temp_file(data_file, local_computation)

    # Return results i.e. list of ScoredKernel objects
    return results
def evaluate_kernels(kernels, X, y, verbose=True, noise=None, iters=300,
                     local_computation=False, zip_files=False, max_jobs=500,
                     zero_mean=False, random_seed=0):
    '''
    Builds one MATLAB optimisation script per kernel, runs them through cblparallel and wraps the outputs as scored kernels
    Input:
     - kernels           - A list of kernels (i.e. not scored kernels)
     - X                 - A matrix (data_points x dimensions) of input locations
     - y                 - A matrix (data_points x 1) of output values
     - verbose           - If True, print progress messages (also forwarded to cblparallel)
     - noise             - Noise value substituted into the scripts; defaults to log(var(y) / 10)
     - iters             - Iteration count substituted into the scripts
     - local_computation - If True, run locally, otherwise copy the data file to the remote server and run there
     - zip_files         - Forwarded to cblparallel.run_batch_on_fear (remote runs only)
     - max_jobs          - Forwarded to cblparallel.run_batch_on_fear (remote runs only)
     - zero_mean         - If True, use the zero mean script template from gpml
     - random_seed       - Seed value substituted into the scripts
    Return:
     - A list of ScoredKernel objects
    '''
    # Promote one dimensional arrays to column matrices.
    if X.ndim == 1:
        X = X[:, nax]
    if y.ndim == 1:
        y = y[:, nax]
    ndata = y.shape[0]

    # Fall back to the heuristic noise level when none was supplied.
    if noise is None:
        noise = np.log(np.var(y)/10)

    # Write the regression data to a temporary .mat file
    # (single-argument print(...) is equivalent to the print statement on Python 2)
    if verbose:
        print('Creating data file locally')
    data_file = cblparallel.create_temp_file('.mat')
    scipy.io.savemat(data_file, {'X': X, 'y': y})

    # Remote runs need the data file on the cluster server
    if not local_computation:
        if verbose:
            print('Moving data file to fear')
        cblparallel.copy_to_remote(data_file)

    # Generate one MATLAB script per kernel
    if verbose:
        print('Creating scripts')
    n_kernels = len(kernels)
    data_name = data_file.split('/')[-1]
    if zero_mean:
        template = gpml.OPTIMIZE_KERNEL_CODE_ZERO_MEAN
    else:
        template = gpml.OPTIMIZE_KERNEL_CODE
    scripts = []
    for kernel in kernels:
        substitutions = {}
        substitutions['datafile'] = data_name
        # N.B. the output file placeholder is left for cblparallel to fill in
        substitutions['writefile'] = '%(output_file)s'
        substitutions['gpml_path'] = cblparallel.gpml_path(local_computation)
        substitutions['kernel_family'] = kernel.gpml_kernel_expression()
        param_text = ' '.join(str(p) for p in kernel.param_vector())
        substitutions['kernel_params'] = '[ %s ]' % param_text
        substitutions['noise'] = str(noise)
        substitutions['iters'] = str(iters)
        substitutions['seed'] = str(random_seed)
        script = template % substitutions
        # cblparallel currently expects no single % signs, so double them - FIXME
        scripts.append(re.sub('% ', '%% ', script))

    # Hand the scripts to cblparallel and collect the output file names
    if verbose:
        print('Sending scripts to cblparallel')
    if local_computation:
        output_files = cblparallel.run_batch_locally(
            scripts,
            language='matlab',
            max_cpu=1.1,
            job_check_sleep=5,
            submit_sleep=0.1,
            max_running_jobs=10,
            verbose=verbose)
    else:
        output_files = cblparallel.run_batch_on_fear(
            scripts,
            language='matlab',
            max_jobs=max_jobs,
            verbose=verbose,
            zip_files=zip_files)

    # Parse each output file into a ScoredKernel
    results = [None] * n_kernels
    for (idx, path) in enumerate(output_files):
        if verbose:
            print('Reading output file %d of %d' % (idx + 1, n_kernels))
        matlab_output = gpml.read_outputs(path)
        results[idx] = ScoredKernel.from_matlab_output(matlab_output, kernels[idx].family(), ndata)

    # Delete the local output files
    for (count, path) in enumerate(output_files, 1):
        if verbose:
            print('Removing output file %d of %d' % (count, n_kernels))
        os.remove(path)
    # Delete the temporary data file (perhaps on the cluster server)
    cblparallel.remove_temp_file(data_file, local_computation)

    return results