# Test method from a pytest class: `tmpdir` is pytest's built-in fixture,
# `np` is numpy, `npt` is numpy.testing, and `ld` is the project's
# learning_data module.
def test_to_from_hdf(self, tmpdir):
    file_name = str(tmpdir) + "/penguin.hdf"
    variable_names = [
        'lst1', 'lst2', 'lst3', 'snow1', 'snow2', 'ndvi1', 'ndvi2',
        'ndvi3', 'ndvi4', 'ndvi5'
    ]
    dat = np.random.rand(10, 11)
    years = range(10)
    learning_data = ld.LearningData()
    learning_data.from_data(dat, variable_names, 'penguin')
    learning_data.meta_layers['years'] = np.array(years)
    learning_data.attributes = {'penguin': 'yes', 'tophat': 'no'}
    learning_data.to_hdf(file_name)
    training_data = ld.LearningData()
    training_data.from_hdf(file_name)
    assert training_data.num_variables == 10
    assert training_data.num_observations == 10
    npt.assert_array_equal(training_data.variable_names, variable_names)
    npt.assert_array_equal(training_data.unique_variable_prefixes,
                           ['lst', 'snow', 'ndvi'])
    npt.assert_array_equal(training_data.variable_type_indices, [2, 4, 9])
    assert training_data.name == 'penguin'
    npt.assert_array_equal(training_data.meta_layers['years'],
                           np.array(years))
    npt.assert_array_equal(training_data.predictors, dat[:, :-1])
    npt.assert_array_equal(training_data.response, dat[:, -1])
    npt.assert_array_equal(training_data.design_matrix.dat, dat)
    assert training_data.attributes['penguin'] == 'yes'
    assert training_data.attributes['tophat'] == 'no'
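
# The prefix and type-index assertions above follow from the variable names:
# stripping trailing digits yields the unique prefixes in first-seen order,
# and each type index marks the last column of a prefix group. A standalone
# sketch (hypothetical helper, not the library's implementation):
import re

def prefix_summary(variable_names):
    prefixes = [re.sub(r'\d+$', '', name) for name in variable_names]
    unique = list(dict.fromkeys(prefixes))  # preserves first-seen order
    type_indices = [max(i for i, p in enumerate(prefixes) if p == u)
                    for u in unique]
    return unique, type_indices

names = ['lst1', 'lst2', 'lst3', 'snow1', 'snow2',
         'ndvi1', 'ndvi2', 'ndvi3', 'ndvi4', 'ndvi5']
print(prefix_summary(names))  # (['lst', 'snow', 'ndvi'], [2, 4, 9])
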
def change_basis(training_data_path, feature_file_path, output_file_path):
    training_data = ld.LearningData()
    training_data.from_file(training_data_path)
    pset = experiment.get_pset(training_data.num_variables, training_data.variable_type_indices,
                               training_data.variable_names, training_data.variable_dict)
    pset.addPrimitive(numpy.power, 2)
    predictors_transformed, response_transformed = experiment.transform_features(training_data.predictors,
                                                                                 training_data.response)[0:2]
    validation_toolbox = experiment.get_validation_toolbox(predictors_transformed, response_transformed, pset)
    features = get_features(feature_file_path, training_data.unique_variable_prefixes, pset,
                            validation_toolbox)
    basis, feature_names = build_basis_and_feature_names(features, training_data.num_observations, validation_toolbox,
                                                         response_transformed)
    basis_data = ld.LearningData()
    basis_data.from_data(basis, feature_names, output_file_path)
    basis_data.to_hdf(output_file_path)
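
# Hypothetical invocation of change_basis (paths are illustrative; the
# training data and feature list must already exist on disk):
change_basis('data/train.hdf', 'results/features.txt', 'data/basis.hdf')
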
# Verifies that from_data scrubs non-finite values; `np` is numpy and
# `ld` is the project's learning_data module.
def test_good_values(self):
    dat = np.random.rand(10, 11)
    dat[1, 2] = np.inf
    dat[5, 3] = -np.inf
    dat[7, 9] = np.nan
    dat[9, 3] = np.inf
    learning_data = ld.LearningData()
    learning_data.from_data(dat, None, 'penguin')
    assert not np.isnan(learning_data.predictors).any()
    assert np.isfinite(learning_data.predictors).all()
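
# A minimal sketch of the kind of scrubbing the test above implies; this is
# a hypothetical helper, not the library's actual implementation.
import numpy as np

def replace_non_finite(dat, fill=0.0):
    # Substitute inf, -inf, and nan entries with a finite fill value.
    out = dat.copy()
    out[~np.isfinite(out)] = fill
    return out
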
def build_random_training_data(shape):
    training_data = learning_data.LearningData()
    training_data.num_observations = shape[0]
    training_data.num_variables = shape[1]
    training_data.predictors = numpy.random.rand(
        training_data.num_observations, training_data.num_variables)
    training_data.response = numpy.random.rand(training_data.num_observations,
                                               1)
    training_data.variable_names = utilities.design_matrix.get_simple_variable_names(
        training_data.num_variables)
    training_data.variable_type_indices = \
        learning_data.get_default_variable_type_indices(training_data.num_variables)
    training_data.variable_dict = utilities.learning_data.get_simple_variable_dict(
        training_data.num_variables)
    return training_data
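
# Usage sketch for build_random_training_data: a 100-observation,
# 5-variable random data set (shapes follow from the code above).
training_data = build_random_training_data((100, 5))
assert training_data.predictors.shape == (100, 5)
assert training_data.response.shape == (100, 1)
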
# Reconstructed header: the original snippet was truncated, so the function
# name and signature below are inferred from the variables it uses; assumes
# `logging` and `numpy` are imported at module level.
def summarize_terms(term_frequency, term_coefficients, term_performances):
    most_frequent_terms = sorted(term_frequency,
                                 key=term_frequency.get,
                                 reverse=True)
    logging.info("Term : Frequency : Coefficients")
    for term in most_frequent_terms:
        logging.info("{} : {} : {}".format(term, term_frequency[term],
                                           term_coefficients[term]))
    term_counts = [term_frequency[term] for term in most_frequent_terms]
    # Renamed from term_performances to avoid shadowing the input dict.
    mean_performances = [
        numpy.mean(term_performances[term]) for term in most_frequent_terms
    ]
    return most_frequent_terms, term_counts, mean_performances
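
# Hypothetical usage of the reconstructed helper above: frequency,
# coefficient, and performance statistics keyed by term string.
terms, counts, mean_perf = summarize_terms(
    {'x0*x1': 3, 'x2': 1},
    {'x0*x1': [0.5, 0.4, 0.6], 'x2': [1.2]},
    {'x0*x1': [0.9, 0.8, 0.85], 'x2': [0.7]})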


# Script-level driver: `args` comes from an argparse parser defined earlier
# in the file.
training_data = learning_data.LearningData()
training_data.from_file(args.training)
pset = experiment.get_pset(training_data.num_variables,
                           training_data.variable_type_indices,
                           training_data.variable_names,
                           training_data.variable_dict)
predictors_transformed, response_transformed = experiment.transform_features(
    training_data.predictors, training_data.response)[0:2]
pareto_files = lib.get_pareto_files(args.results, args.experiment,
                                    training_data.name)
logging.info("Number of pareto files = {}".format(len(pareto_files)))
validation_toolbox = experiment.get_validation_toolbox(predictors_transformed,
                                                       response_transformed,
                                                       pset)
front = gp_processing_tools.validate_pareto_optimal_inds(
    pareto_files, pset=pset, toolbox=validation_toolbox)
def build_hdf(file_name, matrix, years, names):
    learning_data = ld.LearningData()
    learning_data.from_data(matrix, names, 'none')
    learning_data.meta_layers['years'] = years
    learning_data.attributes = get_hdf_attributes()
    learning_data.to_hdf(file_name)
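
# Hypothetical call to build_hdf; `np` is numpy, get_hdf_attributes is
# assumed to be defined elsewhere in this module, and (per the tests above)
# from_data treats the last matrix column as the response, so six names
# describe a seven-column matrix.
matrix = np.random.rand(20, 7)
years = np.arange(2000, 2020)
names = ['lst1', 'lst2', 'snow1', 'snow2', 'ndvi1', 'ndvi2']
build_hdf('example.hdf', matrix, years, names)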