def test_to_from_hdf(self, tmpdir):
    file_name = str(tmpdir) + "/penguin.hdf"
    variable_names = ['lst1', 'lst2', 'lst3', 'snow1', 'snow2',
                      'ndvi1', 'ndvi2', 'ndvi3', 'ndvi4', 'ndvi5']
    dat = np.random.rand(10, 11)
    years = range(10)
    learning_data = ld.LearningData()
    learning_data.from_data(dat, variable_names, 'penguin')
    learning_data.meta_layers['years'] = np.array(years)
    learning_data.attributes = {'penguin': 'yes', 'tophat': 'no'}
    learning_data.to_hdf(file_name)
    training_data = ld.LearningData()
    training_data.from_hdf(file_name)
    assert training_data.num_variables == 10
    assert training_data.num_observations == 10
    npt.assert_array_equal(training_data.variable_names, variable_names)
    npt.assert_array_equal(training_data.unique_variable_prefixes, ['lst', 'snow', 'ndvi'])
    npt.assert_array_equal(training_data.variable_type_indices, [2, 4, 9])
    assert training_data.name == 'penguin'
    npt.assert_array_equal(training_data.meta_layers['years'], np.array(years))
    npt.assert_array_equal(training_data.predictors, dat[:, :-1])
    npt.assert_array_equal(training_data.response, dat[:, -1])
    npt.assert_array_equal(training_data.design_matrix.dat, dat)
    assert training_data.attributes['penguin'] == 'yes'
    assert training_data.attributes['tophat'] == 'no'

def change_basis(training_data_path, feature_file_path, output_file_path):
    """Build a basis from the features listed in feature_file_path and write it to
    output_file_path as HDF."""
    training_data = ld.LearningData()
    training_data.from_file(training_data_path)
    pset = experiment.get_pset(training_data.num_variables, training_data.variable_type_indices,
                               training_data.variable_names, training_data.variable_dict)
    pset.addPrimitive(numpy.power, 2)
    predictors_transformed, response_transformed = experiment.transform_features(
        training_data.predictors, training_data.response)[0:2]
    validation_toolbox = experiment.get_validation_toolbox(predictors_transformed,
                                                           response_transformed, pset)
    features = get_features(feature_file_path, training_data.unique_variable_prefixes, pset,
                            validation_toolbox)
    basis, feature_names = build_basis_and_feature_names(features, training_data.num_observations,
                                                         validation_toolbox, response_transformed)
    basis_data = ld.LearningData()
    basis_data.from_data(basis, feature_names, output_file_path)
    basis_data.to_hdf(output_file_path)

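# Hypothetical usage sketch, not part of the original module: the three paths are
# placeholder values; change_basis loads the training data, evaluates the listed
# features, and writes the resulting basis to the output HDF file.
def example_change_basis_usage():
    change_basis('data/training.hdf', 'results/features.txt', 'results/basis.hdf')
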
def test_good_values(self):
    # Data containing inf/nan entries should come back with finite predictors only.
    dat = np.random.rand(10, 11)
    dat[1, 2] = np.inf
    dat[5, 3] = -np.inf
    dat[7, 9] = np.nan
    dat[9, 3] = np.inf
    learning_data = ld.LearningData()
    learning_data.from_data(dat, None, 'penguin')
    assert not np.isnan(learning_data.predictors).any()
    assert np.isfinite(learning_data.predictors).all()

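# Illustrative sketch only, an assumption rather than LearningData's actual cleaning
# logic: one way to obtain the finite-only behaviour that test_good_values checks is
# to drop any row containing nan or +/-inf before building the design matrix.
def example_drop_nonfinite_rows(dat):
    finite_rows = np.isfinite(dat).all(axis=1)  # boolean mask of rows with no nan/inf
    return dat[finite_rows]
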
def build_random_training_data(shape):
    training_data = learning_data.LearningData()
    training_data.num_observations = shape[0]
    training_data.num_variables = shape[1]
    training_data.predictors = numpy.random.rand(training_data.num_observations,
                                                 training_data.num_variables)
    training_data.response = numpy.random.rand(training_data.num_observations, 1)
    training_data.variable_names = utilities.design_matrix.get_simple_variable_names(
        training_data.num_variables)
    training_data.variable_type_indices = \
        learning_data.get_default_variable_type_indices(training_data.num_variables)
    training_data.variable_dict = utilities.learning_data.get_simple_variable_dict(
        training_data.num_variables)
    return training_data

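# Hypothetical usage sketch: the (100, 5) shape is an arbitrary example, not a value
# taken from the original tests.
def example_build_random_training_data():
    training_data = build_random_training_data((100, 5))
    assert training_data.predictors.shape == (100, 5)
    assert training_data.response.shape == (100, 1)
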
    most_frequent_terms = sorted(term_frequency, key=term_frequency.get, reverse=True)
    logging.info("Term : Frequency : Coefficients")
    for term in most_frequent_terms:
        logging.info("{} : {} : {}".format(term, term_frequency[term], term_coefficients[term]))
    term_counts = [term_frequency[term] for term in most_frequent_terms]
    mean_term_performances = [numpy.mean(term_performances[term]) for term in most_frequent_terms]
    return most_frequent_terms, term_counts, mean_term_performances

training_data = learning_data.LearningData()
training_data.from_file(args.training)
pset = experiment.get_pset(training_data.num_variables, training_data.variable_type_indices,
                           training_data.variable_names, training_data.variable_dict)
predictors_transformed, response_transformed = experiment.transform_features(
    training_data.predictors, training_data.response)[0:2]
pareto_files = lib.get_pareto_files(args.results, args.experiment, training_data.name)
logging.info("Number of pareto files = {}".format(len(pareto_files)))
validation_toolbox = experiment.get_validation_toolbox(predictors_transformed,
                                                       response_transformed, pset)
front = gp_processing_tools.validate_pareto_optimal_inds(pareto_files, pset=pset,
                                                         toolbox=validation_toolbox)

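# Illustrative sketch with toy data, not from the source: the
# sorted(..., key=dict.get, reverse=True) idiom used above orders term names from
# most to least frequent.
def example_sort_terms_by_frequency():
    term_frequency = {'lst1': 3, 'ndvi2': 7, 'snow1': 5}
    ordered = sorted(term_frequency, key=term_frequency.get, reverse=True)
    assert ordered == ['ndvi2', 'snow1', 'lst1']
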
def build_hdf(file_name, matrix, years, names):
    learning_data = ld.LearningData()
    learning_data.from_data(matrix, names, 'none')
    learning_data.meta_layers['years'] = years
    learning_data.attributes = get_hdf_attributes()
    learning_data.to_hdf(file_name)

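# Hypothetical usage sketch: the matrix, years, and file name below are placeholder
# values, and numpy is assumed to be available as np in this module.
def example_build_hdf():
    matrix = np.random.rand(10, 4)
    years = np.arange(2000, 2010)
    names = ['lst1', 'snow1', 'ndvi1', 'ndvi2']
    build_hdf('example.hdf', matrix, years, names)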