if middle_eval:  # assumed flag name; the excerpt is cut off before this branch
	X_train, X_test, Y_train, Y_test = preprocess_data_holdout_middle(dataset)
elif true_middle_eval:
	X_train, X_test, Y_train, Y_test = preprocess_data_holdout_true_middle(dataset)
else:
	print("Unable to find specified testing set. Terminating early.")
	sys.exit()

slack_values = dataset.loc[:, "κ"]

slack_test_values = dataset.loc[X_test.index, "κ"]
slack_train_values = dataset.loc[X_train.index, "κ"]

holdout_data = pd.concat([X_test, slack_test_values, Y_test], axis=1)
training_data = pd.concat([X_train, slack_train_values, Y_train], axis=1)
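# The features, slack values, and targets are recombined above so that each
# split can be inspected or saved as a single table.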

sr = SymbolicRegression(ngen=500, pop_size=2000, max_height=7, mutation_probability=0.3, crossover_probability=0.7)

# K-fold cross validation with 5 splits
kf = KFold(n_splits=5)

scores = []
scores_on_holdout_set = []
slack_scores = []

# Loop over the K-fold splits, training and evaluating the model on each
for train_index, test_index in kf.split(X_train):
	X_temp_train, X_temp_test = X_train.values[train_index], X_train.values[test_index]
	Y_temp_train, Y_temp_test = Y_train.values[train_index], Y_train.values[test_index]
	sr.fit(X_temp_train, Y_temp_train)
	score, slack_score = calc_score(sr, X_temp_test, Y_temp_test, test_index)
	scores.append(score)
	slack_scores.append(slack_score)
	scores_on_holdout_set.append(mean_absolute_error(Y_test, sr.predict(X_test.values)))
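# Added sketch: summarize the per-fold results after the loop; numpy (as np)
# is assumed to be imported in this excerpt's source file.
print("Mean CV score: " + str(np.mean(scores)))
print("Mean holdout MAE: " + str(np.mean(scores_on_holdout_set)))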
Example #2
training_data = LearningData()
training_data.from_file(args.data)
X_train, X_test, y_train, y_test = train_test_split(training_data.predictors,
                                                    training_data.response,
                                                    test_size=0.2,
                                                    shuffle=False)
print('Training examples: ' + str(X_train.shape[0]))
print('Testing examples: ' + str(X_test.shape[0]))
model = SymbolicRegression(
    experiment_class=experiment_class,
    variable_type_indices=training_data.variable_type_indices,
    variable_names=training_data.variable_names,
    variable_dict=training_data.variable_dict,
    num_features=training_data.num_variables,
    pop_size=100,
    ngen=1000,
    crossover_probability=.5,
    mutation_probability=.5,
    subset_proportion=.7,
    ensemble_size=1,
    seed=args.seed)
model.fit(X_train, y_train)
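# Note: the "validation" error below is computed on the training split; the
# held-out split is scored separately as the test error.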
validation_error = model.score(X_train, y_train)
test_error = model.score(X_test, y_test)
print('Model validation error: ' + str(validation_error))
print('Model test error: ' + str(test_error))
ident = experiment_name + '_' + training_data.name + '_'
model.save(args.model + '/' + ident + str(args.seed))
if args.output:
    with open(args.output + '/' + ident + 'validation.txt', 'a') as f:
        # The excerpt is cut off here; appending the validation error is the
        # apparent intent given the file name.
        f.write(str(validation_error) + '\n')
Example #3
slack_values = dataset.loc[:, "κ"]

# Empty train/test splits that preserve the columns of X
X_train = pd.DataFrame(columns=X.columns)
X_test = pd.DataFrame(columns=X.columns)
Y_train = pd.Series(name="κref.", dtype=float)
Y_test = pd.Series(name="κref.", dtype=float)

slack_train_values = pd.Series(dtype=float)
slack_test_values = pd.Series(dtype=float)

sr = SymbolicRegression(ngen=500,
                        pop_size=2000,
                        max_height=7,
                        mutation_probability=0.3,
                        crossover_probability=0.7)

# K-fold cross validation with 10 splits
kf = KFold(n_splits=10, shuffle=True)

scores = []
slack_scores = []

total_sum = 0

# Loop over the K-fold splits, training and evaluating the model on each
for train_index, test_index in kf.split(X):
    X_temp_train = X.iloc[train_index].values
    X_temp_test = X.iloc[test_index].values
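    # Added sketch: the excerpt is cut off here; fitting and scoring each fold
    # would mirror the first example (Y is assumed to hold the targets).
    Y_temp_train = Y.iloc[train_index].values
    Y_temp_test = Y.iloc[test_index].values
    sr.fit(X_temp_train, Y_temp_train)
    scores.append(sr.score(X_temp_test, Y_temp_test))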
Example #4
import numpy as np
import matplotlib.pyplot as plt

from fastsr.estimators.symbolic_regression import SymbolicRegression
from fastsr.containers.learning_data import LearningData
from fastsr.experiments.truncation_elite import TruncationElite
from experiments.truncation_elite_rt import TruncationEliteRT
import utils

best_num = 0
experiment_class, experiment_name = utils.get_experiment_class_and_name(TruncationEliteRT)
model = SymbolicRegression()
model.load('/home/cfusting/rtresults_1000_100/energy_lagged/' + experiment_name + '/saved_models/' +
           experiment_name + '_energy_lagged_3965.pkl')

training_data = LearningData()
#training_data.from_file('data/minimum.csv')
training_data.from_hdf('data/energy_lagged.hdf5')
experiment = experiment_class()
pset = experiment.get_pset(training_data.num_variables, training_data.variable_type_indices,
                           training_data.variable_names, training_data.variable_dict)
scoring_toolbox = experiment.get_scoring_toolbox(training_data.predictors, training_data.response, pset)
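# The primitive set and scoring toolbox rebuild the evaluation context the
# saved model was trained with, so its individuals can be re-scored.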
# Keep only the best individuals that were recorded in the genealogy
best_individuals = [ind for ind in model.best_individuals_
                    if hasattr(ind, 'history_index')]
best_genealogy = model.history_.getGenealogy(best_individuals[best_num])
errors = []


def populate_individuals(history_index):
Example #5
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

from fastsr.estimators.symbolic_regression import SymbolicRegression
from fastsr.containers.learning_data import LearningData

estimators = [('reduce_dim', PCA()), ('symbolic_regression', SymbolicRegression())]
pipe = Pipeline(estimators)

training_data = LearningData()
training_data.from_file('data/hour_simple_lagged.hdf5')
X_train, X_test, y_train, y_test = train_test_split(training_data.predictors, training_data.response, test_size=0.1,
                                                    shuffle=False)

pipe.fit(X_train, y_train)
print(pipe.score(X_train, y_train))

model = SymbolicRegression()
model.fit(X_train, y_train)
print(model.score(X_train, y_train))
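# Added sketch: compare the PCA pipeline and the plain model on the held-out
# split created above.
print(pipe.score(X_test, y_test))
print(model.score(X_test, y_test))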
Example #6

def get_ident(file):
    pattern = re.compile(r'(.+)_\d')
    match = pattern.search(file)
    return match.group(1)

ENSEMBLE_SIZES = [1, 5, 10, 20, 50]
files = [f for f in listdir(args.models) if isfile(join(args.models, f))]
files = [f for f in files if 'parameter' not in f]
training_data = LearningData()
training_data.from_file(args.data)
X_train, X_test, y_train, y_test = train_test_split(training_data.predictors, training_data.response,
                                                    test_size=0.2, shuffle=False)
with open(args.results + '/' + get_ident(files[0]) + '.csv', 'w') as results:
    header = 'Seed' + ',' + ','.join(['Ensemble' + str(x) for x in ENSEMBLE_SIZES])
    results.write(header)
    results.write('\n')
    for f in files:
        print('Loading file: ' + str(f))
        seed = get_seed(f)
        model = SymbolicRegression()
        model.load(args.models + '/' + f)
        test_errors = []
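        # Score the same saved model at several ensemble sizes; ensemble_size
        # is mutated in place before each evaluation.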
        for s in ENSEMBLE_SIZES:
            model.ensemble_size = s
            test_errors.append(str(model.score(X_test, y_test)))
        line = seed + ',' + ','.join(test_errors)
        results.write(line)
        results.write('\n')
Example #7
folder_removed = False

# Load the folders in the top_models_hybrid_cv folder and sort them in ascending numerical order
try:
    folders = [x[1] for x in os.walk(model_dir)][0]
    folders_int = sorted([int(x), x] for x in folders)
    folders = [pair[1] for pair in folders_int]
except IndexError:
    print(
        "There are no models currently saved in the top_models_hybrid_cv folder."
    )
else:
    for folder in folders:
        sr = SymbolicRegression()
        try:
            sr.load(model_dir + "/" + folder + "/model")
            equation = simplify(str(sr.best_individuals_[0]))

            # Check whether the equation already exists
            equation_exists = False
            for model in models:
                if model != [] and model[1] == equation:
                    equation_exists = True
                    break

            # If it already exists and the -d flag is set, delete it
            if equation_exists and delete_duplicates:
                shutil.rmtree(model_dir + "/" + folder)
                models.append([])
Example #8
import numpy as np

from fastsr.estimators.symbolic_regression import SymbolicRegression

from fastgp.algorithms.fast_evaluate import fast_numpy_evaluate
from fastgp.parametrized.simple_parametrized_terminals import get_node_semantics


def target(x):
    return x**3 + x**2 + x

X = np.linspace(-10, 10, 100, endpoint=True)
y = target(X)

sr = SymbolicRegression(seed=72066)
sr.fit(X, y)
score = sr.score(X, y)
print('Score: ' + str(score))
print('Best Individuals:')
sr.print_best_individuals()

# Gather every individual recorded in the evolutionary history that carries an
# error attribute, ordered from worst (largest error) to best.
history = sr.history_
population = [ind for ind in history.genealogy_history.values() if hasattr(ind, 'error')]
population.sort(key=lambda ind: ind.error, reverse=True)

# Reshape X into a 2-D column vector for per-individual evaluation
X = X.reshape((len(X), 1))
i = 1
previous_error = population[0]
unique_individuals = []
while i < len(population):