def test_kneighbors_regressor():
    """Chained transformer + precomputed regressor must match the plain regressor.

    Each neighbors transformer is put in a pipeline in front of a regressor
    configured with ``metric="precomputed"``; the pipeline's predictions are
    compared with those of the same regressor fitted directly on the data.
    """
    rng = np.random.RandomState(0)
    X_fit = 2 * rng.rand(40, 5) - 1
    X_query = 2 * rng.rand(40, 5) - 1
    targets = rng.rand(40, 1)

    n_neighbors = 12
    radius = 1.5
    # The "_wide" transformers precompute more neighbors than necessary, so
    # that a k-neighbors estimator can follow a radius-neighbors transformer
    # and vice-versa.
    factor = 2

    k_trans = KNeighborsTransformer(mode="distance", n_neighbors=n_neighbors)
    k_trans_wide = KNeighborsTransformer(
        mode="distance", n_neighbors=int(n_neighbors * factor))
    r_trans = RadiusNeighborsTransformer(mode="distance", radius=radius)
    r_trans_wide = RadiusNeighborsTransformer(
        mode="distance", radius=int(radius * factor))

    k_reg = KNeighborsRegressor(n_neighbors=n_neighbors)
    r_reg = RadiusNeighborsRegressor(radius=radius)

    pairs = (
        (k_trans, k_reg),
        (k_trans_wide, r_reg),
        (r_trans, r_reg),
        (r_trans_wide, k_reg),
    )
    for transformer, regressor in pairs:
        # Compare the chained version with the compact (direct) version.
        direct = clone(regressor)
        precomputed = clone(regressor)
        precomputed.set_params(metric="precomputed")
        pipeline = make_pipeline(clone(transformer), precomputed)

        chained_pred = pipeline.fit(X_fit, targets).predict(X_query)
        direct_pred = direct.fit(X_fit, targets).predict(X_query)
        assert_array_almost_equal(chained_pred, direct_pred)
def test_onnxruntime_knn_radius(self):
    """Compare a RadiusNeighborsRegressor with its ONNX conversion.

    The model is fitted on a small synthetic regression problem, converted
    with the 'cdist' optimisation and run through ``OnnxInference`` with the
    'onnxruntime1' runtime. When the ONNX output contains NaN values the
    test only emits a warning with every intermediate output and returns;
    otherwise the two predictions must agree to 4 decimals.
    """

    # NOTE: this helper used to declare a leading ``self`` parameter although
    # it is a plain nested function called without it, which silently shifted
    # every argument by one position (n=4, n_features=n_targets, ...).
    def _get_reg_data(n, n_features, n_targets, n_informative=10):
        X, y = make_regression(  # pylint: disable=W0632
            n, n_features=n_features, random_state=0,
            n_targets=n_targets, n_informative=n_informative)
        return X, y

    def _fit_model(model, n_targets=1, label_int=False, n_informative=10):
        X, y = _get_reg_data(20, 4, n_targets, n_informative)
        if label_int:
            y = y.astype(numpy.int64)
        model.fit(X, y)
        return model, X

    model, X = _fit_model(RadiusNeighborsRegressor())
    model_onnx = to_onnx(
        model, X[:1].astype(numpy.float32),
        target_opset=TARGET_OPSET,
        options={id(model): {'optim': 'cdist'}})
    oinf = OnnxInference(model_onnx, runtime='onnxruntime1')

    X = X[:7]
    got = oinf.run({'X': X.astype(numpy.float32)})['variable']
    exp = model.predict(X.astype(numpy.float32))

    if any(numpy.isnan(got.ravel())):
        # The model is unexpectedly producing nan values sometimes.
        res = oinf.run({'X': X.astype(numpy.float32)}, intermediate=True)
        rows = ['--EXP--', str(exp), '--GOT--', str(got),
                '--EVERY-OUTPUT--']
        for k, v in res.items():
            rows.append('-%s-' % k)
            rows.append(str(v))
        if any(map(numpy.isnan, res["variable"].ravel())):
            # raise AssertionError('\n'.join(rows))
            warnings.warn("Unexpected NaN values\n" + '\n'.join(rows))
            return
        # onnxruntime and mlprodict do not return the same output
        warnings.warn('\n'.join(rows))
        return

    self.assertEqualArray(exp, got, decimal=4)
def compare_multiple_stacks(folder):
    """Plot, per colour channel, the mean-vs-sigma-clipped-mean trend of every stack.

    Every sub-folder of ``folder`` is loaded with ``load_images``. For each of
    the first three channels and each stack, the relative difference between
    the plain mean image and the sigma-clipped mean image (scaled by 1e3) is
    smoothed against pixel brightness with a uniform radius-neighbors
    regressor, and one curve per stack is drawn. One figure is shown per
    channel.
    """
    stacks = np.array([
        load_images(os.path.join(folder, name))
        for name in tqdm.tqdm(os.listdir(folder))
    ])
    print(stacks.shape)

    stride = 1  # pixel subsampling step; 1 keeps every pixel
    for channel in range(3):
        for stack_index in range(stacks.shape[0]):
            frames = stacks[stack_index][:, :, :, channel]

            mean_img = np.mean(frames, axis=0)
            clipped = astropy.stats.sigma_clip(frames, sigma=2, axis=0)
            clipped_mean_img = np.mean(clipped, axis=0)
            ratio_img = (mean_img / clipped_mean_img - 1) * 1E3

            ratios = ratio_img.flatten()[::stride]
            brightness = mean_img.flatten()[::stride]
            # plt.scatter(brightness, ratios, alpha=0.1, color='black', s=1)

            smoother = RadiusNeighborsRegressor(radius=50, weights='uniform')
            smoother.fit(np.expand_dims(brightness, axis=1), ratios.flatten())

            # Evaluate away from both ends of the brightness range.
            lower = np.min(brightness) + 200
            upper = np.max(brightness) + 1 - 200
            x = np.arange(lower, upper, 10)
            trend = smoother.predict(np.expand_dims(x, axis=1))
            plt.plot(x, trend, label=str(stack_index))

        plt.legend()
        plt.grid(True)
        plt.show()
def test_model_knn_regressor_radius(self):
    """Convert a RadiusNeighborsRegressor with the 'cdist' option and check it.

    If the converted model yields NaN values, every intermediate node output
    is collected; the test then returns silently for onnxruntime < 1.4.0 and
    raises an AssertionError with the collected outputs otherwise.
    """
    model, X = self._fit_model(RadiusNeighborsRegressor())
    conversion_options = {id(model): {'optim': 'cdist'}}
    model_onnx = convert_sklearn(
        model, "KNN regressor",
        [("input", FloatTensorType([None, 4]))],
        target_opset=TARGET_OPSET,
        options=conversion_options)

    sess = InferenceSession(model_onnx.SerializeToString())
    got = sess.run(None, {'input': X.astype(numpy.float32)})[0]
    exp = model.predict(X.astype(numpy.float32))

    if any(numpy.isnan(got.ravel())):
        # The model is unexpectedly producing nan values
        # not on all platforms.
        rows = ['--EXP--', str(exp), '--GOT--', str(got)]
        rows.append('--EVERY-OUTPUT--')
        for out in enumerate_model_node_outputs(model_onnx, add_node=False):
            partial_model = select_model_inputs_outputs(model_onnx, out)
            sess = InferenceSession(partial_model.SerializeToString())
            res = sess.run(None, {'input': X.astype(numpy.float32)})
            rows.extend(['--{}--'.format(out), str(res)])
        runtime_version = StrictVersion(onnxruntime.__version__)
        if runtime_version < StrictVersion("1.4.0"):
            return
        raise AssertionError('\n'.join(rows))

    self.assertIsNotNone(model_onnx)
    for inputs in (X, X + 0.1):
        dump_data_and_model(
            inputs.astype(numpy.float32)[:7], model, model_onnx,
            basename="SklearnRadiusNeighborsRegressor")
def make_atmospheric_pressure_model(df):
    """Fit and plot a radius-neighbors model for the "pres" target.

    The dataset is obtained from ``load_ds(df, "pres")``; the model is a
    pipeline of whitening PCA, standard scaling and a
    ``RadiusNeighborsRegressor`` with radius 0.014.

    Returns the fitted pipeline, the dataset and the test-set predictions.
    """
    ds = load_ds(df, "pres")
    train_features, test_features, train_target, _ = ds

    # Build & fit the model
    steps = (
        PCA(whiten=True),
        StandardScaler(),
        RadiusNeighborsRegressor(radius=0.014),
    )
    model = make_pipeline(*steps)
    model.fit(train_features, train_target)

    y_pred = model.predict(test_features)
    plot_regression(
        "atmospheric_pressure",
        "Atmospheric pressure (Pa)",
        dataset=ds,
        y_pred=y_pred,
    )
    return model, ds, y_pred
def compare_error_vs_brightness(folder):
    """Show, per channel, how the mean / sigma-clipped-mean ratio varies with brightness.

    For every channel of the image stack loaded from ``folder`` this draws:
    two red bounding curves derived from ``bit_flip_change`` (128 for
    channel 1, 256 otherwise), a scatter of the per-pixel ratio (scaled by
    1e3) against the mean brightness, and a green distance-weighted
    radius-neighbors regression through the scatter.
    """
    data = load_images(folder)
    n_channels = data.shape[3]

    for channel in range(n_channels):
        frames = data[:, :, :, channel]
        n_frames = frames.shape[0]

        mean_img = np.mean(frames, axis=0)
        clipped = astropy.stats.sigma_clip(frames, sigma=2, axis=0)
        clipped_mean_img = np.mean(clipped, axis=0)
        ratio_img = (mean_img / clipped_mean_img - 1) * 1E3

        # Red envelope: ratio change caused by +/- bit_flip_change on the sum.
        x = np.arange(np.min(mean_img), np.max(mean_img) + 1)
        if channel == 1:
            bit_flip_change = 128
        else:
            bit_flip_change = 256
        total = n_frames * x
        y_top = (total / (total - bit_flip_change) - 1) * 1E3
        y_bottom = (total / (total + bit_flip_change) - 1) * 1E3
        plt.plot(x, y_top, 'r')
        plt.plot(x, y_bottom, 'r')

        brightness = mean_img.flatten()
        ratios = ratio_img.flatten()
        plt.scatter(brightness, ratios, alpha=0.1, color='black', s=1)

        smoother = RadiusNeighborsRegressor(radius=50, weights='distance')
        smoother.fit(np.expand_dims(brightness, axis=1), ratios)

        x = np.arange(np.min(mean_img), np.max(mean_img) + 1)
        trend = smoother.predict(np.expand_dims(x, axis=1))
        plt.plot(x, trend, 'g')

        plt.grid(True)
        plt.show()
def make_wind_speed_model(df):
    """Fit and plot a radius-neighbors model for the "ff" target.

    The dataset is obtained from ``load_ds(df, "ff")``; the model is a
    pipeline of whitening PCA, standard scaling and a
    ``RadiusNeighborsRegressor`` with radius 0.02.

    Returns the fitted pipeline, the dataset and the test-set predictions.
    """
    ds = load_ds(df, "ff")
    train_features, test_features, train_target, _ = ds

    # Build & fit the model
    steps = (
        PCA(whiten=True),
        StandardScaler(),
        RadiusNeighborsRegressor(radius=0.02),
    )
    model = make_pipeline(*steps)
    model.fit(train_features, train_target)

    y_pred = model.predict(test_features)
    plot_regression(
        "wind_speed",
        "Average wind speed 10 min (m/s)",
        dataset=ds,
        y_pred=y_pred,
    )
    return model, ds, y_pred
def modelBuild(self, train_x, train_y, selected_model='NN'):
    """Build the selected regressor, store it on ``self.my_model`` and fit it.

    Parameters
    ----------
    train_x : training features, passed unchanged to ``fit``.
    train_y : training targets, passed unchanged to ``fit``.
    selected_model : str
        One of 'NN' (MLPRegressor), 'DT' (DecisionTreeRegressor),
        'SVM' (SVR), 'KNN' (KNeighborsRegressor, 2 neighbors) or
        'RNN' (RadiusNeighborsRegressor, radius 1.0).

    Any other value prints a message and returns without touching
    ``self.my_model``. Returns ``None`` in every case.
    """
    if selected_model == 'NN':
        self.my_model = MLPRegressor(hidden_layer_sizes=(50,),
                                     activation='relu',
                                     solver='adam',
                                     alpha=0.0001,
                                     batch_size='auto',
                                     learning_rate='adaptive',
                                     learning_rate_init=0.001)
    elif selected_model == 'DT':
        self.my_model = tree.DecisionTreeRegressor()
    elif selected_model == 'SVM':
        self.my_model = svm.SVR()
    elif selected_model == 'KNN':
        # Was ``==`` (a comparison), which never assigned the model.
        self.my_model = KNeighborsRegressor(n_neighbors=2)
    elif selected_model == 'RNN':
        # Was ``==`` (a comparison), which never assigned the model.
        self.my_model = RadiusNeighborsRegressor(radius=1.0)
    else:
        # Single-argument print() behaves the same on Python 2 and 3.
        print("this model can not be built")
        return
    self.my_model.fit(train_x, train_y)
def test_model_knn_regressor2_1_radius(self):
    """Convert a brute-force, two-target RadiusNeighborsRegressor and check it.

    If the converted model yields NaN values, every intermediate node output
    is collected; the test then returns silently for onnxruntime < 1.4.0 and
    raises an AssertionError with the collected outputs otherwise. Without
    NaN values the predictions must agree to 5 decimals.
    """
    model, X = self._fit_model_simple(
        RadiusNeighborsRegressor(algorithm="brute"), n_targets=2)
    input_types = [("input", FloatTensorType([None, X.shape[1]]))]
    model_onnx = convert_sklearn(
        model, "KNN regressor", input_types, target_opset=TARGET_OPSET)
    self.assertIsNotNone(model_onnx)

    sess = InferenceSession(model_onnx.SerializeToString())
    got = sess.run(None, {'input': X.astype(numpy.float32)})[0]
    exp = model.predict(X.astype(numpy.float32))

    if any(numpy.isnan(got.ravel())):
        # The model is unexpectedly producing nan values
        # not on all platforms.
        # It happens when two matrices are multiplied,
        # one is (2, 20, 20), second is (20, 20)
        # and contains only 0 or 1 values.
        # The output contains nan values on the first row
        # but not on the second one.
        rows = ['--EXP--', str(exp), '--GOT--', str(got)]
        rows.append('--EVERY-OUTPUT--')
        for out in enumerate_model_node_outputs(model_onnx, add_node=False):
            partial_model = select_model_inputs_outputs(model_onnx, out)
            sess = InferenceSession(partial_model.SerializeToString())
            res = sess.run(None, {'input': X.astype(numpy.float32)})
            rows.extend(['--{}--'.format(out), str(res)])
        runtime_version = StrictVersion(onnxruntime.__version__)
        if runtime_version < StrictVersion("1.4.0"):
            return
        raise AssertionError('\n'.join(rows))

    assert_almost_equal(exp, got, decimal=5)
def grid_points_2d(mesh, cell_size=10):
    """Interpolate mesh elevations onto a voxel grid and return its surface.

    A grid is built from ``mesh`` with ``vtk_Voxel.from_mesh``. The third
    coordinate of the mesh points is regressed on their first two coordinates
    with a distance-weighted ``RadiusNeighborsRegressor`` and evaluated at the
    cell centers. The search radius starts at ``0.75 * cell_size`` and grows
    by a factor of 1.5 until every cell has a value; cells that already have
    a value keep it.

    Parameters
    ----------
    mesh : object exposing ``points`` as an (n, >=3) array.
    cell_size : number, passed to ``vtk_Voxel.from_mesh`` and used to seed
        the search radius.

    Returns
    -------
    The extracted grid surface with the third coordinate of its points
    replaced by the 'Elevation' point array.
    """
    grid = vtk_Voxel.from_mesh(mesh, cell_size, 2)
    cells = grid.cell_centers().points
    radius = cell_size * 0.5
    tmat = np.full(cells.shape[0], np.nan)
    print("sample min", np.min(mesh.points[:, 2]),
          "max", np.max(mesh.points[:, 2]))

    while np.any(np.isnan(tmat)):
        # keep increasing radius until all cells have values
        radius *= 1.5
        print("RadiusNeighborsRegressor =", radius, "m")
        # ``weights`` is keyword-only in current scikit-learn releases, so
        # both arguments are passed by name.
        neigh = RadiusNeighborsRegressor(radius=radius, weights='distance')
        neigh.fit(mesh.points[:, :2], mesh.points[:, 2])
        rmat = neigh.predict(cells[:, :2])
        # Only fill the cells that are still missing a value.
        np.putmask(tmat, np.isnan(tmat), rmat)

    print("regression min", np.min(tmat), "max", np.max(tmat))
    grid.cell_arrays['Elevation'] = tmat
    surf = grid.extract_surface()
    # NOTE(review): ctp() presumably converts cell data to point data, since
    # 'Elevation' is read back from point_arrays below -- confirm.
    surf = surf.ctp()
    surf.points[:, 2] = surf.point_arrays['Elevation']
    return surf
def powerproduction():
    """Predict power output for the wind speed posted in the request form.

    Only POST requests are handled (anything else falls through and returns
    ``None``). The 'speed' form field is converted to float, the model is
    fitted on the non-zero-power rows of ``powerproduction.csv`` and the
    prediction is returned as ``{'value': <prediction>}``.
    """
    if fl.request.method != "POST":
        return None

    speed = float(fl.request.form['speed'])
    # speed = requests.get(data['input_s'])

    # Load the data, drop rows with zero power and order by speed.
    frame = pd.read_csv("powerproduction.csv")
    frame = frame[frame.power != 0].sort_values('speed')

    # Column vectors as numpy arrays for the regressor.
    speeds = frame['speed'].to_numpy()
    powers = frame['power'].to_numpy()

    neigh_radius = RadiusNeighborsRegressor(radius=1.7, weights='distance', p=2)
    neigh_radius.fit(speeds.reshape(-1, 1), powers)

    prediction = neigh_radius.predict([[speed]])
    return {'value': prediction[0]}
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor, AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
# LinearRegression is used in ``mult_regeressors`` below and was missing
# from this import.
from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDRegressor, BayesianRidge
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
# Must be imported before HistGradientBoostingRegressor on scikit-learn
# versions where that estimator is still experimental.
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

# Tree-based and boosting regressors, keyed by a display name.
tree_regressors = {
    "Decision_tree_regressor": DecisionTreeRegressor(),
    "AdaBoost_regressor": AdaBoostRegressor(),
    "Extra_trees_regressor": ExtraTreesRegressor(),
    "Random_forest_regressor": RandomForestRegressor(),  # Takes 55 seconds
    "GBM_regressor": GradientBoostingRegressor(),  # Takes forever
    "HGB_regressor": HistGradientBoostingRegressor(),
    "CATBoost_regressor": CatBoostRegressor(verbose=0),
    "lightgbm_regressor": LGBMRegressor(),
}

# Remaining (linear, kernel, neural and neighbors) regressors, keyed by a
# display name.
mult_regeressors = {
    "Linear_regression": LinearRegression(),  # Don't use: results were awful
    "Ridge_regressor": Ridge(),
    "SVM_regressor": SVR(),  # Takes 150 seconds
    "MLP_regressor": MLPRegressor(),
    "SGD_regressor": SGDRegressor(),
    "KNN_regressor": KNeighborsRegressor(),
    "BR_regressor": BayesianRidge(),
    "RNN_regressor": RadiusNeighborsRegressor(),  # Predicts NaNs
}
# Available optimisation on this machine. print(code_optimisation()) ############################## # Building the model # ++++++++++++++++++ filename = "onnx_to_profile.onnx" if not os.path.exists(filename): print(f"Generate a graph for {filename!r}.") X = numpy.random.randn(1000, 10).astype(numpy.float64) y = X.sum(axis=1).reshape((-1, 1)) model = RadiusNeighborsRegressor() model.fit(X, y) onx = to_onnx(model, X, options={'optim': 'cdist'}) with open(filename, "wb") as f: f.write(onx.SerializeToString()) ##################################### # Functions # +++++++++ # # We need to generate random inputs to test the graph. def random_input(typ, shape, batch): if typ == 'tensor(double)':
def process_data(data):
    """Match reco voxels to true voxels and build energy regression targets.

    Parameters
    ----------
    data : dict with the keys 'input_true', 'input_reco', 'segment_label'
        and 'group_label'. Each value is a 2-D array whose first three
        columns are voxel coordinates, whose second-to-last column is the
        batch id and whose last column is the value (energy, charge,
        segment class or group id respectively).

    Returns
    -------
    ``None`` when no reco voxel coincides with a true voxel; otherwise a
    tuple ``(out, groups)`` where ``out`` holds, per matched voxel, the reco
    row followed by a 5-way one-hot segment class and the (optionally
    adjusted) true energy, and ``groups`` is the group id of each matched
    voxel.

    The module-level flags ``ADD_MISSING_ENERGY`` and ``BLUR_ENERGY`` control
    whether energy of unmatched true voxels is added to the nearest matched
    voxel, and whether energies are smoothed within each group.
    """
    input_true = data['input_true']
    input_reco = data['input_reco']
    segment_label = data['segment_label']
    group_label = data['group_label']

    chosen_indices = []
    chosen_reco_indices = []

    current_batch = 0
    current_batch_selection = np.where(input_true[:, -2] == current_batch)[0]
    current_input_true = input_true[current_batch_selection]
    for r in range(len(input_reco)):
        row = input_reco[r]
        b = row[-2]
        if b != current_batch:
            current_batch = b
            current_batch_selection = np.where(
                input_true[:, -2] == current_batch)[0]
            # Refresh the per-batch truth as well; it used to stay on the
            # first batch, so later batches were matched against the wrong
            # voxels.
            current_input_true = input_true[current_batch_selection]
        pos = row[:3]
        # Pre-filter on x and y before the exact 3-coordinate comparison.
        region_selection = np.where(
            (current_input_true[:, 0] == pos[0])
            & (current_input_true[:, 1] == pos[1]))[0]
        input_true_region = current_input_true[region_selection]
        for i in range(len(input_true_region)):
            pos2 = input_true_region[i][:3]
            if np.array_equal(pos, pos2):
                chosen_indices.append(
                    current_batch_selection[region_selection[i]])
                chosen_reco_indices.append(r)
                break

    if len(chosen_indices) == 0:
        return None
    chosen_indices = np.array(chosen_indices)
    chosen_reco_indices = np.array(chosen_reco_indices)

    lost_data = np.delete(input_true, chosen_indices, axis=0)
    found_data = input_true[chosen_indices]

    # find where the chosen indices are in the group data
    # (``.shape`` keeps this working when every true voxel was matched and
    # ``lost_data`` has no rows)
    lost_group_data = -np.ones(lost_data.shape)
    ungrouped_data = -np.ones(lost_data.shape)
    found_group_data = -np.ones(found_data.shape)

    for i in range(len(lost_data)):
        row = lost_data[i]
        matches = group_label[
            (group_label[:, -2] == row[-2])
            & (group_label[:, 0] == row[0])
            & (group_label[:, 1] == row[1])
            & (group_label[:, 2] == row[2])]
        if len(matches) == 0:
            ungrouped_data[i] = row
        else:
            lost_group_data[i] = matches[0]

    for i in range(len(found_data)):
        row = found_data[i]
        matches = group_label[
            (group_label[:, -2] == row[-2])
            & (group_label[:, 0] == row[0])
            & (group_label[:, 1] == row[1])
            & (group_label[:, 2] == row[2])]
        found_group_data[i] = matches[0]

    # lost_group_data = np.delete(group_label, chosen_indices, axis=0)
    # found_group_data = group_label[chosen_indices]

    if ADD_MISSING_ENERGY:
        batches = np.unique(input_true[:, 3])
        for b in batches:
            # nearest neighbor assignment within group
            found_groups = np.unique(
                found_group_data[np.where(found_group_data[:, 3] == b)][:, -1])
            lost_batch_mask = lost_group_data[:, 3] == b
            found_batch_mask = found_group_data[:, 3] == b
            for g in found_groups:
                lost_selection = np.where(
                    lost_batch_mask & (lost_group_data[:, -1] == g))[0]
                found_selection = np.where(
                    found_batch_mask & (found_group_data[:, -1] == g))[0]
                ldata = lost_data[lost_selection]
                fdata = found_data[found_selection]
                distances = distance_matrix(ldata[:, :3], fdata[:, :3])
                closest_points = np.argmin(distances, axis=1)
                closest_energies = ldata[:, -1]
                for i in range(len(closest_points)):
                    found_data[found_selection[
                        closest_points[i]]][-1] += closest_energies[i]

            # associated ungrouped voxels with nearest voxels, regardless of group
            lost_ungrouped = np.where((ungrouped_data[:, 3] == b))[0]
            if len(lost_ungrouped) > 0:
                found_selection = np.where(found_batch_mask)[0]
                ldata = lost_data[lost_ungrouped]
                fdata = found_data[found_selection]
                distances = distance_matrix(ldata[:, :3], fdata[:, :3])
                closest_points = np.argmin(distances, axis=1)
                closest_energies = ldata[:, -1]
                for i in range(len(closest_points)):
                    found_data[found_selection[
                        closest_points[i]]][-1] += closest_energies[i]

    if BLUR_ENERGY:
        blur_kernel = 3
        for g in np.unique(found_group_data[:, -1]):
            inds = np.where(found_group_data[:, -1] == g)
            selection = found_data[inds]
            total_energy = np.sum(selection[:, -1])
            coords = selection[:, :3]
            energies = selection[:, -1]
            neigh = RadiusNeighborsRegressor(radius=blur_kernel)
            neigh.fit(coords, energies)
            selection[:, -1] = neigh.predict(coords)
            # Rescale so the group's total energy is unchanged by the blur.
            selection[:, -1] *= total_energy / np.sum(selection[:, -1])
            found_data[inds, -1] = selection[:, -1]

    segment_indices = segment_label[chosen_indices, -1].astype(int)
    segment_one_hot = np.zeros((len(segment_indices), 5))
    segment_one_hot[np.arange(len(segment_indices)), segment_indices] = 1

    out = np.concatenate((input_reco[chosen_reco_indices],
                          segment_one_hot,
                          np.expand_dims(found_data[:, -1], axis=1)), axis=1)

    return np.array(out), found_group_data[:, -1]
# Cap the targets at 10,000,000. The previous index loop stopped at
# len(y) - 2 and therefore never capped the last element.
y[y > 10000000] = 10000000

### RadiusNeighborsRegressor ###
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.preprocessing import StandardScaler

kf = KFold(len(y), n_folds=15, shuffle=True)
y_pred = np.zeros(len(y), dtype=y.dtype)  # where we'll accumulate predictions
clf = RadiusNeighborsRegressor(radius=15)

# CV Loop
for train_index, test_index in kf:
    # for each iteration of the for loop we'll do a test train split
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # The scaler is fitted on the training fold only and then applied to
    # the test fold.
    t = StandardScaler()
    X_train = t.fit_transform(X_train)
    clf.fit(X_train, y_train)  # Train clf on the training data
    X_test = t.transform(X_test)
model = LR(C=0.01, penalty='l1') from sklearn.linear_model import BayesianRidge as BR model = BR(alpha_1=1e2, alpha_2=3e2, lambda_1=1e-9, lambda_2=1e-9, compute_score=False) from sklearn.linear_model import (LinearRegression, Lasso, RandomizedLasso, Ridge) from sklearn.feature_selection import (RFE, f_regression) from sklearn.ensemble import RandomForestRegressor as rfr from sklearn.ensemble import AdaBoostRegressor from sklearn.neighbors import KNeighborsRegressor from sklearn.neighbors import RadiusNeighborsRegressor from sklearn.neighbors import NearestNeighbors model = RadiusNeighborsRegressor(radius=0.5, p=2) from sklearn.ensemble import RandomForestRegressor model=RandomForestRegressor(n_estimators=10,max_depth=8,\ min_samples_split=2) from sklearn.ensemble import AdaBoostRegressor model = AdaBoostRegressor(n_estimators=400) from sklearn.ensemble import GradientBoostingRegressor model=GradientBoostingRegressor(n_estimators=100,\ learning_rate=0.1,max_depth=10) from sklearn.ensemble import BaggingRegressor mb = model model=BaggingRegressor(base_estimator=mb,n_estimators=20,bootstrap=1,\ bootstrap_features=1,max_samples=0.3,max_features=0.3) model = LR(C=0.004) model = LR(C=0.01, penalty='l1') model=rfr(n_estimators=2000,max_depth=1,min_samples_leaf=20,\
'mse_train','mse_test','mae_train','mae_test', 'mdae_train','mdae_test'] reg=[linear_model.LinearRegression(), linear_model.Ridge(),linear_model.RidgeCV(), linear_model.Lasso(),linear_model.LassoLarsCV(), linear_model.RANSACRegressor(), linear_model.BayesianRidge(),linear_model.ARDRegression(), linear_model.HuberRegressor(),linear_model.TheilSenRegressor(), PLSRegression(),DecisionTreeRegressor(),ExtraTreeRegressor(), BaggingRegressor(),AdaBoostRegressor(), GradientBoostingRegressor(),RandomForestRegressor(), linear_model.PassiveAggressiveRegressor(max_iter=1000,tol=0.001), linear_model.ElasticNet(), linear_model.SGDRegressor(max_iter=1000,tol=0.001), svm.SVR(),KNeighborsRegressor(), RadiusNeighborsRegressor(radius=1.5),GaussianProcessRegressor()] list1reg=['LinearRegression','Ridge','RidgeCV', 'Lasso','LassoLarsCV','RANSACRegressor', 'BayesianRidge','ARDRegression','HuberRegressor', 'TheilSenRegressor','PLSRegression','DecisionTreeRegressor', 'ExtraTreeRegressor','BaggingRegressor','AdaBoostRegressor', 'GradientBoostingRegressor','RandomForestRegressor'] y1reg=[]; y7reg=[] for i in range(len(list1reg)): y1reg.append(regressor_fit_score(reg[i],list1reg[i],'Boston', X_train1,X_test1,y_train1,y_test1)[:2]) [[y_train101,y_test101],[y_train102,y_test102],[y_train103,y_test103], [y_train104,y_test104],[y_train105,y_test105],[y_train106,y_test106], [y_train107,y_test107],[y_train108,y_test108],[y_train109,y_test109], [y_train110,y_test110],[y_train111,y_test111],[y_train112,y_test112],
colour = sp.reshape(df.colour, (-1, 1)) #reshape the colour to a column vector for use in the algorithm designation = sp.array(df.designation.tolist()) temp = sp.array(df.teff.tolist()) """ possibly remove SVC, takes long time (~4 mins per fold) """ folds = 2 names = ['KNeighbours', 'Radius Neighbours', 'Random Forest Regressor', 'Linear Regression', 'Gaussian Process Regressor', 'Ada Boost Classifier'] classifiers = [KNeighborsRegressor(), RadiusNeighborsRegressor(), RandomForestRegressor(), LinearRegression(), GaussianProcessRegressor(), AdaBoostRegressor()] #load the random forest clssifier kf = cross_validation.KFold(n = len(colour), n_folds = folds, shuffle = True) #use kfolds to split the data final = [] MAD = [] for name, clf in zip(names, classifiers): ###importance = [] models = sp.array([[sp.nan]*len(temp)]*folds)
# Drop the intermediate globals that are no longer needed.
for _stale_name in ('profilesDF', 'profiles', 'profilesLSo', 'profilesLS',
                    'row', 'tmpLS', 'tmpAGE', 'profsTOlikes', 'i', 'tmpIND'):
    del globals()[_stale_name]
del _stale_name

# Seed numpy's global random state (np.random.seed returns None, so
# ``myRand`` is None).
seed = 7
myRand = np.random.seed(seed)

# Hold out 1500 samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    likesMAT, consARR, test_size=1500)

# The neighborhood radius is the first command-line argument.
myRAD = float(sys.argv[1])
radNN = RadiusNeighborsRegressor(radius=myRAD)
#radNN.fit(likesMAT, consARR)
radNN.fit(X_train, y_train)
y_pred = radNN.predict(X_test)

import math
myRMSE = math.sqrt(metrics.mean_squared_error(y_test, y_pred))
print("cons, Radius neighbors: ", str(myRAD), " ", myRMSE)

# joblib.dump(radNN, "/Users/jamster/radNN-A-cons.xz", compress=9)
# impRadNN = joblib.load("/Users/jamster/radNN-A-cons.xz")
def RNN_Build(self, train_x, train_y):
    """Create a radius-1.0 RadiusNeighborsRegressor on ``self.rneigh`` and fit it."""
    regressor = RadiusNeighborsRegressor(radius=1.0)
    self.rneigh = regressor
    regressor.fit(train_x, train_y)
linear_model.ARDRegression(), linear_model.HuberRegressor(max_iter=800), linear_model.TheilSenRegressor(max_iter=800), PLSRegression(), DecisionTreeRegressor(), ExtraTreeRegressor(), BaggingRegressor(), AdaBoostRegressor(), GradientBoostingRegressor(), RandomForestRegressor(), linear_model.PassiveAggressiveRegressor(max_iter=800, tol=.001), linear_model.ElasticNet(max_iter=800), linear_model.SGDRegressor(max_iter=800, tol=.001), svm.SVR(), KNeighborsRegressor(), RadiusNeighborsRegressor(radius=1.5), GaussianProcessRegressor() ] listreg = [ 'LinearRegression', 'Ridge', 'RidgeCV', 'Lasso', 'LassoLarsCV', 'RANSACRegressor', 'BayesianRidge', 'ARDRegression', 'HuberRegressor', 'TheilSenRegressor', 'PLSRegression', 'DecisionTreeRegressor', 'ExtraTreeRegressor', 'BaggingRegressor', 'AdaBoostRegressor', 'GradientBoostingRegressor', 'RandomForestRegressor' ] yreg = [] for i in range(len(listreg)): yreg.append( regressor_fit_score(reg[i], listreg[i], 'Boston', x_train, x_test, y_train, y_test)[:2])
def estimate_aba_ge(self, entrez_ids, coords=None, **kwargs):
    """
    Retrieve, estimate and store gene expression in ``self.ge`` for a list
    of NIH Entrez IDs.

    For every Entrez ID with registered probes, ``self.ge[entrez_id]`` gets
    one entry per probe plus a "mean" entry (average over probes). With
    ``coords=None`` the sampled expression values are stored directly;
    otherwise a RadiusNeighborsRegressor is fitted per probe (and for the
    mean) on the ABA sample coordinates and evaluated at ``coords``.
    Coordinates not reported as valid by
    ``_check_coords_for_distance_weighting`` are left as NaN.

    Parameters
    ----------
    entrez_ids: List-like
        list-like structure containing NIH Entrez IDs.
    coords: list-like, optional
        custom coordinates at which expression is estimated.
    kwargs : dict, optional
        OPTIONS:
            'rnn_args' : dict
                SKLearn RadiusNeighborsRegressor() optional arguments.
                http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.RadiusNeighborsRegressor.html
                for default arguments. 'radius' defaults to 5 and 'weights'
                to 'uniform'; a radius of 1 forces uniform weights.
            'z_score' : if present, z-score each probe and the mean.
            'store_coords' : if truthy, store ``coords`` alongside results.
            'coord_type' : label stored with the results (default 'Custom').
    """
    self._check_entrez_struct(entrez_ids)

    for entrez_id in entrez_ids:
        # Fetch probe IDs for Entrez ID
        probe_df = self._aba['probe_df']
        probe_ids = probe_df.loc[
            probe_df['entrez_id'] == entrez_id]['probe_id'].tolist()

        if len(probe_ids) == 0:
            # Single-argument print() behaves the same on Python 2 and 3.
            print('Entrez ID: %s not registered with ABA database' % entrez_id)
            continue

        # Return gene expression on given probes across sampled locations.
        exp_df = self._aba['exp_df']
        ge_df = exp_df.loc[exp_df['probe_id'].isin(probe_ids)]
        # ``.values`` replaces DataFrame.as_matrix(), which pandas removed;
        # the first column (probe id) is dropped.
        ge_mat = ge_df.values.astype(float)[:, 1:]

        # Take average gene expression across probes at a given sampled location.
        ge_vec = np.mean(ge_mat, axis=0)

        self.ge[entrez_id] = {}
        for probe in probe_ids:
            self.ge[entrez_id][probe] = {}
        self.ge[entrez_id]["mean"] = {}

        # z scoring method
        if 'z_score' in kwargs:
            for row in range(ge_mat.shape[0]):
                ge_mat[row] = (ge_mat[row] -
                               ge_mat[row].mean()) / ge_mat[row].std()
            ge_vec = (ge_vec - ge_vec.mean()) / ge_vec.std()

        if coords is None:
            for row, probe in enumerate(probe_ids):
                self.ge[entrez_id][probe]['GE'] = ge_mat[row]
            self.ge[entrez_id]["mean"]['GE'] = ge_vec
            self.ge[entrez_id]['coord_type'] = 'ABA'

        # Estimate gene expression at custom coordinates
        else:
            X = self._aba['mni_coords'].data
            y_mean = ge_vec

            # Resolve the regressor arguments before anything reads them.
            # The radius used to be read from kwargs before its default was
            # applied, so a missing 'rnn_args' or 'radius' raised KeyError.
            rnn_args = kwargs.get('rnn_args')
            if rnn_args is None:
                rnn_args = {'radius': 5, 'weights': 'uniform'}
            else:
                rnn_args.setdefault('radius', 5)
                # These two settings used to be written to kwargs['weights'],
                # which the regressor never saw.
                if rnn_args['radius'] == 1:
                    rnn_args['weights'] = 'uniform'
                rnn_args.setdefault('weights', 'uniform')
                # NOTE(review): this was guarded by ``'weights' != 'distance'``,
                # a comparison of two literals that is always true, so the
                # radius has always been stored; kept unconditional.
                self._gaussian_weight_radius = rnn_args['radius']

            valid_inds = self._check_coords_for_distance_weighting(
                coords=coords,
                check_radius=rnn_args['radius'],
                check_weights='distance',
                X=X,
                y_mean=y_mean)

            for row, probe in enumerate(probe_ids):
                self.ge[entrez_id][probe][
                    'classifier'] = RadiusNeighborsRegressor(**rnn_args)
            self.ge[entrez_id]["mean"][
                'classifier'] = RadiusNeighborsRegressor(**rnn_args)

            for row, probe in enumerate(probe_ids):
                self.ge[entrez_id][probe]['classifier'].fit(X, ge_mat[row])
            self.ge[entrez_id]["mean"]['classifier'].fit(X, y_mean)

            if 'store_coords' in kwargs:
                if kwargs['store_coords']:
                    self.ge[entrez_id]['coords'] = coords

            if 'coord_type' in kwargs:
                self.ge[entrez_id]['coord_type'] = kwargs['coord_type']
            else:
                self.ge[entrez_id]['coord_type'] = 'Custom'

            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                valid_coords = [coords[i] for i in valid_inds]

                for probe in list(probe_ids) + ["mean"]:
                    entry = self.ge[entrez_id][probe]
                    # Each entry gets its own NaN-filled array; one shared
                    # array used to be overwritten by every probe in turn.
                    entry["GE"] = np.full(len(coords), np.nan)
                    if len(valid_inds) > 0:
                        estimations = entry['classifier'].predict(
                            valid_coords)
                        # Write each estimate at its coordinate's position
                        # (the mean used to be written at 0..k-1 instead).
                        for ind, estimate in zip(valid_inds, estimations):
                            entry["GE"][ind] = estimate
def process_data(input_true, input_reco, segment_label, group_label):
    """Match reco voxels to true voxels and build energy regression targets.

    All arguments are 2-D arrays whose first three columns are voxel
    coordinates, whose second-to-last column is the batch id and whose last
    column is the value (the original note describes them as Nx5):
        input_true: energy depositions
        input_reco: charge depositions
        segment_label: fivetypes label
        group_label: particle instance

    The purpose is to find the M non-ghost reco voxels (those coinciding
    with a true voxel) and set target energies for them.

    Returns ``None`` when no reco voxel coincides with a true voxel,
    otherwise a tuple:
        element 0: per matched voxel, the input_reco row followed by a
            5-way one-hot segment class and the target energy
            (Mx11 for Nx5 inputs)
        element 1: [size M] group label of voxel
        element 2: [size M] indices in input_true of voxels that have been
            reconstructed
        element 3: input_true rows of the matched voxels, where the last
            element in each row is the (optionally adjusted) energy

    The module-level flags ``ADD_MISSING_ENERGY`` and ``BLUR_ENERGY`` control
    whether energy of unmatched true voxels is added to the nearest matched
    voxel, and whether energies are smoothed within each group.
    """
    chosen_indices = []
    chosen_reco_indices = []

    current_batch = 0
    current_batch_selection = np.where(input_true[:, -2] == current_batch)[0]
    current_input_true = input_true[current_batch_selection]
    for r in range(len(input_reco)):
        row = input_reco[r]
        b = row[-2]
        if b != current_batch:
            current_batch = b
            current_batch_selection = np.where(
                input_true[:, -2] == current_batch)[0]
            # Refresh the per-batch truth as well; it used to stay on the
            # first batch, so later batches were matched against the wrong
            # voxels.
            current_input_true = input_true[current_batch_selection]
        pos = row[:3]
        # Pre-filter on x and y before the exact 3-coordinate comparison.
        region_selection = np.where(
            (current_input_true[:, 0] == pos[0])
            & (current_input_true[:, 1] == pos[1]))[0]
        input_true_region = current_input_true[region_selection]
        for i in range(len(input_true_region)):
            pos2 = input_true_region[i][:3]
            if np.array_equal(pos, pos2):
                chosen_indices.append(
                    current_batch_selection[region_selection[i]])
                chosen_reco_indices.append(r)
                break

    if len(chosen_indices) == 0:
        return None
    chosen_indices = np.array(chosen_indices)
    chosen_reco_indices = np.array(chosen_reco_indices)

    lost_data = np.delete(input_true, chosen_indices, axis=0)
    found_data = input_true[chosen_indices]

    # find where the chosen indices are in the group data
    # (``.shape`` keeps this working when every true voxel was matched and
    # ``lost_data`` has no rows)
    lost_group_data = -np.ones(lost_data.shape)
    ungrouped_data = -np.ones(lost_data.shape)
    found_group_data = -np.ones(found_data.shape)

    for i in range(len(lost_data)):
        row = lost_data[i]
        matches = group_label[
            (group_label[:, -2] == row[-2])
            & (group_label[:, 0] == row[0])
            & (group_label[:, 1] == row[1])
            & (group_label[:, 2] == row[2])]
        if len(matches) == 0:
            ungrouped_data[i] = row
        else:
            lost_group_data[i] = matches[0]

    for i in range(len(found_data)):
        row = found_data[i]
        matches = group_label[
            (group_label[:, -2] == row[-2])
            & (group_label[:, 0] == row[0])
            & (group_label[:, 1] == row[1])
            & (group_label[:, 2] == row[2])]
        found_group_data[i] = matches[0]

    if ADD_MISSING_ENERGY:
        batches = np.unique(input_true[:, 3])
        for b in batches:
            # nearest neighbor assignment within group
            found_groups = np.unique(
                found_group_data[np.where(found_group_data[:, 3] == b)][:, -1])
            lost_batch_mask = lost_group_data[:, 3] == b
            found_batch_mask = found_group_data[:, 3] == b
            for g in found_groups:
                lost_selection = np.where(
                    lost_batch_mask & (lost_group_data[:, -1] == g))[0]
                found_selection = np.where(
                    found_batch_mask & (found_group_data[:, -1] == g))[0]
                ldata = lost_data[lost_selection]
                fdata = found_data[found_selection]
                distances = distance_matrix(ldata[:, :3], fdata[:, :3])
                closest_points = np.argmin(distances, axis=1)
                closest_energies = ldata[:, -1]
                for i in range(len(closest_points)):
                    found_data[found_selection[
                        closest_points[i]]][-1] += closest_energies[i]

            # associated ungrouped voxels with nearest voxels, regardless of group
            lost_ungrouped = np.where((ungrouped_data[:, 3] == b))[0]
            if len(lost_ungrouped) > 0:
                found_selection = np.where(found_batch_mask)[0]
                ldata = lost_data[lost_ungrouped]
                fdata = found_data[found_selection]
                distances = distance_matrix(ldata[:, :3], fdata[:, :3])
                closest_points = np.argmin(distances, axis=1)
                closest_energies = ldata[:, -1]
                for i in range(len(closest_points)):
                    found_data[found_selection[
                        closest_points[i]]][-1] += closest_energies[i]

    if BLUR_ENERGY:
        blur_kernel = 3
        for g in np.unique(found_group_data[:, -1]):
            inds = np.where(found_group_data[:, -1] == g)
            selection = found_data[inds]
            total_energy = np.sum(selection[:, -1])
            coords = selection[:, :3]
            energies = selection[:, -1]
            neigh = RadiusNeighborsRegressor(radius=blur_kernel)
            neigh.fit(coords, energies)
            selection[:, -1] = neigh.predict(coords)
            # Rescale so the group's total energy is unchanged by the blur.
            selection[:, -1] *= total_energy / np.sum(selection[:, -1])
            found_data[inds, -1] = selection[:, -1]

    segment_indices = segment_label[chosen_indices, -1].astype(int)
    segment_one_hot = np.zeros((len(segment_indices), 5))
    segment_one_hot[np.arange(len(segment_indices)), segment_indices] = 1

    out = np.concatenate((input_reco[chosen_reco_indices],
                          segment_one_hot,
                          np.expand_dims(found_data[:, -1], axis=1)), axis=1)

    return np.array(out), found_group_data[:, -1], chosen_indices, found_data
# Read training dataset
df = pd.read_csv(TRAINING_DATASET, header=None)  # read from the first line
columns = len(df.columns)
rows = len(df.index)
# A single formatted string prints identically on Python 2 and 3.
print('Training dataset: {:,} x {:,}'.format(len(df.index), len(df.columns)))

# The last column is the target and every other column is a feature.
# ``.ix`` was removed from pandas; with header=None the column labels are
# 0..columns-1, so these positional slices select the same columns the
# label-based ``.ix`` calls did.
df_y = df.iloc[:, columns - 1]
df_x = df.iloc[:, :columns - 1]
X = np.array(df_x)
Y = np.array(df_y)

neigh = RadiusNeighborsRegressor(radius=KNN_RADIUS)
neigh.fit(X, Y)

# Read Test dataset
testFiles = [name for name in os.listdir(TEST_DATASET_DIRECTORY)
             if 'test' in str(name)]
print('Number of test files: {}'.format(len(testFiles)))

TEST_Y_ALL = np.array([])
TEST_Y_ALL_PREDICTED = np.array([])
for file in testFiles:
    df = pd.read_csv(TEST_DATASET_DIRECTORY + '/' + file, header=None)  # read from the first line
    df_y = df.iloc[:, columns - 1]
    df_x = df.iloc[:, :columns - 1]
    X = np.array(df_x)
    Y = np.array(df_y)
def regress(X_train, y_train):
    """Grid-search every configured regressor and yield the fitted searches.

    For each entry of the local ``classifiers`` list a sample of the training
    data is drawn and a ``GridSearchCV`` is fitted over the matching entry of
    ``params_dict``.

    Relies on the module-level settings ``global_data_scale``,
    ``random_state``, ``verbose``, ``folds`` and ``workers``.

    Args:
        X_train: training features.
        y_train: training target; once joined to the features it must appear
            as a column named ``"loan_status"``.

    Yields:
        ``(grid, name)`` tuples: the fitted ``GridSearchCV`` and the name of
        the regressor it was fitted for.
    """
    # comment out any classifier that should not be used
    # each entry: (estimator, key into params_dict, fraction of rows to sample)
    classifiers = [
        (SGDRegressor(), "SGDRegressor", 1 * global_data_scale),
        (LinearRegression(), "LinearRegression", 1 * global_data_scale),
        (Ridge(), "Ridge", 1 * global_data_scale),
        (Lasso(), "Lasso", 1 * global_data_scale),
        (ElasticNet(), "ElasticNet", 1 * global_data_scale),
        (Lars(), "Lars", 1 * global_data_scale),
        (OrthogonalMatchingPursuit(), "OrthogonalMatchingPursuit", 1 * global_data_scale),
        (BayesianRidge(), "BayesianRidge", 1 * global_data_scale),
        (ARDRegression(), "ARDRegression", 1 * global_data_scale),
        ### NOTE the scoring might be different of PassiveAggressiveRegressor
        (PassiveAggressiveRegressor(), "PassiveAggressiveRegressor", 1 * global_data_scale),
        ### NOTE the scoring might be different of RANSACRegressor
        (RANSACRegressor(), "RANSACRegressor", 1 * global_data_scale),
        (TheilSenRegressor(), "TheilSenRegressor", 1 * global_data_scale),
        (HuberRegressor(), "HuberRegressor", 1 * global_data_scale),
        (DecisionTreeRegressor(), "DecisionTreeRegressor", 1 * global_data_scale),
        (GaussianProcessRegressor(), "GaussianProcessRegressor", 1 * global_data_scale),
        (MLPRegressor(), "MLPRegressor", 1 * global_data_scale),
        (KNeighborsRegressor(), "KNeighborsRegressor", 1 * global_data_scale),
        (RadiusNeighborsRegressor(), "RadiusNeighborsRegressor", 1 * global_data_scale),
        (SVR(), "SVR", 1 * global_data_scale),
        (NuSVR(), "NuSVR", 1 * global_data_scale),
        (LinearSVR(), "LinearSVR", 1 * global_data_scale),
        # the name must match the params_dict key exactly (it used to be
        # misspelled "KernalRidge", which raised KeyError below)
        (KernelRidge(), "KernelRidge", 1 * global_data_scale),
        (IsotonicRegression(), "IsotonicRegression", 1 * global_data_scale)
    ]

    # set the list of the values that should be used in grid search
    params_dict = {
        "SGDRegressor": {
            "penalty": ["l2", "l1"],
            "alpha": [.001, .0001, .00001],
            "l1_ratio": [.15, .2, .25],
            "fit_intercept": [True, False],
            "max_iter": [1000],
            "shuffle": [True, False],
            "epsilon": [.05, .1, .2],
            "learning_rate": ["constant", "optimal", "invscaling", "adaptive"],
            "eta0": [.005, .01, .02],
            "power_t": [.2, .25, .3]
        },
        "LinearRegression": {
            "fit_intercept": [True, False],
            "normalize": [True, False]
        },
        "Ridge": {
            "alpha": [.8, 1., 1.2],
            "fit_intercept": [True, False],
            "normalize": [True, False],
            "tol": [.01, .001, .0001],
            "solver": ["svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]
        },
        "Lasso": {
            "alpha": [.8, 1., 1.2],
            "fit_intercept": [True, False],
            "normalize": [True, False],
            "positive": [True, False],
            "precompute": [True, False]
        },
        "ElasticNet": {
            "alpha": [.8, 1., 1.2],
            "fit_intercept": [True, False],
            "normalize": [True, False],
            "precompute": [True, False],
            "positive": [True, False],
            "selection": ["cyclic", "random"]
        },
        "Lars": {
            "fit_intercept": [True, False],
            "normalize": [True, False],
            "precompute": [True, False],
            "n_nonzero_coefs": [np.inf]
        },
        "OrthogonalMatchingPursuit": {
            "n_nonzero_coefs": [np.inf, None],
            "precompute": [True, False],
            "fit_intercept": [True, False],
            "normalize": [True, False]
        },
        "BayesianRidge": {
            "tol": [.01, .001, .0001],
            "alpha_1": [1e-5, 1e-6, 1e-7],
            "alpha_2": [1e-5, 1e-6, 1e-7],
            "lambda_1": [1e-5, 1e-6, 1e-7],
            "lambda_2": [1e-5, 1e-6, 1e-7],
            "fit_intercept": [True, False],
            "normalize": [True, False]
        },
        "ARDRegression": {
            "tol": [.01, .001, .0001],
            "alpha_1": [1e-5, 1e-6, 1e-7],
            "alpha_2": [1e-5, 1e-6, 1e-7],
            "lambda_1": [1e-5, 1e-6, 1e-7],
            "lambda_2": [1e-5, 1e-6, 1e-7],
            "threshold_lambda": [1000, 10000, 100000],
            "fit_intercept": [True, False],
            "normalize": [True, False]
        },
        "PassiveAggressiveRegressor": {
            "C": [.8, 1., 1.2],
            "tol": [1e-2, 1e-3, 1e-4],
            "n_iter_no_change": [3, 5, 8],
            "shuffle": [True, False],
            "average": [True, False]
        },
        "RANSACRegressor": {
            "base_estimator": [LinearRegression()]
        },
        "TheilSenRegressor": {
            "max_subpopulation": [1e3, 1e4, 1e5],
            "tol": [1e-2, 1e-3, 1e-4]
        },
        "HuberRegressor": {
            "epsilon": [1.1, 1.35, 1.5],
            "alpha": [1e-3, 1e-4, 1e-5],
            "warm_start": [True, False],
            "fit_intercept": [True, False],
            # this key used to be the empty string, which GridSearchCV
            # rejects as an invalid parameter; the values match `tol`
            "tol": [1e-4, 1e-5, 1e-6]
        },
        "DecisionTreeRegressor": {
            "criterion": ["mse", "friedman_mse", "mae"],
            "splitter": ["best", "random"],
            "min_samples_split": [2, 3],
            "min_samples_leaf": [1, 2],
            "min_weight_fraction_leaf": [.0],
            "max_features": ["auto", "sqrt", "log2"],
            "min_impurity_split": [1e-6, 1e-7, 1e-8]
        },
        "GaussianProcessRegressor": {
            "alpha": [1e-8, 1e-10, 1e-12],
            "optimizer": ["fmin_l_bfgs_b"],
            "normalize_y": [True, False]
        },
        "MLPRegressor": {
            "hidden_layer_sizes": [(100,)],
            "activation": ["identity", "logistic", "tanh", "relu"],
            "solver": ["lbfgs", "sgd", "adam"],
            "alpha": [1e-3, 1e-4, 1e-5],
            # "learning_rate": ["constant", "invscaling", "adaptive"],
            # "learning_rate_init": [1e-2, 1e-3, 1e-4],
            # "power_t": [.3, .5, .8],
            # "shuffle": [True, False],
            # "tol": [1e-3, 1e-4, 1e-5],
            # "momentum": [.8, .9, .99],
            # "beta_1": [.8, .9, .99],
            # "beta_2": [.999],
            # "epsilon": [1e-7, 1e-8, 1e-9],
            # "n_iter_no_change": [10],
            # "max_fun": [15000]
        },
        "KNeighborsRegressor": {
            "n_neighbors": [20, 10, 5, 3],
            "weights": ["uniform", "distance"],
            "algorithm": ["ball_tree", "kd_tree", "brute"],
            "leaf_size": [20, 30, 40],
            "p": [1, 2]
        },
        "RadiusNeighborsRegressor": {
            # no "n_neighbors" here: RadiusNeighborsRegressor has no such
            # parameter and GridSearchCV would refuse the grid
            "radius": [.8, 1, 1.2],
            "weights": ["uniform", "distance"],
            "algorithm": ["ball_tree", "kd_tree", "brute"],
            "leaf_size": [20, 30, 40],
            "p": [1, 2]
        },
        "SVR": {
            "kernel": ["poly", "rbf", "sigmoid"],
            "degree": [2, 3, 5],
            "gamma": ["scale", "auto"],
            "coef0": [.0],
            "tol": [1e-2, 1e-3, 1e-4],
            "C": [.8, .1, 1.2],
            "epsilon": [.08, .1, .12],
            "shrinking": [True, False],
            "max_iter": [-1]
        },
        "NuSVR": {
            "nu": [.2, .5, .8],
            "C": [.8, .1, 1.2],
            "kernel": ["poly", "rbf", "sigmoid"],
            "degree": [2, 3, 5],
            "gamma": ["scale", "auto"],
            "coef0": [.0],
            "shrinking": [True, False],
            "tol": [1e-2, 1e-3, 1e-4],
            "max_iter": [-1]
        },
        "LinearSVR": {
            "epsilon": [.0],
            "tol": [1e-3, 1e-4, 1e-5],
            "C": [.8, .1, 1.2],
            "fit_intercept": [True, False],
            "dual": [True, False],
            "intercept_scaling": [.8, 1., 1.2]
        },
        "KernelRidge": {
            "coef0": [.8, 1, 1.2],
            "degree": [2, 3, 5],
        },
        "IsotonicRegression": {
            "increasing": [True, False],
        }
    }

    # features and target side by side; identical for every model, so it is
    # built once instead of once per loop iteration
    full = pd.DataFrame(X_train).join(pd.DataFrame(y_train))

    for model, name, frac in classifiers:
        # draw the share of rows configured for this model
        loan_data = full.sample(frac=frac, random_state=random_state)
        X = loan_data.drop("loan_status", axis=1)
        y = loan_data["loan_status"]
        grid = GridSearchCV(model, params_dict[name], verbose=verbose,
                            cv=folds, n_jobs=workers)
        grid.fit(X, y)
        yield grid, name
def main():
    """Benchmark a list of regressors on one dataset and write a report.

    Steps, all results going to a freshly created, timestamped folder:

    1. Build the list of ``[regressor, name]`` pairs to compare.
    2. Load one dataset (chosen by the hard-coded ``if True/False`` switches).
    3. For every target variable, run a ``ShuffleSplit`` cross-validation;
       in each fold the data is standardized, every regressor is fitted and
       scored (R^2, explained variance, MSE, MAE), per-fold plots are saved
       and, when available, feature importances are written to CSV.
    4. Write ``00_summary.txt`` with the regressors sorted by mean R^2 and
       save one "measured vs predicted" plot per regressor and variable.

    A regressor that raises during a fold is logged and skipped for that fold.
    """
    # let's create a folder with a unique name to store results
    folderName = datetime.datetime.now().strftime(
        "%Y-%m-%d-%H-%M") + "-regression"
    if not os.path.exists(folderName):
        os.makedirs(folderName)

    # initialize logging
    common.initialize_logging(folderName)

    regressorsList = [
        # human-designed regressors
        [
            HumanRegressor("y = a_0 + a_1 * x + a_2 * x**2 + a_3 * x**3",
                           map_variables_to_features={"x": 0}),
            "HumanRegressor"
        ],
        [PolynomialRegressor(2), "PolynomialRegressor2"],
        #[PolynomialRegressor(3), "PolynomialRegressor3"],
        # keras neural network
        #[ANNRegressor(epochs=500, batch_size=32, layers=[16,4]), "KerasRegressor8-4"],
        #[ANNRegressor(epochs=700, batch_size=32, layers=[16,8]), "KerasRegressor16-8"],
        # cross decomposition
        [PLSRegression(), "PLSRegression"],
        # ensemble
        [AdaBoostRegressor(), "AdaBoostRegressor"],
        [BaggingRegressor(), "BaggingRegressor"],
        [BaggingRegressor(n_estimators=100), "BaggingRegressor_100"],
        [BaggingRegressor(n_estimators=300), "BaggingRegressor_300"],
        [ExtraTreesRegressor(), "ExtraTreesRegressor"],
        [GradientBoostingRegressor(), "GradientBoostingRegressor"],
        [RandomForestRegressor(), "RandomForestRegressor"],
        [RandomForestRegressor(n_estimators=100), "RandomForestRegressor_100"],
        [RandomForestRegressor(n_estimators=300), "RandomForestRegressor_300"],
        # isotonic
        #[IsotonicRegression(), "IsotonicRegression"], # apparently wants "X" as a 1d array
        # kernel ridge
        [KernelRidge(), "KernelRidge"],
        # linear
        #[ARDRegression(), "ARDRegression"], # takes too much time to train
        [BayesianRidge(), "BayesianRidge"],
        [ElasticNetCV(), "ElasticNetCV"],
        [LarsCV(), "LarsCV"],
        [LassoCV(), "LassoCV"],
        [LinearRegression(), "LinearRegression"],
        [PassiveAggressiveRegressor(), "PassiveAggressiveRegressor"],
        # neighbors
        [KNeighborsRegressor(), "KNeighborsRegressor"],
        [RadiusNeighborsRegressor(), "RadiusNeighborsRegressor"],
        # neural networks
        #[BernoulliRBM(), "BernoulliRBM"], # has a different interface, no "predict"
        # svm
        [SVR(), "SVR"],
        [LinearSVR(), "LinearSVR"],
        [NuSVR(), "NuSVR"],
        # tree
        # NOTE(review): the label mentions "max depth 10" but no max_depth is
        # passed; the label is kept because it is used in output file names
        [DecisionTreeRegressor(), "DecisionTreeRegressor (max depth 10)"],
        [ExtraTreeRegressor(), "ExtraTreeRegressor"],
        # generalized additive models
        [LinearGAM(n_splines=20), "LinearGAM(n_splines=20)"],
        # gaussian processes
        [
            GaussianProcessRegressor(kernel=DotProduct() + WhiteKernel()),
            "GaussianProcessRegressor"
        ],
    ]

    X = y = X_train = X_test = y_train = y_test = variablesX = variablesY = None
    numberOfSplits = 10  # TODO change number of splits from command line

    # dataset selection: exactly one of the switches below is meant to be True
    if True:
        # this is just a dumb benchmark
        X, y, variablesX, variablesY = common.loadEasyBenchmark()

    if False:
        X, y, variablesX, variablesY = common.loadChristianQuestionnaireRegression(
        )

    if False:
        X, y, variablesX, variablesY = common.loadYongShiDataCalibration2(
            "TIMBER")

    if False:
        X, y, variablesX, variablesY = common.loadLaurentBouvierNewData()

    if False:
        X, y, variablesX, variablesY = common.loadYongShiDataCalibration()

    if False:
        from sklearn.datasets import load_linnerud
        X, y = load_linnerud(return_X_y=True)

    if False:
        X, y, variablesX, variablesY = common.loadYingYingData()

    if False:
        X, y, variablesX, variablesY = common.loadCleaningDataGermanSpecific()
        #X, y, variablesX, variablesY = common.loadCleaningDataGerman()

    if False:
        X, y, variablesX, variablesY = common.loadInsects()

    if False:
        X, y, variablesX, variablesY = common.loadMilkProcessPipesDimensionalAnalysis(
        )
        #X, y, variablesX, variablesY = common.loadMilkProcessPipes()

    if False:  # ecosystem services
        X, y, variablesX, variablesY = common.loadEcosystemServices()

    if False:
        X, y, variablesX, variablesY = common.loadMarcoSoil()

    if False:
        # load dataset
        X, y = common.loadEureqaRegression()
        # randomly split between training and test
        #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    if False:
        # load dataset
        X_train, X_test, y_train, y_test = common.loadBiscuitExample()
        logging.info("X_train: " + str(X_train.shape))
        logging.info("X_test: " + str(X_test.shape))
        logging.info("y_train: " + str(y_train.shape))
        logging.info("y_test: " + str(y_test.shape))

        # in this particular case, I create the "global" X and y by putting together the two arrays
        X = np.append(X_train, X_test, axis=0)
        y = np.append(y_train, y_test, axis=0)

    if False:
        # load dataset
        X_train, X_test, y_train, y_test = common.loadAromoptiExample()
        logging.info("X_train: " + str(X_train.shape))
        logging.info("X_test: " + str(X_test.shape))
        logging.info("y_train: " + str(y_train.shape))
        logging.info("y_test: " + str(y_test.shape))

        # in this particular case, I create the "global" X and y by putting together the two arrays
        X = np.append(X_train, X_test, axis=0)
        y = np.append(y_train, y_test, axis=0)

    logging.info(
        "Regressing %d output variables, in function of %d input variables..."
        % (y.shape[1], X.shape[1]))

    # if the names of the variables are not specified, let's specify them!
    if variablesY is None:
        variablesY = ["y" + str(i) for i in range(0, len(y[0]))]
    if variablesX is None:
        variablesX = ["X" + str(i) for i in range(0, len(X[0]))]

    performances = dict()
    # measured (de-normalized) test values of all folds, stored per target
    # variable so the final "global" plots pair every variable with its own
    # data instead of the data of whichever variable was processed last
    measuredValues = dict()

    for variableIndex, variableY in enumerate(variablesY):

        logging.info("** Now evaluating models for variable \"%s\"... **" %
                     variableY)

        # obtain data
        y_ = y[:, variableIndex].ravel()

        # assume here that you will have train/test indexes instead
        # it's also easier for the plots, as we do not face the issue
        # of duplicate values (e.g. same value with two indexes)
        rs = ShuffleSplit(n_splits=numberOfSplits, random_state=42)
        #rs = LeaveOneOut()

        # initialize performance dictionary of arrays
        performances[variableY] = dict()
        for regressor, regressorName in regressorsList:
            performances[variableY][regressorName] = dict()
            performances[variableY][regressorName]["r^2"] = []
            performances[variableY][regressorName]["e.v"] = []
            performances[variableY][regressorName]["mse"] = []
            performances[variableY][regressorName]["mae"] = []
            performances[variableY][regressorName]["predicted"] = []

        # this is used to store all values of each fold, in order; maybe there's a smarter way to do it
        foldPointsInOrder = []
        measuredValues[variableY] = foldPointsInOrder

        # and now, for every regressor
        for foldIndex, indexes in enumerate(rs.split(X)):

            train_index, test_index = indexes

            X_train = X[train_index]
            y_train = y_[train_index]
            X_test = X[test_index]
            y_test = y_[test_index]

            # normalize (scalers are fitted on the training part only)
            logging.info("Normalizing data...")
            scalerX = StandardScaler()
            scalerY = StandardScaler()

            X_train = scalerX.fit_transform(X_train)
            X_test = scalerX.transform(X_test)

            y_train = scalerY.fit_transform(y_train.reshape(-1, 1)).ravel(
            )  # this "reshape/ravel" here is just to avoid warnings, it has no true effect on data
            y_test = scalerY.transform(y_test.reshape(-1, 1)).ravel()

            # now, we store points of the folder in order of how they appear
            foldPointsInOrder.extend(list(scalerY.inverse_transform(y_test)))

            for regressorIndex, regressorData in enumerate(regressorsList):

                regressor = regressorData[0]
                regressorName = regressorData[1]

                logging.info("Fold #%d/%d: training regressor #%d/%d \"%s\"" %
                             (foldIndex + 1, numberOfSplits, regressorIndex + 1,
                              len(regressorsList), regressorName))

                try:
                    regressor.fit(X_train, y_train)

                    y_test_predicted = regressor.predict(X_test)
                    r2Test = r2_score(y_test, y_test_predicted)
                    mseTest = mean_squared_error(y_test, y_test_predicted)
                    maeTest = mean_absolute_error(y_test, y_test_predicted)
                    varianceTest = explained_variance_score(
                        y_test, y_test_predicted)

                    logging.info("R^2 score (test): %.4f" % r2Test)
                    logging.info("EV score (test): %.4f" % varianceTest)
                    logging.info("MSE score (test): %.4f" % mseTest)
                    logging.info("MAE score (test): %.4f" % maeTest)

                    # add performance to the list of performances
                    performances[variableY][regressorName]["r^2"].append(
                        r2Test)
                    performances[variableY][regressorName]["e.v"].append(
                        varianceTest)
                    performances[variableY][regressorName]["mse"].append(
                        mseTest)
                    performances[variableY][regressorName]["mae"].append(
                        maeTest)
                    # also record the predictions, to be used later in a global figure
                    performances[variableY][regressorName]["predicted"].extend(
                        list(scalerY.inverse_transform(y_test_predicted)))

                    try:
                        import matplotlib.pyplot as plt

                        # plotting first figure, with points 'x' and 'o'
                        y_predicted = regressor.predict(scalerX.transform(
                            X))  # 'X' was never wholly rescaled before
                        y_train_predicted = regressor.predict(X_train)

                        plt.figure()

                        plt.scatter(train_index,
                                    y_train,
                                    c="gray",
                                    label="training data")
                        plt.scatter(test_index,
                                    y_test,
                                    c="green",
                                    label="test data")

                        plt.plot(np.arange(len(y_predicted)),
                                 y_predicted,
                                 'x',
                                 c="red",
                                 label="regression")
                        plt.xlabel("order of data samples")
                        plt.ylabel("target")
                        plt.title(regressorName +
                                  ", R^2=%.4f (test)" % r2Test)
                        plt.legend()

                        logging.info("Saving figure...")
                        plt.savefig(
                            os.path.join(
                                folderName, regressorName + "-" + variableY +
                                "-fold-" + str(foldIndex + 1) + ".pdf"))
                        plt.close()

                        # plotting second figure, with everything close to a middle line
                        plt.figure()

                        plt.plot(y_train,
                                 y_train_predicted,
                                 'r.',
                                 label="training set")  # points
                        plt.plot(y_test,
                                 y_test_predicted,
                                 'go',
                                 label="test set")  # points
                        plt.plot([
                            min(y_train.min(), y_test.min()),
                            max(y_train.max(), y_test.max())
                        ], [
                            min(y_train_predicted.min(),
                                y_test_predicted.min()),
                            max(y_train_predicted.max(),
                                y_test_predicted.max())
                        ], 'k--')  # line

                        plt.xlabel("measured")
                        plt.ylabel("predicted")
                        plt.title(regressorName + " measured vs predicted, " +
                                  variableY)
                        plt.legend(loc='best')

                        plt.savefig(
                            os.path.join(
                                folderName, regressorName + "-" + variableY +
                                "-fold-" + str(foldIndex + 1) + "-b.pdf"))
                        plt.close()

                        # also, save ordered list of features
                        featuresByImportance = relativeFeatureImportance(
                            regressor)

                        # if list exists, write feature importance to disk
                        # TODO horrible hack here, to avoid issues with GAM
                        if len(featuresByImportance
                               ) > 0 and "GAM" not in regressorName:
                            # NOTE(review): this file name uses the 0-based
                            # fold index, the plots above use foldIndex + 1
                            featureImportanceFileName = regressorName + "-" + variableY + "-featureImportance-fold" + str(
                                foldIndex) + ".csv"
                            with open(
                                    os.path.join(folderName,
                                                 featureImportanceFileName),
                                    "w") as fp:
                                fp.write("feature,importance\n")
                                for featureImportance, featureIndex in featuresByImportance:
                                    fp.write(variablesX[int(featureIndex)] +
                                             "," + str(featureImportance) +
                                             "\n")

                    except ImportError:
                        logging.info(
                            "Cannot import matplotlib. Skipping plots...")

                except Exception as e:
                    # best effort: one failing regressor must not stop the run
                    logging.info("Regressor \"" + regressorName +
                                 "\" failed on variable \"" + variableY +
                                 "\":" + str(e))

    # the per-fold import above may never have succeeded (or even run), so
    # the summary resolves matplotlib on its own and skips plots without it
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        plt = None
        logging.info("Cannot import matplotlib. Skipping plots...")

    logging.info("Final summary:")
    with open(os.path.join(folderName, "00_summary.txt"), "w") as fp:

        for variableY in variablesY:

            logging.info("For variable \"" + variableY + "\"")
            fp.write("For variable: " + variableY + " = f(" + variablesX[0])
            for i in range(1, len(variablesX)):
                fp.write("," + variablesX[i])
            fp.write(")\n")

            # measured test values collected for THIS variable
            foldPointsInOrder = measuredValues[variableY]

            # create a list from the dictionary and sort it
            sortedPerformances = sorted(
                [(performances[variableY][regressorName], regressorName)
                 for regressorName in performances[variableY]],
                key=lambda x: np.mean(x[0]["r^2"]),
                reverse=True)

            for regressorData in sortedPerformances:
                regressorName = regressorData[1]
                regressorScore = regressorData[0]

                r2Mean = np.mean(regressorScore["r^2"])
                r2std = np.std(regressorScore["r^2"])

                varianceMean = np.mean(regressorScore["e.v"])
                varianceStd = np.std(regressorScore["e.v"])

                mseMean = np.mean(regressorScore["mse"])
                mseStd = np.std(regressorScore["mse"])

                maeMean = np.mean(regressorScore["mae"])
                maeStd = np.std(regressorScore["mae"])

                logging.info(
                    "\t- %s, R^2=%.4f (std=%.4f), Explained Variance=%.4f (std=%.4f), MSE=%.4f (std=%.4f), MAE=%.4f (std=%.4f)"
                    % (regressorName, r2Mean, r2std, varianceMean,
                       varianceStd, mseMean, mseStd, maeMean, maeStd))

                fp.write(
                    "\t- %s, R^2=%.4f (std=%.4f), Explained Variance=%.4f (std=%.4f), MSE=%.4f (std=%.4f), MAE=%.4f (std=%.4f)\n"
                    % (regressorName, r2Mean, r2std, varianceMean,
                       varianceStd, mseMean, mseStd, maeMean, maeStd))

                fp.write("\t\t- R^2:" +
                         str(["%.4f" % x
                              for x in regressorScore["r^2"]]) + "\n")
                fp.write("\t\t- E.V.:" +
                         str(["%.4f" % x
                              for x in regressorScore["e.v"]]) + "\n")
                fp.write("\t\t- MSE:" +
                         str(["%.4f" % x
                              for x in regressorScore["mse"]]) + "\n")
                fp.write("\t\t- MAE:" +
                         str(["%.4f" % x
                              for x in regressorScore["mae"]]) + "\n")

                # also, plot a "global" graph
                # issue here, if a regressor fails, you have incongruent matrixes: a check is in order
                # TODO also, the plot looks really bad if some values are negative; turn everything to absolute values?
                if plt is not None and len(foldPointsInOrder) == len(
                        regressorScore["predicted"]):
                    fig = plt.figure()
                    ax = fig.add_subplot(111)

                    #bottom_left_corner = [min(foldPointsInOrder), max(foldPointsInOrder)]
                    #top_right_corner = [min(regressorScore["predicted"]), max(regressorScore["predicted"])]
                    x_bottom_top = [0, max(foldPointsInOrder)]
                    y_bottom_top = [0, max(foldPointsInOrder)]

                    ax.plot(foldPointsInOrder, regressorScore["predicted"],
                            'g.')  # points
                    ax.plot(x_bottom_top, y_bottom_top, 'k--',
                            label="1:1")  # line
                    ax.plot(x_bottom_top,
                            [y_bottom_top[0] * 1.20, y_bottom_top[1] * 1.20],
                            'r--',
                            label="20% error")
                    ax.plot(x_bottom_top,
                            [y_bottom_top[0] * 0.80, y_bottom_top[1] * 0.80],
                            'r--')

                    ax.set_title(regressorName + " measured vs predicted, " +
                                 variableY + " (all test)")
                    ax.set_xlabel("measured")
                    ax.set_ylabel("predicted")
                    ax.legend(loc='best')

                    plt.savefig(
                        os.path.join(
                            folderName, regressorName + "-" + variableY +
                            "-global-b.png"))
                    plt.close(fig)
# Target columns, one regression problem each.
parameters = ['teff', 'logg', 'feh']

# Every regressor next to the label it is reported under; the two parallel
# lists `names` and `classifiers` are derived from these pairs.
_named_regressors = [
    ('KNeighbours', KNeighborsRegressor()),
    ('Radius Neighbors', RadiusNeighborsRegressor()),
    ('Random Forest', RandomForestRegressor()),
    ('Linear Regression', LinearRegression()),
    ('Gaussian Process', GaussianProcessRegressor()),
    ('Ada Boost', AdaBoostRegressor()),
    ('Huber', HuberRegressor()),
    ('RANSAC', RANSACRegressor()),
    ('Theil-Sen', TheilSenRegressor()),
]
names, classifiers = (list(column) for column in zip(*_named_regressors))

for parameter in parameters:
    print(parameter)
    # target values of the train and test tables as plain lists
    y_train, y_test = (table[parameter].tolist() for table in (train, test))
    # smallest and largest test value
    ends = [sp.amin(y_test), sp.amax(y_test)]
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor

# Hold out part of the data for evaluation (default split proportions).
x_train, x_test, y_train, y_test = train_test_split(x, y)

# k-nearest-neighbours regression: plain average of the 6 closest samples.
KNN_reg = KNeighborsRegressor(n_neighbors=6, weights='uniform').fit(
    x_train, y_train)
y_predict_knn = KNN_reg.predict(x_test)
y_predict_knn[:10]  # peek at the first predictions (notebook-style display)

# Radius-based regression with the radius set from the training data's std.
RNN_reg = RadiusNeighborsRegressor(radius=x_train.std()).fit(x_train, y_train)
y_predict_rnn = RNN_reg.predict(x_test)
y_predict_rnn[:10]  # peek at the first predictions (notebook-style display)

# Same model with the default radius, for comparison; the result is only
# displayed, not stored.
RNN_reg = RadiusNeighborsRegressor().fit(x_train, y_train)
RNN_reg.predict(x_test)
def GetAllModelsForComparison(X_train, Y_train):
    """Return a dict mapping class names to default-constructed instances.

    Every entry is built by calling the class with no arguments. Keys that
    the original literal listed several times are listed once here; the
    resulting keys and their order are unchanged, only the throw-away
    duplicate instances are gone.

    NOTE(review): several entries (``BaseEstimator``, the ``*Mixin`` classes,
    ``Parallel``, ``StandardScaler``, ``LabelBinarizer`` and the loss classes)
    do not look like fit-able models; confirm that callers filter them.

    Args:
        X_train: unused; kept so existing callers keep working.
        Y_train: unused; kept so existing callers keep working.

    Returns:
        dict of ``{class name: instance}``.
    """
    models = {
        # linear models
        'ARDRegression': ARDRegression(),
        'BayesianRidge': BayesianRidge(),
        'ElasticNet': ElasticNet(),
        'ElasticNetCV': ElasticNetCV(),
        'Hinge': Hinge(),
        #'Huber': Huber(),
        'HuberRegressor': HuberRegressor(),
        'Lars': Lars(),
        'LarsCV': LarsCV(),
        'Lasso': Lasso(),
        'LassoCV': LassoCV(),
        'LassoLars': LassoLars(),
        'LassoLarsCV': LassoLarsCV(),
        'LinearRegression': LinearRegression(),
        'Log': Log(),
        'LogisticRegression': LogisticRegression(),
        'LogisticRegressionCV': LogisticRegressionCV(),
        'ModifiedHuber': ModifiedHuber(),
        'MultiTaskElasticNet': MultiTaskElasticNet(),
        'MultiTaskElasticNetCV': MultiTaskElasticNetCV(),
        'MultiTaskLasso': MultiTaskLasso(),
        'MultiTaskLassoCV': MultiTaskLassoCV(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'OrthogonalMatchingPursuitCV': OrthogonalMatchingPursuitCV(),
        'PassiveAggressiveClassifier': PassiveAggressiveClassifier(),
        'PassiveAggressiveRegressor': PassiveAggressiveRegressor(),
        'Perceptron': Perceptron(),
        'RANSACRegressor': RANSACRegressor(),
        #'RandomizedLasso': RandomizedLasso(),
        #'RandomizedLogisticRegression': RandomizedLogisticRegression(),
        'Ridge': Ridge(),
        'RidgeCV': RidgeCV(),
        'RidgeClassifier': RidgeClassifier(),
        'SGDClassifier': SGDClassifier(),
        'SGDRegressor': SGDRegressor(),
        'SquaredLoss': SquaredLoss(),
        'TheilSenRegressor': TheilSenRegressor(),
        # base classes, discriminant analysis, preprocessing
        'BaseEstimator': BaseEstimator(),
        'ClassifierMixin': ClassifierMixin(),
        'LinearClassifierMixin': LinearClassifierMixin(),
        'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
        'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
        'StandardScaler': StandardScaler(),
        'TransformerMixin': TransformerMixin(),
        'KernelRidge': KernelRidge(),
        'RegressorMixin': RegressorMixin(),
        # support vector machines
        'LinearSVC': LinearSVC(),
        'LinearSVR': LinearSVR(),
        'NuSVC': NuSVC(),
        'NuSVR': NuSVR(),
        'OneClassSVM': OneClassSVM(),
        'SVC': SVC(),
        'SVR': SVR(),
        # neighbors
        #'BallTree': BallTree(),
        #'DistanceMetric': DistanceMetric(),
        #'KDTree': KDTree(),
        'KNeighborsClassifier': KNeighborsClassifier(),
        'KNeighborsRegressor': KNeighborsRegressor(),
        'KernelDensity': KernelDensity(),
        #'LSHForest': LSHForest(),
        'LocalOutlierFactor': LocalOutlierFactor(),
        'NearestCentroid': NearestCentroid(),
        'NearestNeighbors': NearestNeighbors(),
        'RadiusNeighborsClassifier': RadiusNeighborsClassifier(),
        'RadiusNeighborsRegressor': RadiusNeighborsRegressor(),
        # gaussian processes
        #'GaussianProcess': GaussianProcess(),
        'GaussianProcessRegressor': GaussianProcessRegressor(),
        'GaussianProcessClassifier': GaussianProcessClassifier(),
        # cross decomposition
        'CCA': CCA(),
        'PLSCanonical': PLSCanonical(),
        'PLSRegression': PLSRegression(),
        'PLSSVD': PLSSVD(),
        # naive bayes
        #'ABCMeta': ABCMeta(),
        #'BaseDiscreteNB': BaseDiscreteNB(),
        #'BaseNB': BaseNB(),
        'BernoulliNB': BernoulliNB(),
        'GaussianNB': GaussianNB(),
        'LabelBinarizer': LabelBinarizer(),
        'MultinomialNB': MultinomialNB(),
        # trees
        'DecisionTreeClassifier': DecisionTreeClassifier(),
        'DecisionTreeRegressor': DecisionTreeRegressor(),
        'ExtraTreeClassifier': ExtraTreeClassifier(),
        # ensembles
        'AdaBoostClassifier': AdaBoostClassifier(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'BaggingClassifier': BaggingClassifier(),
        'BaggingRegressor': BaggingRegressor(),
        #'BaseEnsemble': BaseEnsemble(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'IsolationForest': IsolationForest(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RandomForestRegressor': RandomForestRegressor(),
        'RandomTreesEmbedding': RandomTreesEmbedding(),
        #'VotingClassifier': VotingClassifier(),
        # multiclass / multioutput helpers
        'MetaEstimatorMixin': MetaEstimatorMixin(),
        #'OneVsOneClassifier': OneVsOneClassifier(),
        #'OneVsRestClassifier': OneVsRestClassifier(),
        #'OutputCodeClassifier': OutputCodeClassifier(),
        'Parallel': Parallel(),
        #'ClassifierChain': ClassifierChain(),
        #'MultiOutputClassifier': MultiOutputClassifier(),
        #'MultiOutputEstimator': MultiOutputEstimator(),
        #'MultiOutputRegressor': MultiOutputRegressor(),
        # semi-supervised
        'LabelPropagation': LabelPropagation(),
        'LabelSpreading': LabelSpreading(),
        # isotonic
        'IsotonicRegression': IsotonicRegression(),
        # neural networks
        'BernoulliRBM': BernoulliRBM(),
        'MLPClassifier': MLPClassifier(),
        'MLPRegressor': MLPRegressor()
    }
    return models
from sklearn.naive_bayes import GaussianNB, MultinomialNB

# Regression with scikit-learn (random forest and other models).
# One default-ish instance per candidate model.
lr = LinearRegression()
dtr = DecisionTreeRegressor()
rfr = RandomForestRegressor()
rte = RandomTreesEmbedding()
mr = MLPRegressor(max_iter=1000)
omp = OrthogonalMatchingPursuit()
rr = RANSACRegressor()
tsr = TheilSenRegressor()
br = BayesianRidge(n_iter=300, tol=0.001)
bgm = BayesianGaussianMixture()
knr = KNeighborsRegressor(n_neighbors=5)
rnr = RadiusNeighborsRegressor(radius=1.0)
pr = PLSRegression()
gnb = GaussianNB()
mnb = MultinomialNB()

# Earlier selections, kept for reference:
# estimators = {'LR ':lr,'DTR':dtr,'RFR':rfr,'MR ':mr}
# estimators = {'LR ':lr,'DTR':dtr,'RFR':rfr,'OMP':omp,'RR ':rr, 'BR ':br,'BGM':bgm ,'KNR':knr,'RNR':rnr,'PR ':pr}

# Models actually compared, keyed by a fixed-width three-character label
# (shorter labels are padded with a trailing space).
estimators = dict(zip(
    ('LR ', 'DTR', 'RFR', 'OMP', 'RR ', 'BR ', 'KNR', 'PR '),
    (lr, dtr, rfr, omp, rr, br, knr, pr),
))