def test_scaler_without_centering():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero
    X_csr = sparse.csr_matrix(X)
    X_csc = sparse.csc_matrix(X)

    assert_raises(ValueError, StandardScaler().fit, X_csr)

    null_transform = StandardScaler(with_mean=False, with_std=False, copy=True)
    X_null = null_transform.fit_transform(X_csr)
    assert_array_equal(X_null.data, X_csr.data)
    X_orig = null_transform.inverse_transform(X_null)
    assert_array_equal(X_orig.data, X_csr.data)

    scaler = StandardScaler(with_mean=False).fit(X)
    X_scaled = scaler.transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))

    scaler_csr = StandardScaler(with_mean=False).fit(X_csr)
    X_csr_scaled = scaler_csr.transform(X_csr, copy=True)
    assert_false(np.any(np.isnan(X_csr_scaled.data)))

    scaler_csc = StandardScaler(with_mean=False).fit(X_csc)
    X_csc_scaled = scaler_csc.transform(X_csc, copy=True)
    assert_false(np.any(np.isnan(X_csc_scaled.data)))

    assert_equal(scaler.mean_, scaler_csr.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csr.std_)

    assert_equal(scaler.mean_, scaler_csc.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csc.std_)

    assert_array_almost_equal(
        X_scaled.mean(axis=0), [0., -0.01, 2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])

    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0(X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))

    # Check that X has not been modified (copy)
    assert_true(X_scaled is not X)
    assert_true(X_csr_scaled is not X_csr)

    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled)
    assert_true(X_csr_scaled_back is not X_csr)
    assert_true(X_csr_scaled_back is not X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_back.toarray(), X)

    X_csc_scaled_back = scaler_csc.inverse_transform(X_csc_scaled.tocsc())
    assert_true(X_csc_scaled_back is not X_csc)
    assert_true(X_csc_scaled_back is not X_csc_scaled)
    assert_array_almost_equal(X_csc_scaled_back.toarray(), X)
class PoissonRegression(Regressor):
    """
    Calculate the solution using the Newton-Raphson formula (second-order
    optimization). This method has the advantage that its weight update rule
    needs no learning rate alpha, and it converges quickly.
    """

    def __init__(self, features=range(231)):
        Regressor.__init__(self)
        self.features = features
        self.weights = np.ones(len(features))
        self.xscaler = StandardScaler()
        self.yscaler = StandardScaler()

    def learn(self, Xtrain, ytrain):
        Xless = Xtrain[:, self.features]
        self.xscaler.fit(Xless)
        Xless = self.xscaler.transform(Xless)
        self.yscaler.fit(ytrain)
        ytrain = self.yscaler.transform(ytrain)
        itertimes = 20
        for i in range(itertimes):
            c = np.exp(np.dot(Xless, self.weights))
            gradient = np.dot(Xless.T, (ytrain - c))
            neg_hessian = np.dot(Xless.T, np.dot(np.diag(c), Xless))
            self.weights = self.weights + np.dot(np.linalg.inv(neg_hessian), gradient)

    def predict(self, Xtest):
        Xless = Xtest[:, self.features]
        Xless = self.xscaler.transform(Xless)
        ytest = np.exp(np.dot(Xless, self.weights))
        ytest = self.yscaler.inverse_transform(ytest)
        return ytest
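# A minimal usage sketch for the PoissonRegression class above, on synthetic
# data. Everything here is illustrative: the Regressor base class comes from
# the snippet's own codebase, and scaling a 1-D target assumes the older
# scikit-learn API that the class itself relies on.
rng = np.random.RandomState(0)
Xtrain = rng.rand(100, 10)
ytrain = rng.poisson(lam=3.0, size=100).astype(float)
model = PoissonRegression(features=range(10))
model.learn(Xtrain, ytrain)
yhat = model.predict(rng.rand(5, 10))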
def knn_max_density(self, X, n_neighbors, step):
    ss = StandardScaler()
    ss.fit(X)
    X_standard = ss.transform(X)

    passed_points_indices = range(len(X_standard))
    X_passed_standard = X_standard

    while len(X_passed_standard) > n_neighbors:
        knn = NearestNeighbors(n_neighbors=n_neighbors, leaf_size=100)
        knn.fit(X_passed_standard)
        knn_dists, knn_indices = knn.kneighbors()

        knn_dists_mean = knn_dists.mean(axis=1)

        # drop the n_points points with the largest mean kNN distance,
        # i.e. the points lying in the least dense regions
        n_points = max(1, int(step * len(X_passed_standard)))
        passed_points_indices = knn_dists_mean.argsort()[:-n_points]

        X_passed_standard = X_passed_standard[passed_points_indices]

    X_passed = ss.inverse_transform(X_passed_standard)
    return X_passed
def background_model(x_train, method='mean', n_components=10):
    """
    Use data from x_train to create a model/image of the background.
    :param x_train: a matrix with 1 row per image frame, each column represents
        a pixel; PCA is trained on this data
    :param method: how to aggregate pixel values in the principal component
        space, either 'mean' or 'median'
    :param n_components: number of components kept by the truncated SVD
    :return: a vector that represents the background image
    """
    # clean the data before pca and clustering (subtract mean, divide by st. dev.)
    scaler = StandardScaler().fit(x_train)
    x_train = scaler.transform(x_train)

    # use SVD instead of PCA, so that we don't need to compute the covariance
    eig = TruncatedSVD(n_components=n_components).fit(x_train)
    print(sum(eig.explained_variance_ratio_))
    train = eig.transform(x_train)

    # define background as an aggregation of each pixel value in the principal
    # component space; can't see much of a difference between mean and median
    if method == 'median':
        back_pca = np.median(train, axis=0)
    elif method == 'mean':
        back_pca = np.mean(train, axis=0)
    else:
        print("method must either be 'median' or 'mean'")
        return 1

    # transform to full sized matrix
    back_vec = eig.inverse_transform(back_pca)
    # add mean and variance back in
    back_vec = scaler.inverse_transform(back_vec)
    return back_vec
def test_scaler_1d(): """Test scaling of dataset along single axis""" rng = np.random.RandomState(0) X = rng.randn(5) X_orig_copy = X.copy() scaler = StandardScaler() X_scaled = scaler.fit(X).transform(X, copy=False) assert_array_almost_equal(X_scaled.mean(axis=0), 0.0) assert_array_almost_equal(X_scaled.std(axis=0), 1.0) # check inverse transform X_scaled_back = scaler.inverse_transform(X_scaled) assert_array_almost_equal(X_scaled_back, X_orig_copy) # Test with 1D list X = [0., 1., 2, 0.4, 1.] scaler = StandardScaler() X_scaled = scaler.fit(X).transform(X, copy=False) assert_array_almost_equal(X_scaled.mean(axis=0), 0.0) assert_array_almost_equal(X_scaled.std(axis=0), 1.0) X_scaled = scale(X) assert_array_almost_equal(X_scaled.mean(axis=0), 0.0) assert_array_almost_equal(X_scaled.std(axis=0), 1.0)
def get_track_params(self, X): ss = StandardScaler() ss.fit(X) transformed_tracks = ss.transform(X).mean(axis=0) tracks = ss.inverse_transform(transformed_tracks) return tracks, X.std(axis=0)
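# Illustrative check (not from the original source): standardized columns have
# zero mean, so the transform/mean/inverse_transform chain in get_track_params
# effectively returns the per-column means of X.
import numpy as np
from sklearn.preprocessing import StandardScaler

X_demo = np.random.RandomState(0).rand(50, 3)
ss_demo = StandardScaler().fit(X_demo)
back = ss_demo.inverse_transform(
    ss_demo.transform(X_demo).mean(axis=0).reshape(1, -1))[0]
assert np.allclose(back, X_demo.mean(axis=0))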
class GmmInterest(InterestModel): def __init__(self, conf, expl_dims, measure, n_samples=40, n_components=6, update_frequency=10): InterestModel.__init__(self, expl_dims) self.measure = measure self.bounds = conf.bounds[:, expl_dims] self.n_components = n_components self.scale_t = 1 # 1. / n_samples self.t = -self.scale_t * n_samples self.scale_x = conf.bounds[1, expl_dims] - conf.bounds[0, expl_dims] self.scale_measure = abs(measure(numpy.zeros_like(conf.bounds[0, :]), numpy.zeros_like(conf.bounds[0]))) self.data = numpy.zeros((n_samples, len(expl_dims) + 2)) self.n_samples = n_samples self.scaler = StandardScaler() self.update_frequency = update_frequency for _ in range(n_samples): self.update(rand_bounds(conf.bounds), rand_bounds(conf.bounds)) def sample(self): x = self.gmm_choice.sample() x = self.scaler.inverse_transform(numpy.hstack(([0.0], x.flatten(), [0.0])))[1:-1] x = numpy.maximum(x, self.bounds[0, :]) x = numpy.minimum(x, self.bounds[1, :]) return x.T def update(self, xy, ms): measure = self.measure(xy, ms) self.data[self.t % self.n_samples, 0] = self.t self.data[self.t % self.n_samples, -1] = measure self.data[self.t % self.n_samples, 1:-1] = xy.flatten()[self.expl_dims] self.t += self.scale_t if self.t >= 0: if self.t % self.update_frequency == 0: self.update_gmm() return self.t, xy.flatten()[self.expl_dims], measure def update_gmm(self): scaled_data = self.scaler.fit_transform(self.data) self.gmm = GMM(n_components=self.n_components, covariance_type="full") self.gmm.fit(numpy.array(scaled_data)) self.gmm_choice = self.gmm_interest() def gmm_interest(self): cov_t_c = numpy.array([self.gmm.covars_[k, 0, -1] for k in range(self.gmm.n_components)]) cov_t_c = numpy.exp(cov_t_c) # cov_t_c[cov_t_c <= 1e-100] = 1e-100 gmm_choice = self.gmm.inference([0], range(1, len(self.expl_dims) + 1), [1.0]) gmm_choice.weights_ = cov_t_c gmm_choice.weights_ /= numpy.array(gmm_choice.weights_).sum() return gmm_choice
def main():
    df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data',
                     header=None, sep='\s+')
    df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS',
                  'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
    print(df.head())

    # Select a subset of the features and plot the correlations between features
    cols = ['LSTAT', 'INDUS', 'NOX', 'RM', 'MEDV']
    sns.pairplot(df[cols], size=2.5)
    plt.title('Correlations between 5 features')
    plt.show()

    # Plot a heatmap of the same subset of features
    cm = np.corrcoef(df[cols].values.T)
    sns.set(font_scale=2.5)
    hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f',
                     annot_kws={'size': 15}, yticklabels=cols, xticklabels=cols)
    plt.show()

    X = df[['RM']].values
    y = df['MEDV'].values
    sc_x = StandardScaler()
    sc_y = StandardScaler()
    X_std = sc_x.fit_transform(X)
    # StandardScaler expects a 2-D array, so reshape y before scaling
    y_std = sc_y.fit_transform(y[:, np.newaxis]).flatten()
    lr = LinearRegressionGD()
    lr.fit(X_std, y_std)

    plt.plot(range(1, lr.n_iter + 1), lr.cost_)
    plt.ylabel('SSE')
    plt.xlabel('Epoch')
    plt.show()

    lin_regplot(X_std, y_std, lr)
    plt.xlabel('Average number of rooms [RM] (standardized)')
    plt.ylabel('Price in $1000\'s [MEDV] (standardized)')
    plt.show()

    # Example prediction for a house with 5 rooms (transform expects a 2-D array)
    num_rooms_std = sc_x.transform([[5.0]])
    price_std = lr.predict(num_rooms_std)
    print("Price in $1000's: %.3f" %
          sc_y.inverse_transform(price_std.reshape(-1, 1)).flatten()[0])
def clusterThose(G, eps=0.1, min_samples=4):
    '''Scale the data and cluster.'''
    scaler = StandardScaler(copy=True)
    X_centered = scaler.fit(G).transform(G)
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(X_centered)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    # number of clusters, ignoring noise (label -1) if present
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    # undo the scaling so the returned points are in the original units
    X = scaler.inverse_transform(X_centered)
    return X, n_clusters_, labels, core_samples_mask
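# A minimal usage sketch for clusterThose (synthetic blobs, not from the
# original source):
from sklearn.datasets import make_blobs
G_demo, _ = make_blobs(n_samples=200, centers=3, random_state=0)
X_back, n_clusters, labels, core_mask = clusterThose(G_demo, eps=0.3,
                                                     min_samples=5)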
def kmeans_fitting(rows, train):
    x = get_feature_vector(rows, train)
    scaler = StandardScaler()
    scaler.fit(x)
    x = scaler.transform(x)
    model = cluster.MiniBatchKMeans(n_clusters=6)
    model.fit(x)
    centers = model.cluster_centers_
    print(centers)
    # map the centers back to the original feature units
    centers = scaler.inverse_transform(centers)
    print(centers)
    return model, scaler
def DAEGO(X_s, H, P, batch_range):
    """
    Parameters
    ----------
    X_s : small class features
    H : layers (first layer should have the same number of neurons as features)
    P : percent oversampling
    batch_range : size of minibatch

    Returns
    -------
    syn_Z : synthetic samples with the same number of features as the smaller class
    """
    # normalization
    scaler = StdScaler()
    x_tr = scaler.fit_transform(X_s.astype(float))
    x_norm = norm(x_tr, axis=0)
    n_samples = int(X_s.shape[0] * P / 100)
    print("generating %d samples" % n_samples)
    norm_param = [LA.norm(x) for x in x_tr.T]
    X_init = np.random.standard_normal(size=(n_samples, X_s.shape[1]))
    x_init_tr = scaler.transform(X_init)
    x_ini_norm = norm(x_init_tr, axis=0)  # same axis as for x_norm above
    ae = autoencoder(dimensions=H)
    learning_rate = 0.001
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(ae['cost'])
    sess = tf.Session()
    sess.run(tf.initialize_all_variables())
    n_epoch = 100
    for epoch_i in range(n_epoch):
        for start, end in zip(range(0, len(x_norm), batch_range),
                              range(batch_range, len(x_norm), batch_range)):
            input_ = x_norm[start:end]
            sess.run(optimizer, feed_dict={ae['x']: input_,
                                           ae['corrupt_prob']: [1.0]})
        # report the cost on the normalised training data
        s = "\r Epoch: %d Cost: %f" % (epoch_i,
                                       sess.run(ae['cost'],
                                                feed_dict={ae['x']: x_norm,
                                                           ae['corrupt_prob']: [1.0]}))
        stderr.write(s)
        stderr.flush()
    x_init_encoded = sess.run(ae['y'], feed_dict={ae['x']: x_ini_norm,
                                                  ae['corrupt_prob']: [0.0]})
    sess.close()
    x_init_norminv = np.multiply(x_init_encoded, norm_param)
    syn_Z = scaler.inverse_transform(x_init_norminv)
    return syn_Z
def test_scaler_without_centering():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero
    X_csr = sp.csr_matrix(X)

    scaler = StandardScaler(with_mean=False).fit(X)
    X_scaled = scaler.transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))

    scaler_csr = StandardScaler(with_mean=False).fit(X_csr)
    X_csr_scaled = scaler_csr.transform(X_csr, copy=True)
    assert_false(np.any(np.isnan(X_csr_scaled.data)))

    assert_equal(scaler.mean_, scaler_csr.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csr.std_)

    assert_array_almost_equal(
        X_scaled.mean(axis=0), [0., -0.01, 2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])

    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0(X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))

    # Check that X has not been modified (copy)
    assert_true(X_scaled is not X)
    assert_true(X_csr_scaled is not X_csr)

    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled)
    assert_true(X_csr_scaled_back is not X_csr)
    assert_true(X_csr_scaled_back is not X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_back.toarray(), X)
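# Context for the test above (illustrative, standard scikit-learn behaviour):
# centering a sparse matrix would densify it, so sparse inputs must use
# with_mean=False; only the per-feature standard deviation is applied, which
# leaves the sparsity pattern untouched.
import numpy as np
import scipy.sparse as sp
from sklearn.preprocessing import StandardScaler

X_demo = sp.csr_matrix(np.random.RandomState(0).rand(4, 5))
sc = StandardScaler(with_mean=False).fit(X_demo)
X_demo_scaled = sc.transform(X_demo)  # still sparse
assert X_demo_scaled.nnz == X_demo.nnz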
class InputScaler(): def __init__(self): self.scaler = StandardScaler() def fit_transform(self, data): flat = numpy.vstack(data) self.scaler.fit(flat) return [ self.scaler.transform(X) for X in data ] def transform(self, data): return [ self.scaler.transform(X) for X in data ] def inverse_transform(self, data): return [ self.scaler.inverse_transform(X) for X in data ]
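# A minimal usage sketch for InputScaler (synthetic data, not from the
# original source): it fits one scaler on the stacked rows of all sequences,
# then scales each (possibly different-length) sequence with those shared
# statistics.
import numpy
seqs = [numpy.random.rand(5, 3), numpy.random.rand(8, 3)]
in_scaler = InputScaler()
scaled = in_scaler.fit_transform(seqs)          # list of scaled arrays
restored = in_scaler.inverse_transform(scaled)  # round-trips to the originals
assert all(numpy.allclose(a, b) for a, b in zip(seqs, restored))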
def knn_max_density(self, X, n_neighbors, step):
    ss = StandardScaler()
    ss.fit(X)
    X_standard = ss.transform(X)

    passed_points_indices = range(len(X_standard))
    X_passed_standard = X_standard

    n_neighbors = min(n_neighbors, len(X_passed_standard) - 1)
    knn = NearestNeighbors(n_neighbors=n_neighbors, leaf_size=100)
    knn.fit(X_passed_standard)
    knn_dists, knn_indices = knn.kneighbors()

    knn_dists_mean = knn_dists.mean(axis=1)

    # keep the densest point and its neighbours
    max_dense_point = knn_dists_mean.argsort()[0]
    passed_points_indices = list(knn_indices[max_dense_point]) + [max_dense_point]

    X_passed_standard = X_passed_standard[passed_points_indices]
    X_passed = ss.inverse_transform(X_passed_standard)
    return X_passed
def submit(args):
    """Run train-test experiment."""
    data = load_data(args['--data'])
    X_train = data['X_train']
    y_train = data['y_train']
    X_test = data['X_test']

    est = GradientBoostingRegressor(n_estimators=2000, verbose=1, max_depth=6,
                                    min_samples_leaf=9, learning_rate=0.02,
                                    max_features=33, random_state=1,
                                    subsample=1.0, loss='lad')
    model_cls = MODELS[args['<model>']]
    model = model_cls(est=est, with_stationinfo=True, with_date=True,
                      with_solar=True, with_mask=True,
                      intp_blocks=('nm_intp', 'nmft_intp', 'nm_intp_sigma'))
    print('_' * 80)
    print('Submit')
    print()
    print(model)
    print()
    print()
    scaler = StandardScaler()
    if args['--scaley']:
        y_train = scaler.fit_transform(y_train.copy())

    t0 = time()
    model.fit(X_train, y_train)
    print('model.fit took %.1fm' % ((time() - t0) / 60.))
    pred = model.predict(X_test)
    if args['--scaley']:
        pred = scaler.inverse_transform(pred)

    data = load_data(args['--data'])
    date_idx = data['X_test'].date
    date_idx = date_idx.map(lambda x: x.strftime('%Y%m%d'))
    stid = pd.read_csv('data/station_info.csv')['stid']
    out = pd.DataFrame(index=date_idx, columns=stid, data=pred)
    out.index.name = 'Date'
    out.to_csv('hk_19.csv')
    IPython.embed()
def get_rbf_nn_prediction(train_data, train_truth, test_data, test_truth, centers=8, spread=1, iter_id=0): train_truth = train_truth[:,np.newaxis] test_truth = test_truth[:,np.newaxis] scaler = StandardScaler() train_truth = scaler.fit_transform(train_truth).ravel() test_truth = scaler.transform(test_truth).ravel() net = _get_nn(train_data.shape[1], spread=spread) _train_nn(net, train_data, train_truth, centers) out = net.activate_many(test_data) predicted = scaler.inverse_transform(np.array(out)) return predicted.ravel()
def test_scaler_2d_arrays():
    """Test scaling of 2d array along first axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero

    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))

    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has been copied
    assert_true(X_scaled is not X)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    X_scaled = scale(X, axis=1, with_std=False)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    X_scaled = scale(X, axis=1, with_std=True)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=1), 4 * [1.0])
    # Check that the data hasn't been modified
    assert_true(X_scaled is not X)

    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert_true(X_scaled is X)

    X = rng.randn(4, 5)
    X[:, 0] = 1.0  # first feature is a constant, non zero feature
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has been copied
    assert_true(X_scaled is not X)
class RootTransform:
    def __init__(self, root=0.5):
        self.root = root

    def fit(self, X):
        if numpy.any(X < 0):
            raise ValueError("Root Transform: All values must be greater than or equal to zero")
        xroot = (X + 1e-10) ** self.root
        self.scale = StandardScaler().fit(xroot)

    def transform(self, X):
        return self.scale.transform((X + 1e-10) ** self.root)

    def inverse_transform(self, X):
        xinv = self.scale.inverse_transform(X)
        xinv = xinv ** (1 / self.root) - 1e-10
        return xinv
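# A minimal round-trip sketch for RootTransform (synthetic non-negative data,
# not from the original source): transform followed by inverse_transform
# should recover the input.
import numpy
X_demo = numpy.random.RandomState(0).rand(20, 4)
rt = RootTransform(root=0.5)
rt.fit(X_demo)
assert numpy.allclose(rt.inverse_transform(rt.transform(X_demo)), X_demo)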
class LogTransform:
    # NOTE: despite the name, this currently applies a square-root transform;
    # the original log version is kept in the commented-out lines below.
    def __init__(self):
        pass

    def fit(self, X):
        if numpy.any(X < 0):
            raise ValueError("Log Transform: All values must be greater than or equal to zero")
        # xlog = numpy.log(X + 1e-10)
        xlog = (X + 1e-10) ** 0.5
        self.scale = StandardScaler().fit(xlog)

    def transform(self, X):
        return self.scale.transform(numpy.sqrt(X + 1e-10))

    def inverse_transform(self, X):
        xinv = self.scale.inverse_transform(X)
        # xinv = numpy.exp(xinv) - 1e-10
        xinv = xinv ** 2 - 1e-10
        return xinv
def classifyWithKmeans(num_clusters):
    client = MongoClient('localhost', 27017)
    db = client["pitchfx"]
    x = []
    for player in db.players.find():
        for year in range(2008, 2016):
            if player.get('h_%d' % year) is None or player.get('ab_%d' % year) < 100:
                continue
            x.append(kmeans_features(player, year))
    kmeans = KMeans(init='k-means++', n_clusters=num_clusters, n_init=10,
                    random_state=1000)
    vec = DictVectorizer()
    scaler = StandardScaler()
    scaler.fit(vec.fit_transform(x).toarray())
    kmeans.fit(scaler.transform(vec.transform(x).toarray()))
    # map the cluster centers back to the original feature units before printing
    print(json.dumps(vec.inverse_transform(scaler.inverse_transform(kmeans.cluster_centers_)),
                     indent=4))
    for i in range(num_clusters):
        print('cluster %d:' % i, list(kmeans.labels_).count(i))
    return (kmeans, scaler, vec)
class ImageScaler(object): """ Thin wrapper around sklearn.preprocessing.StandardScaler that works on image (and maintain their shapes). Doing per-channel scaling/centering """ def fit(self, img): """ Args: img: (width, height, nchans) """ self._scaler = StandardScaler().fit(img.reshape(-1, img.shape[2])) return self def transform(self, img): return self._scaler.transform(img.reshape(-1, img.shape[2])).reshape(*img.shape) def inverse_transform(self, img): return self._scaler.inverse_transform(img.reshape(-1, img.shape[2])).reshape(*img.shape) def __repr__(self): return 'ImageScaler(\n %s\n mean=%s\n std=%s\n)' % (self._scaler, self._scaler.mean_, self._scaler.std_)
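# A minimal usage sketch for ImageScaler (random image, not from the original
# source): per-channel statistics are computed over all pixels, and the image
# shape is preserved through the round trip.
import numpy as np
img = np.random.rand(32, 24, 3)  # (width, height, nchans)
im_scaler = ImageScaler().fit(img)
scaled_img = im_scaler.transform(img)
assert scaled_img.shape == img.shape
assert np.allclose(im_scaler.inverse_transform(scaled_img), img)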
def create_probability_grid(x_min, x_max, y_min, y_max, scaler: StandardScaler,
                            prob_granularity: float = 0.001):
    """
    Creates a np.meshgrid with (approximately) the given granularity between the
    given scaled bounds, trying to keep the number of points reasonable. Also
    transforms the grid back to longitude and latitude coordinates using the
    given scaler.
    :param x_min, x_max, y_min, y_max: bounds of the grid in scaled coordinates
    :param scaler: scaler to be used while transforming back to latitude and longitude
    :param prob_granularity: distance between each point in the grid (in degrees)
    :return: a tuple containing raw X, Y values and lat-lon values
    """
    x_min_unsc, y_min_unsc = scaler.inverse_transform((x_min, y_min))
    x_max_unsc, y_max_unsc = scaler.inverse_transform((x_max, y_max))
    x_rng = np.arange(x_min_unsc, x_max_unsc, prob_granularity)
    y_rng = np.arange(y_min_unsc, y_max_unsc, prob_granularity)
    while len(x_rng) > 400 or len(y_rng) > 400:
        # print("Too many points ({}x{}), decreasing granularity.".format(len(x_rng), len(y_rng)))
        prob_granularity *= 1.25
        x_rng = np.arange(x_min_unsc, x_max_unsc, prob_granularity)
        y_rng = np.arange(y_min_unsc, y_max_unsc, prob_granularity)
    # print("Generated {}x{} coordinate points.".format(x_rng.shape[0], y_rng.shape[0]))
    X_lon, Y_lat = np.meshgrid(x_rng, y_rng)
    x = X_lon.ravel()
    y = Y_lat.ravel()
    coords = np.hstack((x[:, np.newaxis], y[:, np.newaxis]))
    scaled = scaler.transform(coords)
    X = scaled[:, 0].reshape(X_lon.shape)
    Y = scaled[:, 1].reshape(Y_lat.shape)
    return X, Y, X_lon, Y_lat
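# A minimal usage sketch for create_probability_grid (synthetic coordinates,
# not from the original source). Note the function passes 1-D samples to
# scaler.inverse_transform, which assumes an older scikit-learn release;
# current releases require 2-D input there.
import numpy as np
from sklearn.preprocessing import StandardScaler
lonlat = np.column_stack([np.random.uniform(24.0, 25.0, 100),
                          np.random.uniform(60.0, 61.0, 100)])
sc = StandardScaler().fit(lonlat)
pts = sc.transform(lonlat)
X, Y, X_lon, Y_lat = create_probability_grid(pts[:, 0].min(), pts[:, 0].max(),
                                             pts[:, 1].min(), pts[:, 1].max(), sc)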
    go.Scatter(x=test[time_steps:].date, y=test_score_df.loss,
               mode='lines', name='Test Loss'))
fig.add_trace(
    go.Scatter(x=test[time_steps:].date, y=test_score_df.threshold,
               mode='lines', name='Threshold'))
fig.update_layout(showlegend=True)
fig.show()

anomalies = test_score_df[test_score_df.anomaly == True]
anomalies.head()

fig = go.Figure()
fig.add_trace(
    go.Scatter(x=test[time_steps:].date,
               y=scaler.inverse_transform(test[time_steps:].close),
               mode='lines', name='Close Price'))
fig.add_trace(
    go.Scatter(x=anomalies.date, y=scaler.inverse_transform(anomalies.close),
               mode='markers', name='Anomaly'))
fig.update_layout(showlegend=True)
fig.show()
X = sc_X.fit_transform(X)
sc_y = StandardScaler()
y = sc_y.fit_transform(y)

# Training the SVR model on the whole dataset
from sklearn.svm import SVR
regressor = SVR(kernel='rbf')  # we can choose the kernel; some kernels learn linear and some non-linear relationships
regressor.fit(X, y)

# Predicting a new result
# let's say we want to predict y when X = 6.5; use separate variables so the
# scaled training data in X and y is not overwritten
X_new = [[6.5]]  # regressor.predict expects a 2-D array
X_new = sc_X.transform(X_new)  # scale the input, since the model was trained on scaled X
y_new = regressor.predict(X_new)  # the prediction comes out in the scale applied to y
y_new = sc_y.inverse_transform(y_new)  # reverse the scaling applied to y with sc_y

# Visualising the SVR results
plt.scatter(sc_X.inverse_transform(X), sc_y.inverse_transform(y), color='red')  # the real points, back in their original scale
plt.plot(sc_X.inverse_transform(X), sc_y.inverse_transform(regressor.predict(X)), color='blue')  # predictions, unscaled with sc_y
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

# Visualising the SVR results (for higher resolution and smoother curve)
X_grid = np.arange(min(sc_X.inverse_transform(X)), max(sc_X.inverse_transform(X)), 0.1)
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(sc_X.inverse_transform(X), sc_y.inverse_transform(y), color='red')
plt.plot(X_grid, sc_y.inverse_transform(regressor.predict(sc_X.transform(X_grid))), color='blue')
plt.title('Truth or Bluff (SVR)')
x = data.iloc[:, 1:-1].values
y = data.iloc[:, -1].values
y = y.reshape(len(y), 1)

from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
sc_y = StandardScaler()
x = sc_x.fit_transform(x)
y = sc_y.fit_transform(y)

from sklearn.svm import SVR
reg = SVR(kernel='rbf')  # Gaussian radial basis function kernel
reg.fit(x, y.ravel())  # SVR expects a 1-D target

output = sc_y.inverse_transform(reg.predict(sc_x.transform([[6.5]])))
print(output)

plt.scatter(sc_x.inverse_transform(x), sc_y.inverse_transform(y), color='red')
plt.plot(sc_x.inverse_transform(x), sc_y.inverse_transform(reg.predict(x)), color='blue')
plt.title('Experience vs Salary')
plt.xlabel('Experience')
plt.ylabel('Salary')
plt.show()

# higher-resolution curve, plotted in the scaled space
x_grid = np.arange(min(x), max(x), 0.1)
x_grid = x_grid.reshape((len(x_grid), 1))
plt.scatter(x, y, color='red')
plt.plot(x_grid, reg.predict(x_grid), color='blue')
class DNN(object):
    def __init__(self,
                 num_layers_range: list = [1, 4, 10],
                 use_dropout: bool = False,
                 use_l2_regularization: bool = False):
        self.logger = logging.getLogger("AutoNet")
        self.num_layers_range = num_layers_range
        self.use_dropout = use_dropout
        self.use_l2_regularization = use_l2_regularization
        self.scalerX = StandardScaler()
        self.scalerY = StandardScaler()

    def fit(self, X, y, max_epochs: int, runcount_limit: int = 100,
            wc_limit: int = 60, config: Configuration = None,
            seed: int = 12345):
        X_all = None
        y_all = None
        for idx, (X_q, y_q) in enumerate(zip(X, y)):
            if idx == 0:
                X_all = X_q
                y_all = y_q
            else:
                X_all = np.vstack([X_all, X_q])
                y_all = np.hstack([y_all, y_q])

        def obj_func(config, instance=None, seed=None, pc=None):
            # continue training if pc is given; otherwise, construct a new DNN
            models = []
            losses = []
            for model_idx, [train_idx, valid_idx] in enumerate([[0, 3], [3, 0], [1, 2], [2, 1]]):
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_train = self.scalerX.fit_transform(X_train)
                y_train = np.log10(y_train)
                y_train = self.scalerY.fit_transform(y_train.reshape(-1, 1))[:, 0]

                X_valid, y_valid = X_all, y_all
                X_valid = self.scalerX.transform(X_valid)
                y_valid = np.log10(y_valid)
                y_valid = self.scalerY.transform(y_valid.reshape(-1, 1))[:, 0]

                if pc is None:
                    if model_idx == 0:
                        K.clear_session()
                    model = ParamFCNetRegression(config=config,
                                                 n_feat=X_train.shape[1],
                                                 expected_num_epochs=max_epochs,
                                                 n_outputs=1,
                                                 verbose=1)
                else:
                    model = pc[model_idx]

                history = model.train(X_train=X_train, y_train=y_train,
                                      X_valid=X_valid, y_valid=y_valid,
                                      n_epochs=1)
                models.append(model)
                final_loss = history["val_loss"][-1]
                losses.append(final_loss)

            return np.mean(losses), {"model": models}

        taf = SimpleTAFunc(obj_func)
        cs = ParamFCNetRegression.get_config_space(
            num_layers_range=self.num_layers_range,
            use_l2_regularization=self.use_l2_regularization,
            use_dropout=self.use_dropout)
        print(cs)

        ac_scenario = Scenario({
            "run_obj": "quality",  # we optimize quality
            "runcount-limit": max_epochs * runcount_limit,
            "wallclock-limit": wc_limit,
            "cost_for_crash": 10,
            "cs": cs,
            "deterministic": "true",
            "abort_on_first_run_crash": False,
            "output-dir": ""
        })

        intensifier = Intensifier(tae_runner=taf,
                                  stats=None,
                                  traj_logger=None,
                                  rng=np.random.RandomState(42),
                                  run_limit=100,
                                  max_epochs=max_epochs)

        if isinstance(config, dict):
            # fix the types of the given dict before wrapping it in a Configuration
            config = fix_types(configuration=config, configuration_space=cs)
            config = Configuration(configuration_space=cs, values=config)
        elif runcount_limit == 1:
            config = cs.get_default_configuration()
        else:
            smac = SMAC(scenario=ac_scenario,
                        tae_runner=taf,
                        rng=np.random.RandomState(seed),
                        intensifier=intensifier)
            smac.solver.runhistory.overwrite_existing_runs = True
            config = smac.optimize()

        print("Final Incumbent")
        print(config)

        X_all = self.scalerX.fit_transform(X_all)
        y_all = np.log10(y_all)
        y_all = self.scalerY.fit_transform(y_all.reshape(-1, 1))[:, 0]

        K.clear_session()
        start_time = time.time()
        model = ParamFCNetRegression(config=config,
                                     n_feat=X_all.shape[1],
                                     expected_num_epochs=max_epochs,
                                     n_outputs=1,
                                     verbose=1)
        history = model.train(X_train=X_all, y_train=y_all,
                              X_valid=X_all, y_valid=y_all,
                              n_epochs=max_epochs)
        print("Training Time: %f" % (time.time() - start_time))
        self.model = model

    def predict(self, X_test):
        X_test = self.scalerX.transform(X_test)
        y_pred = self.model.predict(X_test)
        y_pred = self.scalerY.inverse_transform(y_pred)
        y_pred = 10**y_pred
        y_pred = np.maximum(0.0005, y_pred)
        return y_pred
# Splitting the dataset into the Training set and Test set
"""from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"""

# Feature Scaling
# SVR does not apply feature scaling itself, so we scale the inputs and the
# target manually to put the training data and the output on the same scale
from sklearn.preprocessing import StandardScaler
x_scaler = StandardScaler()
y_scaler = StandardScaler()
X = x_scaler.fit_transform(X)
y = y_scaler.fit_transform(y.reshape(-1, 1))

# Let us start with the SVM model, which is very straightforward
from sklearn.svm import SVR
# the default kernel is already 'rbf', but we state it explicitly for clarity
svr = SVR(kernel='rbf')
# train model
svr.fit(X, y)

# Let us predict a value; note that the input must be transformed first
inputValue = 6.5
inputValueArr = np.array([[inputValue]])
predict = svr.predict(x_scaler.transform(inputValueArr))
predict = y_scaler.inverse_transform(predict)

plt.scatter(X, y, color='red')
plt.plot(X, svr.predict(X), color='green')
plt.title('Prediction Based on SVR')
plt.show()
X_train = X_normalizer.fit_transform(X_train)
X_test = X_normalizer.transform(X_test)

y_normalizer = StandardScaler()
y_train = y_normalizer.fit_transform(y_train)
y_test = y_normalizer.transform(y_test)

# Create the KNN regressor with K set to 2
knn = KNeighborsRegressor(n_neighbors=2)
# Start training
# ravel() flattens the multi-dimensional target into a 1-D array
knn.fit(X_train, y_train.ravel())

# Pass the test data to the trained knn object to get predictions
y_pred = knn.predict(X_test)

# Undo the standardization to restore the data to its original scale
y_pred_inv = y_normalizer.inverse_transform(y_pred)
y_test_inv = y_normalizer.inverse_transform(y_test)

# Scatter plot with predictions on the x-axis and true values on the y-axis
plt.scatter(y_pred_inv, y_test_inv)
plt.xlabel('Prediction')
plt.ylabel('Real value')

# Draw the line y = x through the origin, to show how far the blue
# (prediction, truth) points deviate from a perfect prediction
diagonal = np.linspace(500, 1500, 100)  # 100 evenly spaced values from 500 to 1500
plt.plot(diagonal, diagonal, '-r')
plt.xlabel('Predicted ask price')
plt.ylabel('Ask price')
plt.show()
print(y_pred_inv)
x_train, x_test, y_train, y_test = train_test_split(
    dp, indp, test_size=0.2, random_state=0)"""

# Feature scaling
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
sc_y = StandardScaler()
x = sc_x.fit_transform(x)
y = sc_y.fit_transform(y)

# Fitting the Regression to the dataset
from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
regressor.fit(x, y)

# Predicting a new result with the SVR regression
y_pred = sc_y.inverse_transform(
    regressor.predict(sc_x.transform(np.array([[6.5]]))))

# Visualising the SVR regression
plt.scatter(x, y, c='b')
plt.plot(x, regressor.predict(x), c='r')
plt.title('salary vs position (SVR)')
plt.xlabel('position')
plt.ylabel('salary')
plt.show()

# Visualising the SVR regression (higher resolution)
x_grid = np.arange(min(x), max(x), 0.1)
x_grid = x_grid.reshape((len(x_grid), 1))
plt.scatter(x, y, c='g')
plt.plot(x_grid, regressor.predict(x_grid), c='r')
plt.title('salary vs position (SVR)')
param_grid = [{'n_neighbors': [5, 15, 52, 168], 'weights': ['uniform']}] clf = GridSearchCV(neigh, param_grid, scoring='r2', cv=10, refit=True) # scoring='neg_mean_squared_error' clf.fit(X_scaled, Y_scaled) # loading all data points (not just the inliers) ------------------------------- X = country_data.loc[:, columns_to_consider].values.reshape(-1, len( columns_to_consider)) # comment to ignore outliers in the predictions X_scaled = scaler_X.transform(X) X_scaled[:, 0] = FR_RD_scaler * X_scaled[:, 0] Y = country_data.loc[:, ['Price', 'Demand']].values.reshape(-1, 2) # comment to ignore outliers in the predictions Y_scaled = scaler_Y.transform(Y) # wind_data_year_tech = wind_data_year_tech[inliers == 1] #uncomment to ignore outliers in the predictions # prediction and error calculation for base model Y_pred = clf.predict(X_scaled) Y_pred_org_scale = scaler_Y.inverse_transform(Y_pred) mse = mean_squared_error(Y[:, 0], Y_pred_org_scale[:, 0]) r2 = r2_score(Y[:, 0], Y_pred_org_scale[:, 0]) orders = np.argsort(X_scaled[:, 0].flatten()) X_scaled_sorted = X_scaled[orders] X_sorted_org_scale = X[orders] Y_pred_ordered = clf.predict(X_scaled_sorted) Y_pred_ordered_org_scale = scaler_Y.inverse_transform(Y_pred_ordered) # prediction and error calculation for reduced residual demand X_reduced = np.copy(X) X_reduced[:, 0] = X_reduced[:, 0] - res_gen X_reduced_scaled = scaler_X.transform(X_reduced) X_reduced_scaled[:, 0] = FR_RD_scaler * X_reduced_scaled[:, 0] Y_pred_reduced = clf.predict(X_reduced_scaled) Y_pred_reduced_org_scale = scaler_Y.inverse_transform(Y_pred_reduced)
# plot.figure(figsize=(4, 4))
plot.figure()
plot.imshow(current_ard, interpolation='nearest', aspect='auto', origin='upper')
plot.colorbar()
plot.title('Latent Dim {}'.format(q))
plot.show()
quit(0)

# Invert the normalization and view the predicted images.
show_plots = False
save_plots = False
if show_plots or save_plots:
    ground_truth = scaler.inverse_transform(test_data)
    mrd_predicted_mean = np.load(mrd_results_file)['predicted_mean']
    dp_gp_lvm_predicted_mean = np.load(dp_gp_lvm_results_file)['predicted_mean']
    mrd_predicted_images = scaler.inverse_transform(
        np.hstack((test_data[:, :num_observed_dimensions], mrd_predicted_mean)))
    dp_gp_lvm_predicted_images = scaler.inverse_transform(
        np.hstack((test_data[:, :num_observed_dimensions], dp_gp_lvm_predicted_mean)))
    # assert ground_truth.shape[0] == predicted_images.shape[0]
    for i in range(ground_truth.shape[0]):
        fig_size = (3, 2)  # (10, 5)
        fig, (ax1, ax2, ax3) = plot.subplots(nrows=1, ncols=3, sharey='row',
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # Feature Scaling from sklearn.preprocessing import StandardScaler sc_X = StandardScaler() sc_y = StandardScaler() X_train = sc_X.fit_transform(X_train) y_train = sc_y.fit_transform(y_train) # Training the SVR model on the Training set from sklearn.svm import SVR regressor = SVR(kernel='rbf') regressor.fit(X_train, y_train) # Predicting the Test set results y_pred = sc_y.inverse_transform(regressor.predict(sc_X.transform(X_test))) np.set_printoptions(precision=2) print( np.concatenate( (y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1)) # Evaluating the Model Performance from sklearn.metrics import r2_score print("Model Score") print(r2_score(y_test, y_pred))
X = data[:-1] y = data[1:] X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False) train_data_gen = TimeseriesGenerator(X_train, y_train, length=window_size, batch_size=batch_size, shuffle=False) test_data_gen = TimeseriesGenerator(X_test, y_test, length=window_size, batch_size=batch_size, shuffle=False) model = Sequential() model.add(CuDNNGRU(4, input_shape=(window_size, 1,))) model.add(Dense(1)) model.compile(loss='mean_squared_error', optimizer='adam') history = model.fit_generator(train_data_gen, epochs=epochs).history index = [df['Open'][0]] for i, d in enumerate(scaler.inverse_transform(data)): index.append(index[i] + d) index_train = [df['Open'][0]] for i, d in enumerate(scaler.inverse_transform(model.predict_generator(train_data_gen))): index_train.append(index_train[i] + d) index_test = [index_train[-1]] for i, d in enumerate(scaler.inverse_transform(model.predict_generator(test_data_gen))): index_test.append(index_test[i] + d) begin = window_size join = begin + len(index_train) end = join + len(index_test) plt.plot(index) plt.plot(list(range(begin, join)), index_train)
# Import KNeighborsRegressor (the K-nearest-neighbours regressor) from sklearn.neighbors.
from sklearn.neighbors import KNeighborsRegressor

# Initialize a KNN regressor configured to predict with plain averaging: weights='uniform'.
uni_knr = KNeighborsRegressor(weights='uniform')
uni_knr.fit(X_train, y_train)
uni_knr_y_predict = uni_knr.predict(X_test)

# Initialize a KNN regressor configured to predict with distance-weighted averaging: weights='distance'.
dis_knr = KNeighborsRegressor(weights='distance')
dis_knr.fit(X_train, y_train)
dis_knr_y_predict = dis_knr.predict(X_test)

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Evaluate the uniformly weighted KNN model on the test set with the
# R-squared, MSE and MAE metrics.
print('R-squared value of uniform-weighted KNeighborRegression:',
      uni_knr.score(X_test, y_test))
print('The mean squared error of uniform-weighted KNeighborRegression:',
      mean_squared_error(ss_y.inverse_transform(y_test),
                         ss_y.inverse_transform(uni_knr_y_predict)))
print('The mean absolute error of uniform-weighted KNeighborRegression:',
      mean_absolute_error(ss_y.inverse_transform(y_test),
                          ss_y.inverse_transform(uni_knr_y_predict)))

# Evaluate the distance-weighted KNN model on the test set with the
# R-squared, MSE and MAE metrics.
print('R-squared value of distance-weighted KNeighborRegression:',
      dis_knr.score(X_test, y_test))
print('The mean squared error of distance-weighted KNeighborRegression:',
      mean_squared_error(ss_y.inverse_transform(y_test),
                         ss_y.inverse_transform(dis_knr_y_predict)))
print('The mean absolute error of distance-weighted KNeighborRegression:',
      mean_absolute_error(ss_y.inverse_transform(y_test),
                          ss_y.inverse_transform(dis_knr_y_predict)))
from matplotlib.colors import ListedColormap X_set, y_set = X_train, y_train #X_set, y_set = X_test, y_test X1_n, X2_n = np.meshgrid( np.arange(start=X_set[:, 0].min(), stop=X_set[:, 0].max() + 1, step=(abs(X_set[:, 0].min()) + abs(X_set[:, 0].max() + 1)) / 1000), #step = 1), np.arange(start=X_set[:, 1].min(), stop=X_set[:, 1].max() + 1, step=(abs(X_set[:, 1].min()) + abs(X_set[:, 1].max() + 1)) / 1000)) #step = 10000)) X_set, y_set = sc_X.inverse_transform(X_train), y_train #X_set, y_set = sc_X.inverse_transform(X_test), y_test X1, X2 = np.meshgrid( np.arange(start=X_set[:, 0].min(), stop=X_set[:, 0].max() + 10, step=(abs(X_set[:, 0].max() + 10 - abs(X_set[:, 0].min())) / 1000)), np.arange( start=X_set[:, 1].min(), stop=X_set[:, 1].max() + 10000, #step = 0.01)) step=(abs(X_set[:, 1].max() + 10000 - abs(X_set[:, 1].min())) / 1000))) plt.contourf(X1, X2, classifier.predict(np.array([X1_n.ravel(), X2_n.ravel()
""" Created on Mon Apr 15 02:50:11 2019 @author: mohamed nabil """ import numpy as np import pandas as pd import matplotlib.pyplot as plt file = pd.read_csv("Position_Salaries.csv") x = file.iloc[:, 1:2].values y = file.iloc[:, 2:].values #applying feature scaling from sklearn.preprocessing import StandardScaler sc_X = StandardScaler() x = sc_X.fit_transform(x) sc_Y = StandardScaler() y = sc_Y.fit_transform(y) from sklearn.svm import SVR # you should use a suitable kernel model = SVR(kernel='rbf') model.fit(x, y) y_pred = model.predict(x) plt.scatter(sc_X.inverse_transform(x), sc_Y.inverse_transform(y), color="blue") plt.plot(sc_X.inverse_transform(x), sc_Y.inverse_transform(model.predict(x)), color="yellow") plt.show()
ssy = StandardScaler().fit(y_train)
y_train_std = ssy.transform(y_train)
y_test_std = ssy.transform(y_test)

'''preprocessing: add a column of ones for the bias term'''
ones = np.ones(X_train_std.shape[0]).reshape(-1, 1)
X_train_std = np.concatenate((ones, X_train_std), axis=1)
ones = np.ones(X_test_std.shape[0]).reshape(-1, 1)
X_test_std = np.concatenate((ones, X_test_std), axis=1)

# no ridge
# train
Lin = OrdinaryLinearRegression(Ridge=False)
Lin.fit(X_train_std, y_train_std)
y_pred_train_std = Lin.predict(X_train_std)
y_pred_train = ssy.inverse_transform(y_pred_train_std)
Base_MSE_train = Lin.score(y_train, y_pred_train)
# test
y_pred_test_std = Lin.predict(X_test_std)
y_pred_test = ssy.inverse_transform(y_pred_test_std)
Base_MSE_test = Lin.score(y_test, y_pred_test)

# with ridge
Lambda_list = np.arange(0, 2, 0.001)
MSE_list = []
for lambdaval in Lambda_list:
    Lin = OrdinaryLinearRegression(Ridge=True, Lambda=lambdaval)
    Lin.fit(X_train_std, y_train_std)
    # y_pred_train_std = Lin.predict(X_train_std)
    # y_pred_train = ssy.inverse_transform(y_pred_train_std)
    # Base_MSE_train_ridge = Lin.score(y_train, y_pred_train)
    # test
N, D = X_train.shape

y_train_A = y_train_A.reshape(-1)
y_train_B = y_train_B.reshape(-1)
y_test_A = y_test_A.reshape(-1)
y_test_B = y_test_B.reshape(-1)

# from sklearn.svm import SVR
# model_A = SVR(kernel = 'rbf')
# model_A.fit(X_train, y_train_A)
# model_B = SVR(kernel = 'rbf')
# model_B.fit(X_train, y_train_B)

# predict both channels while X_test is still in the scaled space,
# then invert the feature scaling exactly once
y_pred_A = model_A.predict(X_test)
y_pred_A = objy.inverse_transform(y_pred_A)
y_test_A = objy.inverse_transform(y_test_A)
print(y_test_A.shape)
print(y_pred_A.shape)

y_pred_B = model_B.predict(X_test)
y_pred_B = objy.inverse_transform(y_pred_B)
y_test_B = objy.inverse_transform(y_test_B)
print(y_test_B.shape)
print(y_pred_B.shape)

X_test = obj.inverse_transform(X_test)

shape = (174, 142, 1)
imageL = X_test[:, 0].reshape(shape)
imagea = y_pred_A.reshape(shape)
hist = model.fit(xtrain, ytrain,
                 epochs=epo,
                 batch_size=64,
                 callbacks=[cbks],
                 validation_freq=epostep,
                 validation_data=(xtest, ytest),
                 verbose=2)

stop = time.process_time()
print("Time for training: ", stop - start)

trainlossall = np.array(hist.history['mean_absolute_error'])
testlossall = np.array(hist.history['val_mean_absolute_error'])

# Predict LUMO with the model and map predictions back to the original units
pred_test = scaler.inverse_transform(model.predict(xtest))
true_test = scaler.inverse_transform(ytest)
mae_valid = np.mean(np.abs(pred_test - true_test))

# Plot loss vs epochs
plt.figure()
plt.plot(np.arange(trainlossall.shape[0]), trainlossall, label='Training Loss', c='blue')
plt.plot(np.arange(epostep, epo + epostep, epostep), testlossall, label='Test Loss', c='red')
plt.scatter([trainlossall.shape[0]], [mae_valid],
            label="{0:0.4f} ".format(mae_valid) + "[" + data_unit + "]",
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_Y = StandardScaler()
X = sc_X.fit_transform(X)
Y = sc_Y.fit_transform(Y)
#print(X)
#print(Y)

# Fitting SVR to the dataset
from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
regressor.fit(X, Y)

# Predicting a new result
y_pred = regressor.predict(sc_X.transform(np.array([6.5]).reshape(-1, 1)))
print("X: 6.5, Y: %8.8f" % sc_Y.inverse_transform(y_pred))

# Visualising the regression results
plt.scatter(sc_X.inverse_transform(X), sc_Y.inverse_transform(Y), color='red')

# Use a finer grid to make the prediction curve smoother; keep it in a
# separate variable so the scaled training data in X is not overwritten
X_grid = np.arange(min(X), max(X), 0.1)
X_grid = X_grid.reshape(-1, 1)
plt.plot(sc_X.inverse_transform(X_grid),
         sc_Y.inverse_transform(regressor.predict(X_grid)),
         color='blue')
plt.title('Support Vector Regression')
plt.xlabel('Level')
plt.ylabel('Salary')
plt.show()
plt.plot(np.arange(len(fake_ex)) * 20., fake_ex)
plt.show()

# Store losses
losses = {'D': [], 'A': []}
acc = {'D': [], 'A': []}

# Main training loop
epochs = 50
batch_size = 40
for i in range(epochs):
    if (i % 5 == 0):
        fake_ex = G.predict(np.random.uniform(-1, 1, size=(1, 100)))
        fake_ex = tf.inverse_transform(fake_ex * scale)
        plot_losses(i, losses, fake_ex[0])
    print('%d of %d' % (i, epochs))

    # Make noise to feed into GAN, label with counterfeit 1's
    noise_net = np.random.uniform(-1, 1, size=(batch_size, 100))
    noise_label = np.ones(len(noise_net))

    # Freeze weights of D
    set_trainability(G, True)
    set_trainability(D, False)
    # Compilation to ensure they actually freeze
    A.compile(loss='binary_crossentropy', optimizer=optA, metrics=['accuracy'])
    D.compile(loss='binary_crossentropy', optimizer=optD, metrics=['accuracy'])
#### cluster the data
###############################################################
ll_arr = np.asarray(list(zip(df["lat"], df["lng"])), dtype="float64")
stdScaler = StandardScaler()
ll_arr = stdScaler.fit_transform(ll_arr)

k = int(math.sqrt(len(df.lat)))  # rule of thumb; a better k could be chosen
k_means = cluster.KMeans(n_clusters=k)
k_means.fit_predict(ll_arr)

data_labels = k_means.labels_
# map the cluster centers back to the original lat/lng units
data_cluster_centers = stdScaler.inverse_transform(k_means.cluster_centers_)

n_clusters = len(data_cluster_centers)
data_num_each_cluster = np.zeros((n_clusters, 1))
for i in range(n_clusters):
    data_num_each_cluster[i, 0] = (data_labels == i).sum()

###############################################################
#### save results
###############################################################
records = {
    "labels": data_labels.tolist(),
    "centers": data_cluster_centers.tolist(),
    "size": data_num_each_cluster.tolist(),
    "date_created": datetime.datetime.today(),
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))
preds = regressor.predict(X_test)

# Test Set Metrics for Performance Evaluation
# R2 Score metric
from sklearn.metrics import r2_score
print(r2_score(y_test, preds))
# MSE Score
#from sklearn.metrics import mean_squared_error
#print(mean_squared_error(y_test, preds))
# MAE Score
from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(y_test, preds))

# Visualization: undo the scaling so the plot is in the original units
y_test = sc.inverse_transform(y_test)
preds = sc.inverse_transform(preds)

# Plot of a Small Subset of the Test Set
plt.plot(y_test[0:720], color='blue', label='Real voltage')
plt.plot(preds[0:720], color='red', label='Predicted voltage')
plt.title('output - Univariate Single Step Forecasting')
plt.xlabel('Hours')
plt.ylabel('output')
plt.legend()
plt.show()
#plt.savefig('images/10_06.png', dpi=300) plt.show() print('Slope: %.3f' % lr.w_[1]) print('Intercept: %.3f' % lr.w_[0]) num_rooms_std = sc_x.transform(np.array([[5.0]])) price_std = lr.predict(num_rooms_std) print("Price in $1000s: %.3f" % sc_y.inverse_transform(price_std)) # ## Estimating the coefficient of a regression model via scikit-learn slr = LinearRegression() slr.fit(X, y) y_pred = slr.predict(X) print('Slope: %.3f' % slr.coef_[0])
""" import pandas as pd import matplotlib.pyplot as plt import numpy as np data = pd.read_csv("Position_Salaries.csv") x = data.iloc[:, 1:2].values y = data.iloc[:, 2].values from sklearn.preprocessing import StandardScaler sc_x = StandardScaler() sc_y = StandardScaler() x = sc_x.fit_transform(x) y = sc_y.fit_transform(np.reshape(y, (10, 1))) from sklearn.svm import SVR regressor = SVR(kernel='rbf') regressor.fit(x, y) y_pred = sc_y.inverse_transform(regressor.predict(sc_x.transform([[6.5]]))) plt.scatter(x, y, color='red') plt.plot(x, regressor.predict(x), color='blue') plt.title('truth or bluff (SVR)') plt.xlabel('position level') plt.ylabel('salary') plt.show()
with model.session: model.train(trnX_L=trnX_L, trnXs_L=trnXs_L, trnY_L=trnY_L, trnX_U=trnX_U, trnXs_U=trnXs_U, valX_L=valX_L, valXs_L=valXs_L, valY_L=valY_L, valX_U=valX_U, valXs_U=valXs_U) model.saver.save(model.session, save_uri) ## property prediction performance tstY_hat = scaler_Y.inverse_transform(model.predict(tstX)) for j in range(dim_y): print([j, mean_absolute_error(tstY[:, j], tstY_hat[:, j])]) ## unconditional generation for t in range(10): smi = model.sampling_unconditional() print([t, smi, get_property(smi)]) ## conditional generation (e.g. MolWt=250) yid = 0 ytarget = 250. ytarget_transform = (ytarget - scaler_Y.mean_[yid]) / np.sqrt( scaler_Y.var_[yid])
labelencoder = LabelEncoder()
X[:, 2] = labelencoder.fit_transform(X[:, 2])
# use the encoders fitted on the training data for the test data
Z[:, 2] = labelencoder.transform(Z[:, 2])
onehotencoder = OneHotEncoder(categorical_features=[2])
X = onehotencoder.fit_transform(X).toarray()
Z = onehotencoder.transform(Z).toarray()

from sklearn.preprocessing import StandardScaler
fsx = StandardScaler()
fsy = StandardScaler()
X = fsx.fit_transform(X)
Z = fsx.transform(Z)
y = fsy.fit_transform(y)

from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=10, criterion='mse')
regressor = regressor.fit(X, y)
y_pred = regressor.predict(Z)
y_pred = fsy.inverse_transform(y_pred)

new_dataset = test_dataset.drop([
    'Item_Weight', 'Item_Visibility', 'Item_Fat_Content', 'Item_Type',
    'Item_MRP', 'Outlet_Establishment_Year', 'Outlet_Size',
    'Outlet_Location_Type', 'Outlet_Type'
], axis=1)
new_dataset['Item_Outlet_Sales'] = y_final
new_dataset.to_csv('C:/Users/bhupe/Desktop/Big Mart/Prt.csv', index=False)
class GaussianProcessRegression(BaseEstimator, RegressorMixin, StanCacheMixin): def __init__(self, n_jobs=-1, warmup=1000, samples_per_chain=1000, n_chains=4, normalize=True, max_samples_mem=500): BaseEstimator.__init__(self) StanCacheMixin.__init__(self, MODEL_DIR) self.stan_model, self.predict_model = self._load_compiled_models() # The control parameters for NUTS, most are left as default control = { "metric": "diag_e", # Type of mass matrix (diag_e default) "stepsize_jitter": 0.05, # Slight randomization of stepsizes "adapt_engaged": True, "adapt_gamma": 0.05, # Regularization scale "adapt_delta": 0.8, # Target acceptance probability (.8 default) "adapt_kappa": 0.75, # Relaxation exponent "adapt_t0": 10, # Adaptation iteration offset "adapt_init_buffer": 75, # First fast adapt period "adapt_term_buffer": 50, # Last fast adapt period "adapt_window": 25, # First slow adapt period "max_treedepth": 10, # N_leapfrog ~ 2**max_treedepth } self.stan_fitting_kwargs = { "chains": n_chains, "iter": samples_per_chain + warmup, "warmup": warmup, "init": "random", "init_r": 1.0, "n_jobs": n_jobs, "control": control } self._fit_results = None self._fit_X = None self.normalize = normalize self.max_samples_mem = max_samples_mem if normalize: self._y_ss = StandardScaler(with_mean=True) self._X_ss = StandardScaler() return def _posterior(self, X, **stan_fitting_kwargs): N, M = X.shape Xt = self._fit_X Nt = Xt.shape[0] if self.normalize: X = self._X_ss.transform(X) y0, alpha, rho, nu, f, sigma = self._get_param_posterior() # Ensure we don't use an excessive amount of memory mem_samples = (len(y0) * 8 * N**2) / 1e6 ss = int(1 + (mem_samples // self.max_samples_mem)) # Subsample K = len(y0[::ss]) data = { "Nt": Nt, "N": N, "M": M, "K": K, "X": X, "Xt": Xt, "alpha": alpha[::ss], "rho": rho[::ss], "nu": nu[::ss], "sigma": sigma[::ss], "f": f[::ss], "y0": y0[::ss] } fit_kwargs = self._setup_predict_kwargs(data, stan_fitting_kwargs) fit_kwargs["iter"] = 1 fit_kwargs["chains"] = 1 predictions = self.predict_model.sampling(**fit_kwargs, algorithm="Fixed_param") y_samples = predictions.extract("y_samples")["y_samples"][0, ...] 
        y_hat = predictions.extract("y_hat")["y_hat"].ravel()

        if self.normalize:
            y_samples = np.vstack(
                [self._y_ss.inverse_transform(y_s) for y_s in y_samples])
            y_hat = self._y_ss.inverse_transform(y_hat)

        return y_hat, y_samples

    def _get_param_posterior(self):
        if self._fit_results is None:
            raise NotFittedError("Model isn't fit!")
        df = self._fit_results.to_dataframe()

        y0 = df.loc[:, "y0"].to_numpy()
        alpha = df.loc[:, "alpha"].to_numpy()
        rho = df.loc[:, "rho"].to_numpy()
        nu = df.loc[:, "nu"].to_numpy()
        sigma = df.loc[:, "sigma"].to_numpy()
        f = df.loc[:, [c for c in df.columns if c[:2] == "f["]].to_numpy()
        return y0, alpha, rho, nu, f, sigma

    def fit(self, X, y, sample_weight=None, **stan_fitting_kwargs):
        if sample_weight is not None:
            raise NotImplementedError("sample weighting is not implemented.")

        N, M = X.shape
        if self.normalize:
            y = self._y_ss.fit_transform(y)
            X = self._X_ss.fit_transform(X)

        y = y.ravel()
        data = {"N": N, "M": M, "X": X, "y": y}
        pars = ["y0", "alpha", "rho", "nu", "sigma", "f"]

        stan_fitting_kwargs.update({"pars": pars})
        fit_kwargs = self._setup_predict_kwargs(data, stan_fitting_kwargs)
        self._fit_results = self.stan_model.sampling(**fit_kwargs)
        self._fit_X = X
        print(
            self._fit_results.stansummary(
                pars=["y0", "alpha", "rho", "nu", "sigma"],
                probs=[0.1, 0.5, 0.9]))
        return

    def predict(self, X, ret_posterior=False, **stan_fitting_kwargs):
        y_hat, y_samples = self._posterior(X, **stan_fitting_kwargs)
        if ret_posterior:
            return y_hat, y_samples
        return y_hat

    def plot_posterior_params(self, show=False):
        """
        A helper method to plot the posterior parameter distribution.
        Will raise an error if .fit hasn't been called.
        """
        param_df = self._fit_results.to_dataframe()
        col_names = ["y0", "alpha", "rho", "nu", "sigma"]
        var_names = [
            "$y_0$", "$\\alpha$", "$\\rho$", "$\\mathsf{log}_{10}(\\nu)$",
            "$\\sigma$"
        ]
        param_df.loc[:, "nu"] = np.log10(param_df.loc[:, "nu"])

        param_df = param_df.loc[:, col_names]
        param_df = param_df.rename(
            {frm: to for frm, to in zip(col_names, var_names)}, axis=1)

        fig, ax = plt.subplots(1, 1)
        ax.set_title(
            "Parameter Posterior Marginals: "
            "$y \\sim \\mathcal{T}(\\nu, y_0 + \\mathcal{GP}(\\alpha, \\rho), "
            "\\sigma)$")
        sns.boxplot(data=param_df.melt(value_name="Posterior Samples",
                                       var_name="Parameter"),
                    x="Parameter",
                    y="Posterior Samples",
                    ax=ax)
        if show:
            plt.show()
        return fig, ax
clf = LogisticRegression() scaler = StandardScaler() # create a linear model with LogisticRegression model = LinearModel(clf) # fit the classifier on MEG data X = scaler.fit_transform(meg_data) model.fit(X, labels) # Extract and plot spatial filters and spatial patterns for name, coef in (('patterns', model.patterns_), ('filters', model.filters_)): # We fitted the linear model onto Z-scored data. To make the filters # interpretable, we must reverse this normalization step coef = scaler.inverse_transform([coef])[0] # The data was vectorized to fit a single model across all time points and # all channels. We thus reshape it: coef = coef.reshape(len(meg_epochs.ch_names), -1) # Plot evoked = EvokedArray(coef, meg_epochs.info, tmin=epochs.tmin) evoked.plot_topomap(title='MEG %s' % name) ############################################################################### # Let's do the same on EEG data using a scikit-learn pipeline X = epochs.pick_types(meg=False, eeg=True) y = epochs.events[:, 2]
# Feature Scaling
# there is no train/test split here, so we scale the full dataset
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)  # fit_transform scales the matrix X
y = sc_y.fit_transform(y)

# Fitting the Regression Model to the dataset
from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
regressor.fit(X, y)

# Predicting a new result
# we also want the prediction back in the original salary scale
y_pred = sc_y.inverse_transform(regressor.predict(sc_X.transform(np.array([[6.5]]))))
# transform() expects a 2-D array, hence the double brackets for a single value

# Visualising the SVR results
plt.scatter(X, y, color='red')
plt.plot(X, regressor.predict(X), color='blue')
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

# Visualising the SVR results (for higher resolution and smoother curve)
X_grid = np.arange(min(X), max(X), 0.01)  # 0.01 instead of 0.1 because the data is feature scaled
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(X, y, color='red')
plt.plot(X_grid, regressor.predict(X_grid), color='blue')
plt.title('Truth or Bluff (SVR)')
def main(argv):
    dbscan_heuristic_mode = False
    dpgmm_mode = False
    do_plot_clusters = False
    do_dump_clusters = False
    try:
        opts, args = getopt.getopt(argv, "hegdp")
    except getopt.GetoptError:
        print('elviz_cluster.py [-h] [-e] [-g] [-d] [-p]')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('elviz_cluster.py [-h] [-e] [-g] [-d] [-p]')
            print('   -h = help, -e = run dbscan' +
                  ' epsilon heuristic plot generation code')
            print('   -g = use a DPGMM for clustering')
            print('   -p = plot the clusters to a PDF file')
            print('   -d = dump the clusters to a text file')
            sys.exit()
        elif opt == '-e':
            dbscan_heuristic_mode = True
        elif opt == '-g':
            dpgmm_mode = True
        elif opt == '-p':
            do_plot_clusters = True
        elif opt == '-d':
            do_dump_clusters = True

    [elviz_data, combined_df] = read_pickle_or_CSVs(DATA_PICKLE, RAW_DATA_DIR)

    # Setup plotting limits
    print("determining plotting limits")
    limits = {"x": [combined_df['Average fold'].min(), MAX_AVG_FOLD],
              "y": [combined_df['Reference GC'].min(),
                    combined_df['Reference GC'].max()]}
    # Below changed in favor of fixed MAX
    # limits["x"] = [combined_df['Average fold'].min(), combined_df['Average fold'].max()]

    print("normalizing data prior to clustering")
    # normalize the combined data to retrieve the normalization parameters
    scaler = StandardScaler().fit(combined_df[CLUSTER_COLUMNS])

    if dbscan_heuristic_mode:
        print("making DBSCAN heuristic plots")
        dbscan_heuristic(elviz_data, scaler)
        sys.exit()

    print("serially processing files")
    for filename in elviz_data.keys():
        pdf_filename = filename.replace("csv", "pdf")
        # skip if the PDF already exists
        if os.path.isfile(RESULTS_DIR + pdf_filename):
            print("skipping file %s" % filename)
            continue
        print("processing file %s" % filename)

        df = elviz_data[filename]

        # create a multipage PDF for storing the plots
        with PdfPages(RESULTS_DIR + pdf_filename) as pdf:
            # find unique values of taxonomy columns
            dfgb = df.groupby(['Kingdom', 'Phylum', 'Class', 'Order',
                               'Family', 'Genus', 'Species'])
            for key in dfgb.indices.keys():
                idx = dfgb.indices[key]
                tax_rows = df.iloc[idx]
                if len(tax_rows) < MIN_ROWS:
                    continue

                # normalize all dimensions used in clustering, e.g. GC, coverage, rpk;
                # reuse the scaler we created from all of the data for the transform
                tax_rows_cluster_columns = scaler.transform(tax_rows[CLUSTER_COLUMNS])

                if not dpgmm_mode:
                    db = DBSCAN(eps=EPS, min_samples=MIN_SAMPLES)
                    db.fit(tax_rows_cluster_columns)
                    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
                    core_samples_mask[db.core_sample_indices_] = True
                    labels = db.labels_
                else:
                    db = mixture.DPGMM(n_components=DPGMM_N_COMPONENTS,
                                       n_iter=100, covariance_type='full',
                                       alpha=100, verbose=0)
                    db.fit(tax_rows_cluster_columns)
                    Y_ = db.predict(tax_rows_cluster_columns)
                    for i, (mean, covar) in enumerate(zip(db.means_, db._get_covars())):
                        if not np.any(Y_ == i):
                            continue
                        #plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)
                    labels = Y_
                    core_samples_mask = np.zeros_like(labels, dtype=bool)
                    core_samples_mask[:] = True

                # number of clusters in labels, ignoring noise if present
                n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
                if n_clusters_ < 1:
                    continue

                title = ', '.join(key)
                if do_plot_clusters:
                    plot_clusters(pdf,
                                  scaler.inverse_transform(tax_rows_cluster_columns),
                                  title, labels, core_samples_mask, limits)
                if do_dump_clusters:
                    dump_clusters(filename, key, labels, tax_rows[CONTIG_COLUMN])
est = ensemble.RandomForestRegressor()
gs = GridSearchCV(est, cv=10, param_grid=hyper_params, verbose=2,
                  n_jobs=n_jobs, scoring='r2')

t0 = time.time()
gs.fit(x_train, y_train.ravel())
runtime = time.time() - t0
print("Complexity and bandwidth selected and model fitted in %.6f s" % runtime)

train_score_mse = mean_squared_error(
    sc_y.inverse_transform(y_train),
    sc_y.inverse_transform(gs.predict(x_train)))
train_score_mae = mean_absolute_error(
    sc_y.inverse_transform(y_train),
    sc_y.inverse_transform(gs.predict(x_train)))
train_score_evs = explained_variance_score(
    sc_y.inverse_transform(y_train),
    sc_y.inverse_transform(gs.predict(x_train)))
train_score_me = max_error(
    sc_y.inverse_transform(y_train),
    sc_y.inverse_transform(gs.predict(x_train)))
train_score_r2 = r2_score(
    sc_y.inverse_transform(y_train),
    sc_y.inverse_transform(gs.predict(x_train)))

test_score_mse = mean_squared_error(
    sc_y.inverse_transform(y_test),
    sc_y.inverse_transform(gs.predict(x_test)))
test_score_mae = mean_absolute_error(
    sc_y.inverse_transform(y_test),
    sc_y.inverse_transform(gs.predict(x_test)))
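# The metric blocks above repeat the same sc_y.inverse_transform pairing for
# every score.  A small hypothetical helper (not part of the original
# script) that un-scales once and evaluates everything on the original
# target scale:
import numpy as np
from sklearn.metrics import (mean_squared_error, mean_absolute_error,
                             explained_variance_score, r2_score)

def scores_on_original_scale(sc_y, y_true_scaled, y_pred_scaled):
    # Un-scale once, then compute all metrics; works for 1-D or
    # single-column 2-D inputs.
    y_true = sc_y.inverse_transform(np.asarray(y_true_scaled).reshape(-1, 1)).ravel()
    y_pred = sc_y.inverse_transform(np.asarray(y_pred_scaled).reshape(-1, 1)).ravel()
    return {"mse": mean_squared_error(y_true, y_pred),
            "mae": mean_absolute_error(y_true, y_pred),
            "evs": explained_variance_score(y_true, y_pred),
            "r2": r2_score(y_true, y_pred)}

# e.g.: scores_on_original_scale(sc_y, y_test, gs.predict(x_test))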
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y.reshape(-1, 1)).ravel()  # scalers expect a 2-D array

# Fitting SVR to the dataset
from sklearn.svm import SVR
regressor = SVR(kernel = 'rbf')
regressor.fit(X, y)

# Predicting a new result
# predict() expects a 2-D array in the scaled feature space, not a bare scalar
y_pred = regressor.predict(sc_X.transform(np.array([[6.5]])))
y_pred = sc_y.inverse_transform(y_pred.reshape(-1, 1))

# Visualising the SVR results
plt.scatter(X, y, color = 'red')
plt.plot(X, regressor.predict(X), color = 'blue')
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

# Visualising the SVR results (for higher resolution and smoother curve)
X_grid = np.arange(min(X), max(X), 0.01)  # 0.01 step instead of 0.1 because the data is feature scaled
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(X, y, color = 'red')
plt.plot(X_grid, regressor.predict(X_grid), color = 'blue')
plt.title('Truth or Bluff (SVR)')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"""
# (the trailing triple-quote closes a commented-out train/test split block)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y.reshape(-1, 1))

# Fitting SVR to the dataset
from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
regressor.fit(X, y.ravel())  # SVR expects a 1-D target

# Predicting a new result
y_pred = sc_y.inverse_transform(
    regressor.predict(sc_X.transform(np.array([[6.5]]))).reshape(-1, 1))

# Visualising the SVR results
plt.scatter(X, y, color='red')
plt.plot(X, regressor.predict(X), color='blue')
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

# Visualising the SVR results (for higher resolution and smoother curve)
X_grid = np.arange(min(X), max(X), 0.01)  # 0.01 step instead of 0.1 because the data is feature scaled
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(X, y, color='red')
class BayesLinearRegression(BaseEstimator, RegressorMixin, StanCacheMixin):
    def __init__(self, n_jobs=-1, warmup=1000, samples_per_chain=1000,
                 n_chains=4, normalize=True, max_samples_mem=500):
        """
        An interface to the following stan model:

            y0 ~ cauchy(0, 1);
            nu ~ cauchy(0, 1);
            sigma ~ normal(0, 1);  // half-normal
            lam ~ exponential(1);
            theta ~ normal(0, lam);
            y ~ student_t(nu, y0 + Q * theta, sigma);

        params:
            n_jobs: Number of cores to use
            warmup: Number of warmup iterations for HMC, roughly
                analogous to a burn-in period.
            samples_per_chain: Number of samples to draw per chain
            n_chains: Number of chains (should run at least 2)
            normalize: Whether to normalize the data before feeding it
                to stan.  This is necessary as the priors in the model
                are fixed.
            max_samples_mem: A parameter to prevent blowing up all the
                memory when sampling the posterior predictive.
        """
        BaseEstimator.__init__(self)
        StanCacheMixin.__init__(self, MODEL_DIR)

        self.stan_model, self.predict_model = self._load_compiled_models()

        self.stan_fitting_kwargs = {
            "chains": n_chains,
            "iter_sampling": samples_per_chain,
            "iter_warmup": warmup,
            "inits": 1,
            "metric": "diag_e",
            "adapt_delta": 0.8}

        self._fit_results = None
        self.normalize = normalize
        self.max_samples_mem = max_samples_mem

        if normalize:
            self._y_ss = StandardScaler()
            self._X_ss = StandardScaler()
        return

    def get_results(self, params=None, results_obj=None):
        if results_obj is None:
            results_obj = self._fit_results
        param_df = results_obj.get_drawset(params)
        # Rename e.g. "beta.1" to "beta[1]"
        param_df = param_df.rename(
            {param: "[".join(param.split(".")) + "]"
             for param in param_df.columns if "." in param},
            axis="columns")
        return param_df

    def extract_ary(self, param, results_obj=None):
        param_df = self.get_results([param], results_obj)
        return param_df[param].to_numpy()

    def _posterior(self, X, **stan_fitting_kwargs):
        N, M = X.shape
        if self.normalize:
            X = self._X_ss.transform(X)

        y0, beta, sigma, nu = self._get_param_posterior()

        # Ensure we don't use an excessive amount of memory
        # TODO: max_samples_mem is a massive underestimate of
        # TODO: the amount of memory used, why?
        mem_samples = len(y0) * 8 * N / 1e6
        ss = int(1 + (mem_samples // self.max_samples_mem))  # Subsample
        K = len(y0[::ss])
        data = {"N": N, "M": M, "K": K, "beta": beta[::ss], "y0": y0[::ss],
                "sigma": sigma[::ss], "X": X, "nu": nu[::ss]}

        fit_kwargs = stan_fitting_kwargs
        fit_kwargs["iter_sampling"] = 1
        fit_kwargs["data"] = data
        fit_kwargs["fixed_param"] = True

        predictions = self.predict_model.sample(**fit_kwargs)

        y_samples = self.extract_ary("y", predictions)[0, ...]
        y_hat = self.extract_ary("y_hat", predictions).ravel()

        if self.normalize:
            y_samples = np.vstack(
                [self._y_ss.inverse_transform(y_s) for y_s in y_samples])
            y_hat = self._y_ss.inverse_transform(y_hat)
        return y_hat, y_samples

    def _get_param_posterior(self):
        if self._fit_results is None:
            raise NotFittedError("Model isn't fit!")
        df = self.get_results()
        M = sum(c[:4] == "beta" for c in df.columns)
        y0 = df.loc[:, "y0"].to_numpy()
        beta = df.loc[:, [f"beta[{j}]" for j in range(1, M + 1)]].to_numpy()
        sigma = df.loc[:, "sigma"].to_numpy()
        nu = df.loc[:, "nu"].to_numpy()
        return y0, beta, sigma, nu

    def fit(self, X, y, sample_weight=None, **stan_fitting_kwargs):
        """
        "Fit" the model, that is, sample from the posterior.
        params:
            X (n_examples, m_features): Regressors
            y (n_examples): The targets
            sample_weight: NotImplemented
            stan_fitting_kwargs: To be passed to the Stan model's
                sample method
        """
        if sample_weight is not None:
            raise NotImplementedError("Sample weighting is not implemented.")

        N, M = X.shape
        if self.normalize:
            y = self._y_ss.fit_transform(y)
            X = self._X_ss.fit_transform(X)

        y = y.ravel()
        data = {"N": N, "M": M, "X": X, "y": y}

        fit_kwargs = self._setup_predict_kwargs(data, stan_fitting_kwargs)
        self._fit_results = self.stan_model.sample(**fit_kwargs)
        print(self._fit_results.summary())
        print(self._fit_results.diagnose())
        print(self._fit_results)
        return

    def predict(self, X, ret_posterior=False, **stan_fitting_kwargs):
        """
        Produce samples from the predictive distribution.  This can be
        used for either prior predictive checks or for posterior
        predictions.

        params:
            X (n_examples, m_features): Regressors
            ret_posterior: Whether or not to return all the posterior
                samples.  If false, we only return the posterior mean,
                which is dramatically faster.
            stan_fitting_kwargs: kwargs for the Stan model's sample
                method

        returns:
            y_hat (n_examples), y_samples (k_samples, n_examples)
                -- (if ret_posterior=True)
            y_hat (n_examples) -- (otherwise)
        """
        y0, beta, _, _ = self._get_param_posterior()
        y0_mean = np.mean(y0)
        beta_mean = np.mean(beta, axis=0)

        if self.normalize:
            y_hat = y0_mean + self._X_ss.transform(X) @ beta_mean
            y_hat = self._y_ss.inverse_transform(y_hat)
        else:
            y_hat = y0_mean + X @ beta_mean

        if ret_posterior:
            y_hat, y_samples = self._posterior(X, **stan_fitting_kwargs)
            return y_hat, y_samples
        else:
            return y_hat

    def plot_posterior_params(self, show=False):
        """
        A helper method to plot the posterior parameter distribution.
        Will raise an error if .fit hasn't been called.
        """
        param_df = self.get_results()
        M = sum([c[:4] == "beta" for c in param_df.columns])
        col_names = (["y0", "sigma", "nu"]
                     + [f"beta[{j}]" for j in range(1, M + 1)])
        var_names = (["$y_0$", "$\\sigma$", "$\\mathsf{log}_{10}(\\nu)$"]
                     + ["$\\beta_{{{}}}$".format(j)
                        for j in range(1, M + 1)])
        param_df.loc[:, "nu"] = np.log10(param_df.loc[:, "nu"])
        param_df = param_df.rename(
            {frm: to for frm, to in zip(col_names, var_names)}, axis=1)
        param_df = param_df.loc[:, var_names]

        fig, ax = plt.subplots(1, 1)
        ax.set_title("Parameter Posterior Marginals: "
                     "$y \\sim \\mathcal{T}(\\nu, y_0 + X\\beta, \\sigma)$")
        sns.boxplot(
            data=param_df.melt(value_name="Posterior Samples",
                               var_name="Parameter"),
            x="Parameter", y="Posterior Samples", ax=ax)
        if show:
            plt.show()
        return fig, ax
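# A hypothetical usage sketch for the BayesLinearRegression class above, on
# synthetic data.  It assumes the compiled Stan models the class loads are
# available on disk; this is a sketch, not a tested driver script.
import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(200, 3)
y = (1.5 + X @ np.array([0.5, -1.0, 2.0]) + 0.3 * rng.randn(200)).reshape(-1, 1)

model = BayesLinearRegression(n_chains=4, samples_per_chain=1000)
model.fit(X, y)                                  # sample the posterior
y_hat = model.predict(X)                         # posterior-mean prediction
y_hat, y_samples = model.predict(X, ret_posterior=True)  # full posterior
fig, ax = model.plot_posterior_params()          # marginal boxplots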
# https://www.programcreek.com/python/example/99828/xgboost.DMatrix
# xgb_model = xgb.train(xgb_params,
#                       dtrain=xgb.DMatrix(xtrain, ytrain),
#                       evals=(xgb.DMatrix(xtest, ytest), "Valid"),
#                       xgb_model=xgb_estimator)

est = DecisionTreeRegressor()
gs = GridSearchCV(est, cv=10, param_grid=hyper_params, verbose=2,
                  n_jobs=n_jobs, scoring='r2')

t0 = time.time()
gs.fit(x_train, y_train)
runtime = time.time() - t0
print("Complexity and bandwidth selected and model fitted in %.6f s" % runtime)

train_score_mse = mean_squared_error(sc_y.inverse_transform(y_train),
                                     sc_y.inverse_transform(gs.predict(x_train)))
train_score_mae = mean_absolute_error(sc_y.inverse_transform(y_train),
                                      sc_y.inverse_transform(gs.predict(x_train)))
train_score_evs = explained_variance_score(sc_y.inverse_transform(y_train),
                                           sc_y.inverse_transform(gs.predict(x_train)))
# train_score_me = max_error(sc_y.inverse_transform(y_train),
#                            sc_y.inverse_transform(gs.predict(x_train)))
# train_score_msle = mean_squared_log_error(sc_y.inverse_transform(y_train),
#                                           sc_y.inverse_transform(gs.predict(x_train)))

test_score_mse = mean_squared_error(sc_y.inverse_transform(y_test),
                                    sc_y.inverse_transform(gs.predict(x_test)))
test_score_mae = mean_absolute_error(sc_y.inverse_transform(y_test),
                                     sc_y.inverse_transform(gs.predict(x_test)))
test_score_evs = explained_variance_score(sc_y.inverse_transform(y_test),
                                          sc_y.inverse_transform(gs.predict(x_test)))
# test_score_me = max_error(sc_y.inverse_transform(y_test),
#                           sc_y.inverse_transform(gs.predict(x_test)))
# test_score_msle = mean_squared_log_error(sc_y.inverse_transform(y_test),
#                                          sc_y.inverse_transform(gs.predict(x_test)))
test_score_r2 = r2_score(sc_y.inverse_transform(y_test),
                         sc_y.inverse_transform(gs.predict(x_test)))

print("The model performance for testing set")
print("--------------------------------------")
print('MAE is {}'.format(test_score_mae))
X_std = sc_x.fit_transform(X)
y_std = sc_y.fit_transform(y)
# y_std = sc_y.fit_transform(y[:, np.newaxis]).flatten()

lr = LinearRegressionGD()
lr.fit(X_std, y_std)

fig = plt.figure()
sns.reset_orig()
plt.plot(range(1, lr.n_iter + 1), lr.get_cost())
plt.ylabel('SSE')
plt.xlabel('Epoch')
fig.savefig('linearRegressionGD.pdf')

fig = plt.figure()
lin_regplot(X_std, y_std, lr)
plt.xlabel('Average number of rooms [RM] (standardized)')
plt.ylabel('Price in $1000s [MEDV] (standardized)')
fig.savefig('regression.pdf')

# Inverse transform: predict in standardized space, report in dollars
num_rooms_std = sc_x.transform([[5.0]])
price_std = lr.predict(num_rooms_std)
print('price in $1000: %.3f' % (sc_y.inverse_transform(price_std)[0][0]))

# Estimating the coefficients of the regression model via scikit-learn
from sklearn.linear_model import LinearRegression
slr = LinearRegression()
slr.fit(X, y)
print('Slope: %.3f' % slr.coef_[0])
print('Intercept: %.3f' % slr.intercept_)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y.reshape(-1, 1)).ravel()  # scalers expect a 2-D array

# Fitting SVR to the dataset
from sklearn.svm import SVR
# Default kernel == 'rbf'; spelled out as a reminder.
regressor = SVR(kernel='rbf')
regressor.fit(X, y)

# Predicting a new result
# predict() expects a 2-D array in the scaled feature space, not a bare scalar
y_pred = regressor.predict(sc_X.transform(np.array([[6.5]])))
y_pred = sc_y.inverse_transform(y_pred.reshape(-1, 1))

# Visualising the SVR results
plt.scatter(X, y, color='red')
plt.plot(X, regressor.predict(X), color='blue')
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

# Visualising the SVR results (for higher resolution and smoother curve)
X_grid = np.arange(min(X), max(X), 0.01)  # 0.01 step instead of 0.1 because the data is feature scaled
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(X, y, color='red')
data_thr['preds'] = pd.Series(preds).astype("category")

color_key = ["red", "blue", "yellow", "grey", "black", "purple", "pink",
             "brown", "green", "orange"]  # Spectral9
color_key = color_key[:len(set(preds)) + 1]

covs = np.array([np.array(gmm.distributions[m].parameters[1])
                 for m in range(len(gmm.distributions))])
means = np.array([np.array(gmm.distributions[m].parameters[0])
                  for m in range(len(gmm.distributions))])

# Transform the covariances and means back to the non-standardized data:
covs = np.array([np.dot(np.diag(np.sqrt(scaler.var_)),
                        np.dot(covs[j], np.diag(np.sqrt(scaler.var_))))
                 for j in range(covs.shape[0])])
means = np.array([scaler.inverse_transform(means[j].reshape(1, -1)).T
                  for j in range(means.shape[0])])

# # Uncomment to show interactive probabilities:
# p = plot_probas(data_thr, probs)
# plt.show()
# p = interactive_img_ds(data_thr, 'rateCA', 'rate')
# # waiting for InteractiveImage -> html

# # Pair plots with predicted classes and ellipses:
# p = scatter_matrix(data_thr, spread=False, covs=covs, means=means,
#                    color_key=color_key)
# html = file_html(p, CDN, "pomegranate weighted gmm with 3 components")
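# The covariance back-transform above follows from x_std = (x - mu) / sigma,
# which gives Cov(x) = S @ Cov(x_std) @ S with S = diag(sigma)
# = diag(sqrt(scaler.var_)).  A numeric sanity check on toy data:
import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
X = rng.randn(500, 2) @ np.array([[2.0, 0.3], [0.0, 0.5]]) + [1.0, -2.0]

scaler = StandardScaler().fit(X)
cov_std = np.cov(scaler.transform(X), rowvar=False)
S = np.diag(np.sqrt(scaler.var_))
cov_back = S @ cov_std @ S                 # de-standardized covariance

assert np.allclose(cov_back, np.cov(X, rowvar=False))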
def clusters_ert():
    train_x = pd.read_csv(params.SAMPLE_PATH + 'train_x.csv')
    train_y = pd.read_csv(params.SAMPLE_PATH + 'train_y.csv')
    validation_x = pd.read_csv(params.SAMPLE_PATH + 'validation_x.csv')
    validation_y = pd.read_csv(params.SAMPLE_PATH + 'validation_y.csv')
    predict_x = pd.read_csv(params.SAMPLE_PATH + 'predict_x.csv')

    # Standardize the data
    ss_x = StandardScaler()
    ss_y = StandardScaler()
    train_x = ss_x.fit_transform(train_x)
    train_y = ss_y.fit_transform(train_y)
    train_x = pd.DataFrame(train_x)
    train_y = pd.DataFrame(train_y)
    validation_x = pd.DataFrame(ss_x.transform(validation_x))
    validation_y = pd.DataFrame(ss_y.transform(validation_y))
    predict_x = pd.DataFrame(ss_x.transform(predict_x))

    clusters_label = shop_clusters()
    train_x['clusters_label'] = clusters_label
    train_y['clusters_label'] = clusters_label
    validation_x['clusters_label'] = clusters_label
    validation_y['clusters_label'] = clusters_label
    predict_x['clusters_label'] = clusters_label
    validation_x['iid'] = pd.Series(np.arange(1, 2001))
    predict_x['iid'] = pd.Series(np.arange(1, 2001))
    # train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.25, random_state=33)

    result_validation = []
    result_predict = []
    for i in range(4):
        cluster_x = train_x[train_x['clusters_label'] == i]
        cluster_y = train_y[train_y['clusters_label'] == i]
        cluster_x = cluster_x.drop('clusters_label', axis=1)
        cluster_y = cluster_y.drop('clusters_label', axis=1)

        x_validation = validation_x[validation_x['clusters_label'] == i]
        x_validation_iid = x_validation['iid']
        x_validation = x_validation.drop(['clusters_label', 'iid'], axis=1)
        y_validation = validation_y[validation_y['clusters_label'] == i]
        y_validation = y_validation.drop('clusters_label', axis=1)

        x_predict = predict_x[predict_x['clusters_label'] == i]
        x_predict_iid = x_predict['iid']
        x_predict = x_predict.drop(['clusters_label', 'iid'], axis=1)

        y_validation_predict = ert(cluster_x.values, cluster_y.values, x_validation.values)
        y_validation_predict = pd.DataFrame(y_validation_predict)
        y_predict = ert(x_validation.values, y_validation.values, x_predict.values)
        y_predict = pd.DataFrame(y_predict)

        y_validation_predict['iid'] = np.array(x_validation_iid)
        y_predict['iid'] = np.array(x_predict_iid)
        result_validation.append(y_validation_predict)
        result_predict.append(y_predict)

    result_validation = pd.concat(result_validation)
    result_validation.index = np.arange(result_validation.shape[0])
    # Sort by iid in ascending order
    result_validation = result_validation.sort_values(by='iid', ascending=True)
    result_validation = result_validation.drop('iid', axis=1)
    result_validation = (ss_y.inverse_transform(result_validation)).astype(int)

    # Evaluate model performance
    validation_y = validation_y.drop('clusters_label', axis=1)
    # Offline error
    print("off_line error is:", model_value.value_mode(result_validation, validation_y))

    result_predict = pd.concat(result_predict)
    result_predict.index = np.arange(result_predict.shape[0])
    # Sort by iid in ascending order
    result_predict = result_predict.sort_values(by='iid', ascending=True)
    result_predict = result_predict.drop('iid', axis=1)
    result_predict = pd.DataFrame((ss_y.inverse_transform(result_predict)).astype(int))

    predict = pd.DataFrame(np.arange(1, result_predict.shape[0] + 1), columns=['iid'])
    predict = predict.join(result_predict)
    predict = pd.merge(predict, predict, on='iid')
    if not os.path.exists(params.OUTPUT_PATH):
        os.mkdir(params.OUTPUT_PATH)
    predict.to_csv(params.OUTPUT_PATH + 'result_clusters_and_ert_by_three_weeks.csv',
                   index=False, header=False)
    print(predict)