def test_scaler_without_centering():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero
    X_csr = sparse.csr_matrix(X)
    X_csc = sparse.csc_matrix(X)

    assert_raises(ValueError, StandardScaler().fit, X_csr)

    null_transform = StandardScaler(with_mean=False, with_std=False, copy=True)
    X_null = null_transform.fit_transform(X_csr)
    assert_array_equal(X_null.data, X_csr.data)
    X_orig = null_transform.inverse_transform(X_null)
    assert_array_equal(X_orig.data, X_csr.data)

    scaler = StandardScaler(with_mean=False).fit(X)
    X_scaled = scaler.transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))

    scaler_csr = StandardScaler(with_mean=False).fit(X_csr)
    X_csr_scaled = scaler_csr.transform(X_csr, copy=True)
    assert_false(np.any(np.isnan(X_csr_scaled.data)))

    scaler_csc = StandardScaler(with_mean=False).fit(X_csc)
    X_csc_scaled = scaler_csc.transform(X_csc, copy=True)
    assert_false(np.any(np.isnan(X_csc_scaled.data)))

    assert_equal(scaler.mean_, scaler_csr.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csr.std_)

    assert_equal(scaler.mean_, scaler_csc.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csc.std_)

    assert_array_almost_equal(
        X_scaled.mean(axis=0), [0., -0.01, 2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])

    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0(X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))

    # Check that X has not been modified (copy)
    assert_true(X_scaled is not X)
    assert_true(X_csr_scaled is not X_csr)

    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled)
    assert_true(X_csr_scaled_back is not X_csr)
    assert_true(X_csr_scaled_back is not X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_back.toarray(), X)

    X_csc_scaled_back = scaler_csc.inverse_transform(X_csc_scaled.tocsc())
    assert_true(X_csc_scaled_back is not X_csc)
    assert_true(X_csc_scaled_back is not X_csc_scaled)
    assert_array_almost_equal(X_csc_scaled_back.toarray(), X)
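# A minimal, hedged sketch of the behaviour exercised above: scaling a sparse
# matrix without centering (centering would densify it) and inverting the
# scaling afterwards. Names ending in _demo are illustrative only.
import numpy as np
from scipy import sparse
from sklearn.preprocessing import StandardScaler

X_demo = sparse.csr_matrix(np.abs(np.random.RandomState(0).randn(4, 5)))
scaler_demo = StandardScaler(with_mean=False).fit(X_demo)
X_demo_scaled = scaler_demo.transform(X_demo, copy=True)
X_demo_back = scaler_demo.inverse_transform(X_demo_scaled)
assert np.allclose(X_demo_back.toarray(), X_demo.toarray())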
Example #2
class PoissonRegression(Regressor):
    """
    Calculate the solution using the Newton-Raphson formula (second-order
    optimization). This method has the advantage that its weight update rule
    needs no learning rate alpha, and it converges quickly.
    """
    def __init__(self, features=range(231)):
        Regressor.__init__(self)
        self.features = features
        self.weights = np.ones(len(features))
        self.xscaler = StandardScaler()
        self.yscaler = StandardScaler()

    def learn(self, Xtrain, ytrain):
        Xless = Xtrain[:, self.features]
        self.xscaler.fit(Xless)
        Xless = self.xscaler.transform(Xless)
        self.yscaler.fit(ytrain)
        ytrain = self.yscaler.transform(ytrain)
        itertimes = 20
        for i in range(itertimes):
            c = np.exp(np.dot(Xless, self.weights))
            gradient = np.dot(Xless.T, (ytrain - c))
            neg_hessian = np.dot(Xless.T, np.dot(np.diag(c), Xless))
            self.weights = self.weights + np.dot(np.linalg.inv(neg_hessian), gradient)

    def predict(self, Xtest):
        Xless = Xtest[:, self.features]
        Xless = self.xscaler.transform(Xless)
        ytest = np.exp(np.dot(Xless, self.weights))
        ytest = self.yscaler.inverse_transform(ytest)
        return ytest
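# A hedged, self-contained sketch of the Newton-Raphson update used by
# PoissonRegression.learn above (the helper name newton_poisson and the
# synthetic data are illustrative, not part of the class):
import numpy as np

def newton_poisson(X, y, n_iter=20):
    w = np.zeros(X.shape[1])
    for _ in range(n_iter):
        mu = np.exp(X.dot(w))                    # predicted Poisson rate
        gradient = X.T.dot(y - mu)               # score vector
        neg_hessian = X.T.dot(X * mu[:, None])   # X^T diag(mu) X
        w = w + np.linalg.solve(neg_hessian, gradient)
    return w

# rng_demo = np.random.RandomState(0)
# X_demo = rng_demo.randn(500, 3)
# y_demo = rng_demo.poisson(np.exp(X_demo.dot(np.array([0.5, -0.2, 0.1]))))
# w_hat = newton_poisson(X_demo, y_demo)          # close to [0.5, -0.2, 0.1]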
    def knn_max_density(self, X, n_neighbors, step):

        ss = StandardScaler()
        ss.fit(X)
        X_standart = ss.transform(X)

        passed_points_indeces = range(len(X_standart))
        X_passed_standart = X_standart

        while len(X_passed_standart) > n_neighbors:

            knn = NearestNeighbors(n_neighbors=n_neighbors, leaf_size=100)
            knn.fit(X_passed_standart)
            knn_dists, knn_indeces = knn.kneighbors()

            knn_dists_mean = knn_dists.mean(axis=1)

            n_points = max(1, int(step * len(X_passed_standart)))
            passed_points_indeces = knn_dists_mean.argsort()[:-n_points]
            knn_dists_mean.sort()

            X_passed_standart = X_passed_standart[passed_points_indeces]
            
        X_passed = ss.inverse_transform(X_passed_standart)

        return X_passed
def background_model(x_train, method='mean', n_components=10):
	"""
	use data from x_train to create a model/image of the background
	:param x_train: a matrix with 1 row per image frame, each column represents a pixel
		PCA is trained on this data
	:return: a vector that represents the background image
	"""
	# clean the data before pca and clustering (subtract mean, divide by st. dev.)
	scaler = StandardScaler().fit(x_train)
	x_train = scaler.transform(x_train)
	# use SVD instead of PCA, so that don't need to compute covariance
	eig = TruncatedSVD(n_components=n_components).fit(x_train)
	print sum(eig.explained_variance_ratio_)
	train = eig.transform(x_train)

	# define background as an aggregation of each pixel value in the principal component space
	# can't see much of a difference between mean and median
	if method == 'median':
		back_pca = np.median(train, axis=0)
	elif method == 'mean':
		back_pca = np.mean(train, axis=0)
	else:
		print "method must either be 'median' or 'mean'"
		return 1

	# transform to full sized matrix
	back_vec = eig.inverse_transform(back_pca)
	# add mean and variance back in
	back_vec = scaler.inverse_transform(back_vec)
	return back_vec
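# A hedged, self-contained sketch of the same pipeline on synthetic frames
# (standardize, project with TruncatedSVD, aggregate in component space, then
# invert both transforms); shapes are kept 2-D throughout, and all names here
# are illustrative.
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD

rng_demo = np.random.RandomState(0)
frames_demo = rng_demo.rand(64) + 0.05 * rng_demo.randn(200, 64)  # 200 frames, 64 pixels
scaler_demo = StandardScaler().fit(frames_demo)
frames_std = scaler_demo.transform(frames_demo)
svd_demo = TruncatedSVD(n_components=5).fit(frames_std)
background_pca = np.median(svd_demo.transform(frames_std), axis=0, keepdims=True)
background_demo = scaler_demo.inverse_transform(svd_demo.inverse_transform(background_pca))
print(background_demo.shape)  # (1, 64)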
def test_scaler_1d():
    """Test scaling of dataset along single axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(5)
    X_orig_copy = X.copy()

    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_array_almost_equal(X_scaled_back, X_orig_copy)

    # Test with 1D list
    X = [0., 1., 2, 0.4, 1.]
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    X_scaled = scale(X)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)
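# Hedged aside: the functional scale() used above should match fitting a
# StandardScaler and transforming in one step (illustrative check):
# import numpy as np
# from sklearn.preprocessing import scale, StandardScaler
# X_demo = np.random.RandomState(0).randn(5, 3)
# np.allclose(scale(X_demo), StandardScaler().fit_transform(X_demo))  # True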
    def get_track_params(self, X):

        ss = StandardScaler()
        ss.fit(X)

        transformed_tracks = ss.transform(X).mean(axis=0)
        tracks = ss.inverse_transform(transformed_tracks)

        return tracks, X.std(axis=0)
Example #7
class GmmInterest(InterestModel):
    def __init__(self, conf, expl_dims, measure, n_samples=40, n_components=6, update_frequency=10):
        InterestModel.__init__(self, expl_dims)

        self.measure = measure
        self.bounds = conf.bounds[:, expl_dims]
        self.n_components = n_components
        self.scale_t = 1  # 1. / n_samples
        self.t = -self.scale_t * n_samples
        self.scale_x = conf.bounds[1, expl_dims] - conf.bounds[0, expl_dims]
        self.scale_measure = abs(measure(numpy.zeros_like(conf.bounds[0, :]), numpy.zeros_like(conf.bounds[0])))

        self.data = numpy.zeros((n_samples, len(expl_dims) + 2))
        self.n_samples = n_samples
        self.scaler = StandardScaler()
        self.update_frequency = update_frequency

        for _ in range(n_samples):
            self.update(rand_bounds(conf.bounds), rand_bounds(conf.bounds))

    def sample(self):
        x = self.gmm_choice.sample()
        x = self.scaler.inverse_transform(numpy.hstack(([0.0], x.flatten(), [0.0])))[1:-1]
        x = numpy.maximum(x, self.bounds[0, :])
        x = numpy.minimum(x, self.bounds[1, :])
        return x.T
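    # Note on sample() above (hedged reading of the code): self.scaler was fit
    # on the full data matrix whose columns are (t, expl_dims..., measure), so
    # the GMM sample is padded with a dummy 0.0 for the t and measure columns
    # before inverse_transform, and the padding is stripped again with [1:-1].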

    def update(self, xy, ms):
        measure = self.measure(xy, ms)
        self.data[self.t % self.n_samples, 0] = self.t
        self.data[self.t % self.n_samples, -1] = measure
        self.data[self.t % self.n_samples, 1:-1] = xy.flatten()[self.expl_dims]

        self.t += self.scale_t
        if self.t >= 0:
            if self.t % self.update_frequency == 0:
                self.update_gmm()

        return self.t, xy.flatten()[self.expl_dims], measure

    def update_gmm(self):
        scaled_data = self.scaler.fit_transform(self.data)

        self.gmm = GMM(n_components=self.n_components, covariance_type="full")
        self.gmm.fit(numpy.array(scaled_data))
        self.gmm_choice = self.gmm_interest()

    def gmm_interest(self):
        cov_t_c = numpy.array([self.gmm.covars_[k, 0, -1] for k in range(self.gmm.n_components)])
        cov_t_c = numpy.exp(cov_t_c)
        # cov_t_c[cov_t_c <= 1e-100] = 1e-100

        gmm_choice = self.gmm.inference([0], range(1, len(self.expl_dims) + 1), [1.0])
        gmm_choice.weights_ = cov_t_c
        gmm_choice.weights_ /= numpy.array(gmm_choice.weights_).sum()

        return gmm_choice
Example #8
def main():

    df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data',
            header = None,
            sep = '\s+')
    df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM',
            'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B',
            'LSTAT', 'MEDV']
    print(df.head())

    # Select a subset of the features and plot the correlation between features
    cols = ['LSTAT', 'INDUS', 'NOX', 'RM', 'MEDV']
    sns.pairplot(df[cols], size=2.5);
    plt.title('Correlations between 5 features')
    plt.show()

    # Plot a heatmap of the same subset of features
    cm = np.corrcoef(df[cols].values.T)
    sns.set(font_scale=2.5)
    hm = sns.heatmap(cm,
            cbar = True,
            annot = True,
            square = True,
            fmt = '.2f',
            annot_kws = {'size': 15},
            yticklabels = cols,
            xticklabels = cols)
    plt.show()

    X = df[['RM']].values
    y = df['MEDV'].values

    sc_x = StandardScaler()
    sc_y = StandardScaler()

    X_std = sc_x.fit_transform(X)
    y_std = sc_y.fit_transform(y[:, np.newaxis]).flatten()
    
    lr = LinearRegressionGD()
    lr.fit(X_std, y_std)

    plt.plot(range(1, lr.n_iter + 1), lr.cost_)
    plt.ylabel('SSE')
    plt.xlabel('Epoch')
    plt.show()

    lin_regplot(X_std, y_std, lr)
    plt.xlabel('Average number of rooms [RM] (standardized)')
    plt.ylabel('Price in $1000\'s [MEDV] (standardized)')
    plt.show()
    
    # Example prediction for a house with 5 rooms
    num_rooms_std = sc_x.transform(np.array([[5.0]]))
    price_std = lr.predict(num_rooms_std)
    print("Price in $1000's: %.3f" % \
            sc_y.inverse_transform(price_std))
Example #9
def clusterThose(G,eps=0.1,min_samples=4):
    ''' Scale the data and cluster'''
    scaler = StandardScaler(copy=True)
    X_centered = scaler.fit(G).transform(G)
    db = DBSCAN(eps, min_samples).fit( X_centered )
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    X = scaler.inverse_transform(X_centered)
    return X, n_clusters_, labels, core_samples_mask
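# Hedged usage sketch for clusterThose above (numpy, StandardScaler and DBSCAN
# are assumed to be imported by the surrounding module, as the function uses them):
# G_demo = np.random.RandomState(0).randn(200, 2)
# X_back, n_clusters_demo, labels_demo, core_mask_demo = clusterThose(G_demo, eps=0.3, min_samples=5)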
def kmeans_fitting(rows, train):
    x = get_feature_vector(rows, train)
    scaler = StandardScaler()
    scaler.fit(x)
    x = scaler.transform(x)
    model = cluster.MiniBatchKMeans(n_clusters = 6)
    model.fit(x)
    centers = model.cluster_centers_
    print(centers)
    centers = scaler.inverse_transform(centers)
    print(centers)
    return model, scaler
Example #11
def DAEGO(X_s,H,P,batch_range):
	"""
	Parameters
	----------

	X_s: small class features

	H : layers (first layers shoud have same neurons as number of features)

	P : percent oversampling

	batch_range : size of minibatch


	Returns
	-------

	syn_Z: synthetic sample with same number of features as smaller class
	"""

	#normalization
	scaler=StdScaler()
	x_tr=scaler.fit_transform(X_s.astype(float))
	x_norm=norm(x_tr,axis=0)

	n_samples=int(X_s.shape[0]*P/100)
	print "generating %d samples" %(n_samples)

	norm_param=[LA.norm(x) for x in x_tr.T]
	X_init=np.random.standard_normal(size=(n_samples,X_s.shape[1]))
	x_init_tr=scaler.transform(X_init)
	x_ini_norm=norm(x_init_tr)
	ae=autoencoder(dimensions=H)
	learning_rate = 0.001
	optimizer = tf.train.AdamOptimizer(learning_rate).minimize(ae['cost'])
	sess = tf.Session()
	sess.run(tf.initialize_all_variables())
	n_epoch=100
	for epoch_i in range(n_epoch):
	    for start, end in zip(range(0, len(x_norm), batch_range),range(batch_range, len(x_norm), batch_range)):
	        input_ = x_norm[start:end]
	        sess.run(optimizer, feed_dict={ae['x']: input_, ae['corrupt_prob']: [1.0]})
	    s="\r Epoch: %d Cost: %f"%(epoch_i, sess.run(ae['cost'], 
	    	feed_dict={ae['x']: X_s, ae['corrupt_prob']: [1.0]}))
	    stderr.write(s)
	    stderr.flush()
	x_init_encoded = sess.run(ae['y'], feed_dict={ae['x']: x_ini_norm, ae['corrupt_prob']: [0.0]})
	sess.close()
	x_init_norminv=np.multiply(x_init_encoded,norm_param)
	syn_Z=scaler.inverse_transform(x_init_norminv)
	return syn_Z
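# Hedged usage sketch for DAEGO above (X_minority and the autoencoder()/norm
# helpers are assumed from the surrounding module; the values are illustrative):
# X_minority = X[y == 1]
# H_layers = [X_minority.shape[1], 64]   # first layer must match the feature count
# synthetic_samples = DAEGO(X_minority, H=H_layers, P=200, batch_range=32)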
def test_scaler_without_centering():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero
    X_csr = sp.csr_matrix(X)

    scaler = StandardScaler(with_mean=False).fit(X)
    X_scaled = scaler.transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))

    scaler_csr = StandardScaler(with_mean=False).fit(X_csr)
    X_csr_scaled = scaler_csr.transform(X_csr, copy=True)
    assert_false(np.any(np.isnan(X_csr_scaled.data)))

    assert_equal(scaler.mean_, scaler_csr.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csr.std_)

    assert_array_almost_equal(
        X_scaled.mean(axis=0), [0., -0.01,  2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])

    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0(X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))

    # Check that X has not been modified (copy)
    assert_true(X_scaled is not X)
    assert_true(X_csr_scaled is not X_csr)

    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled)
    assert_true(X_csr_scaled_back is not X_csr)
    assert_true(X_csr_scaled_back is not X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_back.toarray(), X)
Example #13
class InputScaler():

    def __init__(self):
        self.scaler = StandardScaler()
        
    def fit_transform(self, data):
        flat = numpy.vstack(data)
        self.scaler.fit(flat)
        return [ self.scaler.transform(X) for X in data ]
    
    def transform(self, data):
        return [ self.scaler.transform(X) for X in data ]
    
    def inverse_transform(self, data):
        return [ self.scaler.inverse_transform(X) for X in data ]
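# A minimal usage sketch for InputScaler above (numpy and StandardScaler are
# assumed to be imported by this module, as the class itself uses them):
sequences_demo = [numpy.random.RandomState(i).randn(10, 3) for i in range(3)]
input_scaler_demo = InputScaler()
scaled_demo = input_scaler_demo.fit_transform(sequences_demo)
restored_demo = input_scaler_demo.inverse_transform(scaled_demo)
assert numpy.allclose(restored_demo[0], sequences_demo[0])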
    def knn_max_density(self, X, n_neighbors, step):

        # ss = StandardScaler()
        # ss.fit(X)
        # X_standart = ss.transform(X)
        #
        # passed_points_indeces = range(len(X_standart))
        # X_passed_standart = X_standart
        #
        # while len(X_passed_standart) > n_neighbors:
        #
        #     knn = NearestNeighbors(n_neighbors=n_neighbors, leaf_size=100)
        #     knn.fit(X_passed_standart)
        #     knn_dists, knn_indeces = knn.kneighbors()
        #
        #     knn_dists_mean = knn_dists.mean(axis=1)
        #
        #     n_points = max(1, int(step * len(X_passed_standart)))
        #     passed_points_indeces = knn_dists_mean.argsort()[:-n_points]
        #     knn_dists_mean.sort()
        #
        #     X_passed_standart = X_passed_standart[passed_points_indeces]
        #
        # X_passed = ss.inverse_transform(X_passed_standart)

        ss = StandardScaler()
        ss.fit(X)
        X_standart = ss.transform(X)

        passed_points_indeces = range(len(X_standart))
        X_passed_standart = X_standart

        n_neighbors = min(n_neighbors, len(X_passed_standart) - 1)
        knn = NearestNeighbors(n_neighbors=n_neighbors, leaf_size=100)
        knn.fit(X_passed_standart)
        knn_dists, knn_indeces = knn.kneighbors()

        knn_dists_mean = knn_dists.mean(axis=1)

        max_dense_point = knn_dists_mean.argsort()[0]

        passed_points_indeces = list(knn_indeces[max_dense_point]) + [max_dense_point]

        X_passed_standart = X_passed_standart[passed_points_indeces]

        X_passed = ss.inverse_transform(X_passed_standart)

        return X_passed
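        # Hedged note: compared with the iterative pruning version kept
        # commented out above, this variant does a single kNN query, picks the
        # point with the smallest mean neighbour distance, and returns that
        # point together with its n_neighbors neighbours, mapped back to the
        # original scale via inverse_transform.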
Example #15
def submit(args):
    """Run train-test experiment. """
    data = load_data(args['--data'])
    X_train = data['X_train']
    y_train = data['y_train']

    X_test = data['X_test']

    est = GradientBoostingRegressor(n_estimators=2000, verbose=1, max_depth=6,
                                    min_samples_leaf=9, learning_rate=0.02,
                                    max_features=33, random_state=1,
                                    subsample=1.0,
                                    loss='lad')

    model_cls = MODELS[args['<model>']]
    model = model_cls(est=est,
                      with_stationinfo=True,
                      with_date=True, with_solar=True,
                      with_mask=True,
                      intp_blocks=('nm_intp', 'nmft_intp', 'nm_intp_sigma'),
                      )

    print('_' * 80)
    print('Submit')
    print()
    print(model)
    print()
    print()

    scaler = StandardScaler()
    if args['--scaley']:
        y_train = scaler.fit_transform(y_train.copy())

    t0 = time()
    model.fit(X_train, y_train)
    print('model.fit took %.fm' % ((time() - t0) / 60.))
    pred = model.predict(X_test)
    if args['--scaley']:
        pred = scaler.inverse_transform(pred)

    data = load_data(args['--data'])
    date_idx = data['X_test'].date
    date_idx = date_idx.map(lambda x: x.strftime('%Y%m%d'))
    stid = pd.read_csv('data/station_info.csv')['stid']
    out = pd.DataFrame(index=date_idx, columns=stid, data=pred)
    out.index.name = 'Date'
    out.to_csv('hk_19.csv')
    IPython.embed()
def get_rbf_nn_prediction(train_data, train_truth, test_data, test_truth, centers=8, spread=1, iter_id=0): 
    train_truth = train_truth[:,np.newaxis]
    test_truth = test_truth[:,np.newaxis]

    scaler = StandardScaler()
    train_truth = scaler.fit_transform(train_truth).ravel()
    test_truth = scaler.transform(test_truth).ravel()

    net = _get_nn(train_data.shape[1], spread=spread)

    _train_nn(net, train_data, train_truth, centers)

    out = net.activate_many(test_data)

    predicted = scaler.inverse_transform(np.array(out))
    return predicted.ravel()
def test_scaler_2d_arrays():
    """Test scaling of 2d array along first axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero

    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))

    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has been copied
    assert_true(X_scaled is not X)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    X_scaled = scale(X, axis=1, with_std=False)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    X_scaled = scale(X, axis=1, with_std=True)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=1), 4 * [1.0])
    # Check that the data hasn't been modified
    assert_true(X_scaled is not X)

    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert_true(X_scaled is X)

    X = rng.randn(4, 5)
    X[:, 0] = 1.0  # first feature is a constant, non zero feature
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert_true(X_scaled is not X)
Example #18
class RootTransform:
    def __init__(self, root=0.5):
        self.root = root

    def fit(self, X):
        if numpy.any(X < 0):
            raise ValueError("Log Transform: All values must be greater than or equal to zero")
        xlog = (X + 1e-10) ** self.root
        self.scale = StandardScaler().fit(xlog)

    def transform(self, X):
        return self.scale.transform((X + 1e-10) ** self.root)

    def inverse_transform(self, X):
        xinv = self.scale.inverse_transform(X)
        xinv = xinv ** (1 / self.root) - 1e-10
        return xinv
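# Hedged round-trip check for RootTransform above (numpy and StandardScaler are
# assumed to be imported by this module, as the class itself uses them):
root_demo = RootTransform(root=0.5)
X_pos_demo = numpy.abs(numpy.random.RandomState(0).randn(20, 4))
root_demo.fit(X_pos_demo)
assert numpy.allclose(root_demo.inverse_transform(root_demo.transform(X_pos_demo)), X_pos_demo)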
Example #19
class LogTransform:
    def __init__(self):
        pass

    def fit(self, X):
        if numpy.any(X < 0):
            raise ValueError("Log Transform: All values must be greater than or equal to zero")
        # xlog = numpy.log(X+1e-10)
        xlog = (X + 1e-10) ** 0.5
        self.scale = StandardScaler().fit(xlog)

    def transform(self, X):
        return self.scale.transform(numpy.sqrt(X + 1e-10))

    def inverse_transform(self, X):
        xinv = self.scale.inverse_transform(X)
        # xinv = numpy.exp(xinv)-1e-10
        xinv = xinv ** 2 - 1e-10
        return xinv
def classifyWithKmeans(num_clusters):
    client = MongoClient('localhost', 27017)
    db = client["pitchfx"]
    x = []

    for player in db.players.find():
        for year in range(2008, 2016):
            if player.get('h_%d' % year) == None or player.get('ab_%d' % year) < 100:
                continue
            x.append(kmeans_features(player, year))
    kmeans = KMeans(init='k-means++', n_clusters=num_clusters, n_init=10, random_state=1000)

    vec = DictVectorizer()
    scaler = StandardScaler()
    scaler.fit(vec.fit_transform(x).toarray())
    kmeans.fit(scaler.transform(vec.transform(x).toarray()))
    print(json.dumps(vec.inverse_transform(scaler.inverse_transform(kmeans.cluster_centers_)), indent=4))
    for i in range(0, 8):
        print('cluster %d:' % i, list(kmeans.labels_).count(i))
    return (kmeans, scaler, vec)
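    # Hedged reading of the round trip above: DictVectorizer maps feature dicts
    # to an array, StandardScaler standardizes it, and the nested
    # inverse_transform calls convert the fitted cluster centres back into
    # readable {"feature": value} dicts in the original units.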
Example #21
class ImageScaler(object):
    """
    Thin wrapper around sklearn.preprocessing.StandardScaler that works on images
    (and maintains their shapes), doing per-channel scaling/centering.
    """
    def fit(self, img):
        """
        Args:
            img: (width, height, nchans)
        """
        self._scaler = StandardScaler().fit(img.reshape(-1, img.shape[2]))
        return self

    def transform(self, img):
        return self._scaler.transform(img.reshape(-1, img.shape[2])).reshape(*img.shape)

    def inverse_transform(self, img):
        return self._scaler.inverse_transform(img.reshape(-1, img.shape[2])).reshape(*img.shape)

    def __repr__(self):
        return 'ImageScaler(\n  %s\n  mean=%s\n  std=%s\n)' % (self._scaler, self._scaler.mean_, self._scaler.std_)
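# A hedged usage sketch for ImageScaler above; self-contained apart from the
# class itself and its StandardScaler import.
import numpy as np
img_demo = np.random.RandomState(0).rand(32, 32, 3)         # (width, height, nchans)
image_scaler_demo = ImageScaler().fit(img_demo)
img_scaled_demo = image_scaler_demo.transform(img_demo)      # same shape, per-channel z-scores
img_restored_demo = image_scaler_demo.inverse_transform(img_scaled_demo)
assert np.allclose(img_restored_demo, img_demo)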
Example #22
def create_probability_grid(x_min, x_max, y_min, y_max, scaler: StandardScaler, prob_granularity: float = 0.001):
    """
    Creates a np.meshgrid with (approximately) given granularity between values -5 and 5. Tries to keep the number of
    points reasonable.

    Also transforms the grid to longitude and latitude coordinates using given scaler.

    :param scaler: scaler to be used while transforming back to latitude and longitude
    :param prob_granularity: distance between each point in the grid (in degrees)
    :return: a tuple containing raw X, Y values and lat-lon vales
    """
    x_min_unsc, y_min_unsc = scaler.inverse_transform((x_min, y_min))
    x_max_unsc, y_max_unsc = scaler.inverse_transform((x_max, y_max))

    x_rng = np.arange(x_min_unsc, x_max_unsc, prob_granularity)
    y_rng = np.arange(y_min_unsc, y_max_unsc, prob_granularity)

    while len(x_rng) > 400 or len(y_rng) > 400:
        # print("Too many points ({}x{}), decreasing granularity.".format(len(x_rng), len(y_rng)))
        prob_granularity *= 1.25
        x_rng = np.arange(x_min_unsc, x_max_unsc, prob_granularity)
        y_rng = np.arange(y_min_unsc, y_max_unsc, prob_granularity)

    # print("Generated {}x{} coordinate points.".format(x_rng.shape[0], y_rng.shape[0]))
    X_lon, Y_lat = np.meshgrid(x_rng, y_rng)

    x = X_lon.ravel()
    y = Y_lat.ravel()

    coords = np.hstack((x[:, np.newaxis], y[:, np.newaxis]))
    scaled = scaler.transform(coords)

    X = scaled[:, 0].reshape(X_lon.shape)
    Y = scaled[:, 1].reshape(Y_lat.shape)

    return X, Y, X_lon, Y_lat
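# Hedged usage sketch for create_probability_grid above: coord_scaler is a
# hypothetical StandardScaler fitted on 2-D (longitude, latitude) data, and the
# bounds are given in that scaled space.
# X_grid, Y_grid, lon_grid, lat_grid = create_probability_grid(
#     x_min=-2.0, x_max=2.0, y_min=-2.0, y_max=2.0,
#     scaler=coord_scaler, prob_granularity=0.001)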
fig.add_trace(
    go.Scatter(x=test[time_steps:].date,
               y=test_score_df.loss,
               mode='lines',
               name='Test Loss'))
fig.add_trace(
    go.Scatter(x=test[time_steps:].date,
               y=test_score_df.threshold,
               mode='lines',
               name='Threshold'))

fig.update_layout(showlegend=True)
fig.show()

anomallies = test_score_df[test_score_df.anomaly == True]
anomallies.head()

fig = go.Figure()
fig.add_trace(
    go.Scatter(x=test[time_steps:].date,
               y=scaler.inverse_transform(test[time_steps:].close),
               mode='lines',
               name='Close Price'))
fig.add_trace(
    go.Scatter(x=anomallies.date,
               y=scaler.inverse_transform(anomallies.close),
               mode='markers',
               name='Anomaly'))

fig.update_layout(showlegend=True)
fig.show()
Example #24
X=sc_X.fit_transform(X)
sc_y=StandardScaler()
y=sc_y.fit_transform(y)

# Training the SVR model on the whole dataset
from sklearn.svm import SVR
regressor = SVR(kernel = 'rbf') # We can choose the kernel; some kernels learn linear and some non-linear relationships
regressor.fit(X, y)

# Predicting a new result
# let's say we want to predict y when X=6.5
X=6.5
X=[[6.5]] # first, we convert X into a 2-D array because regressor.predict expects a 2-D array as input
X=sc_X.transform(X) # second, we scale the X we want to predict, because the model was trained on X scaled with sc_X
y=regressor.predict(X) # third, we predict y; keep in mind this prediction is still on the scale that was applied to y
y=sc_y.inverse_transform(y) # fourth, we reverse the scaling that was applied to y using sc_y

# Visualising the SVR results
plt.scatter(sc_X.inverse_transform(X), sc_y.inverse_transform(y), color = 'red') # plot X and the actual y in their original scale (needed because X and y were transformed earlier)
plt.plot(sc_X.inverse_transform(X), sc_y.inverse_transform(regressor.predict(X)), color = 'blue') # plot X against the predicted y, unscaled with sc_y.inverse_transform
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

# Visualising the SVR results (for higher resolution and smoother curve)
X_grid = np.arange(min(sc_X.inverse_transform(X)), max(sc_X.inverse_transform(X)), 0.1)
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(sc_X.inverse_transform(X), sc_y.inverse_transform(y), color = 'red')
plt.plot(X_grid, sc_y.inverse_transform(regressor.predict(sc_X.transform(X_grid))), color = 'blue')
plt.title('Truth or Bluff (SVR)')
Example #25
x = data.iloc[:, 1:-1].values
y = data.iloc[:, -1].values

y = y.reshape(len(y), 1)

from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
sc_y = StandardScaler()
x = sc_x.fit_transform(x)
y = sc_y.fit_transform(y)

from sklearn.svm import SVR
reg = SVR(kernel='rbf')  # gaussian radial basis function kernel
reg.fit(x, y)

output = sc_y.inverse_transform(reg.predict(sc_x.transform([[6.5]])))
print(output)

plt.scatter(sc_x.inverse_transform(x), sc_y.inverse_transform(y), color='red')
plt.plot(sc_x.inverse_transform(x),
         sc_y.inverse_transform(reg.predict(x)),
         color='blue')
plt.title('Experience vs Salary')
plt.xlabel('Experience')
plt.ylabel('Salary')
plt.show()

x_grid = np.arange(min(x), max(x), 0.1)
x_grid = x_grid.reshape((len(x_grid), 1))
plt.scatter(x, y, color='red')
plt.plot(x_grid, reg.predict(x_grid), color='blue')
Example #26
class DNN(object):
    def __init__(self,
                 num_layers_range: list = [1, 4, 10],
                 use_dropout: bool = False,
                 use_l2_regularization: bool = False):
        self.logger = logging.getLogger("AutoNet")

        self.num_layers_range = num_layers_range

        self.use_dropout = use_dropout
        self.use_l2_regularization = use_l2_regularization

        self.scalerX = StandardScaler()
        self.scalerY = StandardScaler()

    def fit(self,
            X,
            y,
            max_epochs: int,
            runcount_limit: int = 100,
            wc_limit: int = 60,
            config: Configuration = None,
            seed: int = 12345):

        X_all = None
        y_all = None
        for idx, (X_q, y_q) in enumerate(zip(X, y)):
            if idx == 0:
                X_all = X_q
                y_all = y_q
            else:
                X_all = np.vstack([X_all, X_q])
                y_all = np.hstack([y_all, y_q])

        def obj_func(config, instance=None, seed=None, pc=None):
            # continuing training if pc is given
            # otherwise, construct new DNN

            models = []
            losses = []

            for model_idx, [train_idx, valid_idx] in enumerate(
                    [[0, 3], [3, 0], [1, 2], [2, 1]]):

                X_train = X[train_idx]
                y_train = y[train_idx]

                X_train = self.scalerX.fit_transform(X_train)
                y_train = np.log10(y_train)
                y_train = self.scalerY.fit_transform(y_train.reshape(-1, 1))[:, 0]

                X_valid, y_valid = X_all, y_all
                X_valid = self.scalerX.transform(X_valid)
                y_valid = np.log10(y_valid)
                y_valid = self.scalerY.transform(y_valid.reshape(-1, 1))[:, 0]

                if pc is None:

                    if model_idx == 0:
                        K.clear_session()
                    model = ParamFCNetRegression(
                        config=config,
                        n_feat=X_train.shape[1],
                        expected_num_epochs=max_epochs,
                        n_outputs=1,
                        verbose=1)
                else:
                    model = pc[model_idx]

                history = model.train(X_train=X_train,
                                      y_train=y_train,
                                      X_valid=X_valid,
                                      y_valid=y_valid,
                                      n_epochs=1)

                models.append(model)

                final_loss = history["val_loss"][-1]
                losses.append(final_loss)

            return np.mean(losses), {"model": models}

        taf = SimpleTAFunc(obj_func)
        cs = ParamFCNetRegression.get_config_space(
            num_layers_range=self.num_layers_range,
            use_l2_regularization=self.use_l2_regularization,
            use_dropout=self.use_dropout)

        print(cs)

        ac_scenario = Scenario({
            "run_obj": "quality",  # we optimize quality
            "runcount-limit": max_epochs * runcount_limit,
            "wallclock-limit": wc_limit,
            "cost_for_crash": 10,
            "cs": cs,
            "deterministic": "true",
            "abort_on_first_run_crash": False,
            "output-dir": ""
        })

        intensifier = Intensifier(tae_runner=taf,
                                  stats=None,
                                  traj_logger=None,
                                  rng=np.random.RandomState(42),
                                  run_limit=100,
                                  max_epochs=max_epochs)

        if isinstance(config, dict):
            config = fix_types(configuration=config, configuration_space=cs)
            config = Configuration(configuration_space=cs, values=config)
        elif runcount_limit == 1:
            config = cs.get_default_configuration()
        else:
            smac = SMAC(scenario=ac_scenario,
                        tae_runner=taf,
                        rng=np.random.RandomState(seed),
                        intensifier=intensifier)

            smac.solver.runhistory.overwrite_existing_runs = True
            config = smac.optimize()

        print("Final Incumbent")
        print(config)

        X_all = self.scalerX.fit_transform(X_all)
        y_all = np.log10(y_all)
        y_all = self.scalerY.fit_transform(y_all.reshape(-1, 1))[:, 0]

        K.clear_session()

        start_time = time.time()

        model = ParamFCNetRegression(config=config,
                                     n_feat=X_all.shape[1],
                                     expected_num_epochs=max_epochs,
                                     n_outputs=1,
                                     verbose=1)

        history = model.train(X_train=X_all,
                              y_train=y_all,
                              X_valid=X_all,
                              y_valid=y_all,
                              n_epochs=max_epochs)

        print("Training Time: %f" % (time.time() - start_time))

        self.model = model

    def predict(self, X_test):

        X_test = self.scalerX.transform(X_test)

        y_pred = self.model.predict(X_test)

        y_pred = self.scalerY.inverse_transform(y_pred)
        y_pred = 10**y_pred

        y_pred = np.maximum(0.0005, y_pred)

        return y_pred
Example #27
# Splitting the dataset into the Training set and Test set
"""from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"""

# Feature Scaling
# The SVR model does not handle feature scaling itself, so we handle it manually:
# the training data and the target need to be on the same scale
from sklearn.preprocessing import StandardScaler
x_scaler = StandardScaler()
y_scaler = StandardScaler()
X = x_scaler.fit_transform(X)
y = y_scaler.fit_transform(y.reshape(-1, 1))

# let us start with the SVM model, which is very straightforward
from sklearn.svm import SVR
# we need to define the kernel; the default kernel is already rbf, but we set it explicitly for clarity
svr = SVR(kernel='rbf')
# train model
svr.fit(X, y)
# let us predict a value; importantly, the input must be transformed first
inputValue = 6.5
inputValueArr = np.array([[inputValue]])
predict = svr.predict(x_scaler.transform(inputValueArr))

predict = y_scaler.inverse_transform(predict)

plt.scatter(X, y, color='red')
plt.plot(X, svr.predict(X), color='green')
plt.title('Prediction Based on SVR')
plt.show()
Example #28
X_train = X_normalizer.fit_transform(X_train)
X_test = X_normalizer.transform(X_test)
y_normalizer = StandardScaler()
y_train = y_normalizer.fit_transform(y_train)
y_test = y_normalizer.transform(y_test)

# Create the KNN regressor with K set to 2
knn = KNeighborsRegressor(n_neighbors=2)
# Start training
# ravel() flattens the multi-dimensional target into a 1-D array
knn.fit(X_train, y_train.ravel())

# Feed the test data to the trained knn object for prediction
y_pred = knn.predict(X_test)
# Invert the standardization to restore the predictions to their original scale
y_pred_inv = y_normalizer.inverse_transform(y_pred)
y_test_inv = y_normalizer.inverse_transform(y_test)

# Scatter plot with predictions on the x-axis and true values on the y-axis
plt.scatter(y_pred_inv, y_test_inv)
plt.xlabel('Prediction')
plt.ylabel('Real value')

# Draw the diagonal line y = x through the origin to show how far the blue (prediction, true value) points deviate from being 100% correct
diagonal = np.linspace(500, 1500, 100)  # 100 evenly spaced values from 500 to 1500
plt.plot(diagonal, diagonal, '-r')
plt.xlabel('Predicted ask price')
plt.ylabel('Ask price')
plt.show()

print(y_pred_inv)
Example #29
x_train, x_test, y_train, y_test = train_test_split( dp, indp, test_size= 0.2, random_state= 0)"""

# Feature scaling
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
sc_y = StandardScaler()
x = sc_x.fit_transform(x)
y = sc_y.fit_transform(y)

# Fitting the Regression to the dataset
from sklearn.svm import SVR
regresor = SVR(kernel='rbf')
regresor.fit(x, y)

# predicting a new result with the SVR regressor
y_pred = sc_y.inverse_transform(
    regresor.predict(sc_x.transform(np.array([[6.5]]).reshape(1, 1))))

# visualising SVR regression
plt.scatter(x, y, c='b')
plt.plot(x, regresor.predict(x), c='r')
plt.title('salary vs position (SVR)')
plt.xlabel('position')
plt.ylabel('salary')
plt.show()

# visualising svr regression
x_grid = np.arange(min(x), max(x), 0.1)
x_grid = x_grid.reshape((len(x_grid), 1))
plt.scatter(x, y, c='g')
plt.plot(x_grid, regresor.predict(x_grid), c='r')
plt.title('salary vs position (SVR)')
Example #30
param_grid = [{'n_neighbors': [5, 15, 52, 168], 'weights': ['uniform']}]
clf = GridSearchCV(neigh, param_grid, scoring='r2', cv=10, refit=True)  # scoring='neg_mean_squared_error'
clf.fit(X_scaled, Y_scaled)

# loading all data points (not just the inliers) -------------------------------
X = country_data.loc[:, columns_to_consider].values.reshape(-1, len(
    columns_to_consider))  # comment to ignore outliers in the predictions
X_scaled = scaler_X.transform(X)
X_scaled[:, 0] = FR_RD_scaler * X_scaled[:, 0]
Y = country_data.loc[:, ['Price', 'Demand']].values.reshape(-1, 2)  # comment to ignore outliers in the predictions
Y_scaled = scaler_Y.transform(Y)
# wind_data_year_tech = wind_data_year_tech[inliers == 1]          #uncomment to ignore outliers in the predictions

# prediction and error calculation for base model
Y_pred = clf.predict(X_scaled)
Y_pred_org_scale = scaler_Y.inverse_transform(Y_pred)
mse = mean_squared_error(Y[:, 0], Y_pred_org_scale[:, 0])
r2 = r2_score(Y[:, 0], Y_pred_org_scale[:, 0])
orders = np.argsort(X_scaled[:, 0].flatten())
X_scaled_sorted = X_scaled[orders]
X_sorted_org_scale = X[orders]
Y_pred_ordered = clf.predict(X_scaled_sorted)
Y_pred_ordered_org_scale = scaler_Y.inverse_transform(Y_pred_ordered)

# prediction and error calculation for reduced residual demand
X_reduced = np.copy(X)
X_reduced[:, 0] = X_reduced[:, 0] - res_gen
X_reduced_scaled = scaler_X.transform(X_reduced)
X_reduced_scaled[:, 0] = FR_RD_scaler * X_reduced_scaled[:, 0]
Y_pred_reduced = clf.predict(X_reduced_scaled)
Y_pred_reduced_org_scale = scaler_Y.inverse_transform(Y_pred_reduced)
Example #31
            # plot.figure(figsize=(4, 4))
            plot.figure()
            plot.imshow(current_ard,
                        interpolation='nearest',
                        aspect='auto',
                        origin='upper')
            plot.colorbar()
            plot.title('Latent Dim {}'.format(q))
        plot.show()
        quit(0)

        # Inverse the normalization and view predicted image.
        show_plots = False
        save_plots = False
        if show_plots or save_plots:
            ground_truth = scaler.inverse_transform(test_data)
            mrd_predicted_mean = np.load(mrd_results_file)['predicted_mean']
            dp_gp_lvm_predicted_mean = np.load(
                dp_gp_lvm_results_file)['predicted_mean']
            mrd_predicted_images = scaler.inverse_transform(
                np.hstack((test_data[:, :num_observed_dimensions],
                           mrd_predicted_mean)))
            dp_gp_lvm_predicted_images = scaler.inverse_transform(
                np.hstack((test_data[:, :num_observed_dimensions],
                           dp_gp_lvm_predicted_mean)))
            # assert ground_truth.shape[0] == predicted_images.shape[0]
            for i in range(ground_truth.shape[0]):
                fig_size = (3, 2)  # (10, 5)
                fig, (ax1, ax2, ax3) = plot.subplots(nrows=1,
                                                     ncols=3,
                                                     sharey='row',
Example #32
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()
sc_y = StandardScaler()
X_train = sc_X.fit_transform(X_train)
y_train = sc_y.fit_transform(y_train)

# Training the SVR model on the Training set
from sklearn.svm import SVR

regressor = SVR(kernel='rbf')
regressor.fit(X_train, y_train)

# Predicting the Test set results
y_pred = sc_y.inverse_transform(regressor.predict(sc_X.transform(X_test)))
np.set_printoptions(precision=2)
print(
    np.concatenate(
        (y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

# Evaluating the Model Performance
from sklearn.metrics import r2_score

print("Model Score")
print(r2_score(y_test, y_pred))
X = data[:-1]
y = data[1:]

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False)

train_data_gen = TimeseriesGenerator(X_train, y_train, length=window_size, batch_size=batch_size, shuffle=False)
test_data_gen = TimeseriesGenerator(X_test, y_test, length=window_size, batch_size=batch_size, shuffle=False)

model = Sequential()
model.add(CuDNNGRU(4, input_shape=(window_size, 1,)))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
history = model.fit_generator(train_data_gen, epochs=epochs).history

index = [df['Open'][0]]
for i, d in enumerate(scaler.inverse_transform(data)):
    index.append(index[i] + d)

index_train = [df['Open'][0]]
for i, d in enumerate(scaler.inverse_transform(model.predict_generator(train_data_gen))):
    index_train.append(index_train[i] + d)

index_test = [index_train[-1]]
for i, d in enumerate(scaler.inverse_transform(model.predict_generator(test_data_gen))):
    index_test.append(index_test[i] + d)

begin = window_size
join = begin + len(index_train)
end = join + len(index_test)
plt.plot(index)
plt.plot(list(range(begin, join)), index_train)
Example #34



# Import KNeighborsRegressor (the K-nearest-neighbours regressor) from sklearn.neighbors.
from sklearn.neighbors import KNeighborsRegressor

# Initialize a KNN regressor configured for uniform-average prediction: weights='uniform'.
uni_knr = KNeighborsRegressor(weights='uniform')
uni_knr.fit(X_train, y_train)
uni_knr_y_predict = uni_knr.predict(X_test)

# Initialize a KNN regressor configured for distance-weighted prediction: weights='distance'.
dis_knr = KNeighborsRegressor(weights='distance')
dis_knr.fit(X_train, y_train)
dis_knr_y_predict = dis_knr.predict(X_test)

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error


# Evaluate the uniform-weighted KNN model on the test set with R-squared, MSE and MAE.
print('R-squared value of uniform-weighted KNeighborRegression:', uni_knr.score(X_test, y_test))
print('The mean squared error of uniform-weighted KNeighborRegression:', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(uni_knr_y_predict)))
print('The mean absolute error of uniform-weighted KNeighborRegression:', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(uni_knr_y_predict)))


# Evaluate the distance-weighted KNN model on the test set with R-squared, MSE and MAE.
print('R-squared value of distance-weighted KNeighborRegression:', dis_knr.score(X_test, y_test))
print('The mean squared error of distance-weighted KNeighborRegression:', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(dis_knr_y_predict)))
print('The mean absolute error of distance-weighted KNeighborRegression:', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(dis_knr_y_predict)))
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
#X_set, y_set = X_test, y_test
X1_n, X2_n = np.meshgrid(
    np.arange(start=X_set[:, 0].min(),
              stop=X_set[:, 0].max() + 1,
              step=(abs(X_set[:, 0].min()) + abs(X_set[:, 0].max() + 1)) /
              1000),
    #step = 1),
    np.arange(start=X_set[:, 1].min(),
              stop=X_set[:, 1].max() + 1,
              step=(abs(X_set[:, 1].min()) + abs(X_set[:, 1].max() + 1)) /
              1000))
#step = 10000))
X_set, y_set = sc_X.inverse_transform(X_train), y_train
#X_set, y_set = sc_X.inverse_transform(X_test), y_test
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min(),
              stop=X_set[:, 0].max() + 10,
              step=(abs(X_set[:, 0].max() + 10 - abs(X_set[:, 0].min())) /
                    1000)),
    np.arange(
        start=X_set[:, 1].min(),
        stop=X_set[:, 1].max() + 10000,
        #step = 0.01))
        step=(abs(X_set[:, 1].max() + 10000 - abs(X_set[:, 1].min())) / 1000)))
plt.contourf(X1,
             X2,
             classifier.predict(np.array([X1_n.ravel(),
                                          X2_n.ravel()
Example #36
"""
Created on Mon Apr 15 02:50:11 2019

@author: mohamed nabil
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
file = pd.read_csv("Position_Salaries.csv")

x = file.iloc[:, 1:2].values
y = file.iloc[:, 2:].values

#applying feature scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
x = sc_X.fit_transform(x)
sc_Y = StandardScaler()
y = sc_Y.fit_transform(y)
from sklearn.svm import SVR
# you should use a suitable kernel
model = SVR(kernel='rbf')
model.fit(x, y)

y_pred = model.predict(x)

plt.scatter(sc_X.inverse_transform(x), sc_Y.inverse_transform(y), color="blue")
plt.plot(sc_X.inverse_transform(x),
         sc_Y.inverse_transform(model.predict(x)),
         color="yellow")
plt.show()
    ssy = StandardScaler().fit(y_train)
    y_train_std = ssy.transform(y_train)
    y_test_std = ssy.transform(y_test)

    '''preprocessing: add a column for the bias term'''
    ones = np.ones(X_train_std.shape[0]).reshape(-1, 1)
    X_train_std = np.concatenate((ones, X_train_std), axis=1)
    ones = np.ones(X_test_std.shape[0]).reshape(-1, 1)
    X_test_std = np.concatenate((ones, X_test_std), axis=1)

    # no ridge
    # train
    Lin = OrdinaryLinearRegression(Ridge=False)
    Lin.fit(X_train_std, y_train_std)
    y_pred_train_std = Lin.predict(X_train_std)
    y_pred_train = ssy.inverse_transform(y_pred_train_std)
    Base_MSE_train = Lin.score(y_train, y_pred_train)
    # test
    y_pred_test_std = Lin.predict(X_test_std)
    y_pred_test = ssy.inverse_transform(y_pred_test_std)
    Base_MSE_test = Lin.score(y_test, y_pred_test)
    # with ridge
    Lambda_list = np.arange(0, 2, 0.001)
    MSE_list = []
    for lambdaval in Lambda_list:
        Lin = OrdinaryLinearRegression(Ridge=True, Lambda=lambdaval)
        Lin.fit(X_train_std, y_train_std)
        # y_pred_train_std = Lin.predict(X_train_std)
        # y_pred_train = ssy.inverse_transform(y_pred_train_std)
        # Base_MSE_train_ridge = Lin.score(y_train, y_pred_train)
        # test
N, D = X_train.shape

y_train_A = y_train_A.reshape(-1)
y_train_B = y_train_B.reshape(-1)
y_test_A = y_test_A.reshape(-1)
y_test_B = y_test_B.reshape(-1)

# from sklearn.svm import SVR
# model_A = SVR(kernel = 'rbf')
# model_A.fit(X_train, y_train_A)

# model_B = SVR(kernel = 'rbf')
# model_B.fit(X_train, y_train_B)

y_pred_A = model_A.predict(X_test)
y_pred_A = objy.inverse_transform(y_pred_A)
y_test_A = objy.inverse_transform(y_test_A)
X_test = obj.inverse_transform(X_test)
print(y_test_A.shape)
print(y_pred_A.shape)

y_pred_B = model_B.predict(X_test)
y_pred_B = objy.inverse_transform(y_pred_B)
y_test_B = objy.inverse_transform(y_test_B)
X_test = obj.inverse_transform(X_test)
print(y_test_B.shape)
print(y_pred_B.shape)

shape = (174, 142, 1)
imageL = X_test[:, 0].reshape(shape)
imagea = y_pred_A.reshape(shape)
Example #39
hist = model.fit(xtrain,
                 ytrain,
                 epochs=epo,
                 batch_size=64,
                 callbacks=[cbks],
                 validation_freq=epostep,
                 validation_data=(xtest, ytest),
                 verbose=2)
stop = time.process_time()
print("Print Time for taining: ", stop - start)

trainlossall = np.array(hist.history['mean_absolute_error'])
testlossall = np.array(hist.history['val_mean_absolute_error'])

# Predict LUMO with model
pred_test = scaler.inverse_transform(model.predict(xtest))
true_test = scaler.inverse_transform(ytest)
mae_valid = np.mean(np.abs(pred_test - true_test))

# Plot loss vs epochs
plt.figure()
plt.plot(np.arange(trainlossall.shape[0]),
         trainlossall,
         label='Training Loss',
         c='blue')
plt.plot(np.arange(epostep, epo + epostep, epostep),
         testlossall,
         label='Test Loss',
         c='red')
plt.scatter([trainlossall.shape[0]], [mae_valid],
            label="{0:0.4f} ".format(mae_valid) + "[" + data_unit + "]",
Example #40
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_Y = StandardScaler()
X = sc_X.fit_transform(X)
Y = sc_Y.fit_transform(Y)
#print(X)
#print(Y)
# Fitting SVR to the dataset
from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
regressor.fit(X, Y)

# Predicting a new result
y_pred = regressor.predict(sc_X.transform(np.array([6.5]).reshape(-1, 1)))
print("X: 6.5, Y: %8.8f" % sc_Y.inverse_transform(y_pred))

# Visualising the Linear Regression results
plt.scatter(sc_X.inverse_transform(X), sc_Y.inverse_transform(Y), color='red')
# Make the prediction curve smoother
X_grid = np.arange(min(X), max(X), 0.1)
X_grid = X_grid.reshape(-1, 1)
X = X_grid

plt.plot(sc_X.inverse_transform(X),
         sc_Y.inverse_transform(regressor.predict(X)),
         color='blue')
plt.title('Support Vector Regression')
plt.xlabel('Level')
plt.ylabel('Salary')
plt.show()
    plt.plot(np.arange(len(fake_ex)) * 20., fake_ex)
    plt.show()


#Store losses
losses = {'D': [], 'A': []}
acc = {'D': [], 'A': []}

#Main training loop
epochs = 50
batch_size = 40
for i in range(epochs):
    if (i % 5 == 0):
        fake_ex = G.predict(np.random.uniform(-1, 1, size=(1, 100)))
        fake_ex = tf.inverse_transform(fake_ex * scale)
        plot_losses(i, losses, fake_ex[0])
        print('%d of %d' % (i, epochs))

    #Make noise to feed into GAN, label with counterfeit 1's
    noise_net = np.random.uniform(-1, 1, size=(batch_size, 100))
    noise_label = np.ones(len(noise_net))

    #Freeze weights of D
    set_trainability(G, True)
    set_trainability(D, False)

    #Compilation to ensure they actually freeze
    A.compile(loss='binary_crossentropy', optimizer=optA, metrics=['accuracy'])
    D.compile(loss='binary_crossentropy', optimizer=optD, metrics=['accuracy'])
####    cluster the data
###############################################################

ll_arr = np.asarray(list(zip(df["lat"], df["lng"])), dtype="float64")

stdScaler = StandardScaler()
ll_arr = stdScaler.fit_transform(ll_arr)

k = int(math.sqrt(len(df.lat)))  # rule of thumb; the choice of k could be made more carefully
k_means = cluster.KMeans(n_clusters=k)
k_means.fit_predict(ll_arr)
k_means.fit_predict(ll_arr)

data_labels = k_means.labels_
data_cluster_centers = k_means.cluster_centers_
data_cluster_centers = stdScaler.inverse_transform(k_means.cluster_centers_)
n_clusters = len(data_cluster_centers)

data_num_each_cluster = np.zeros((n_clusters, 1))
for i in range(n_clusters):
    data_num_each_cluster[i, 0] = (data_labels == i).sum()

###############################################################
####    save results
###############################################################

records = {
    "labels": data_labels.tolist(),
    "centers": data_cluster_centers.tolist(),
    "size": data_num_each_cluster.tolist(),
    "date_created": datetime.datetime.today(),
Example #43
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

preds = regressor.predict(X_test)

#Test Set Metrics for Performance Evaluation
#R2 Score metric
from sklearn.metrics import r2_score
print(r2_score(y_test, preds))

#MSE Score
#from sklearn.metrics import mean_squared_error
#print(mean_squared_error(y_test[],preds))

#MAE Score
from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(y_test, preds))

# Visualization

y_test = sc.inverse_transform(y_test)
preds = sc.inverse_transform(preds)

#Plot of a Small Subset of the Test Set
plt.plot(y_test[0:720], color='blue', label='Real voltage')
plt.plot(preds[0:720], color='red', label='Predicted voltage')
plt.title('output - Univariate Single Step Forecasting')
plt.xlabel('Hours')
plt.ylabel('output')
plt.legend()
plt.show()
Example #44
#plt.savefig('images/10_06.png', dpi=300)
plt.show()




print('Slope: %.3f' % lr.w_[1])
print('Intercept: %.3f' % lr.w_[0])




num_rooms_std = sc_x.transform(np.array([[5.0]]))
price_std = lr.predict(num_rooms_std)
print("Price in $1000s: %.3f" % sc_y.inverse_transform(price_std))



# ## Estimating the coefficient of a regression model via scikit-learn







slr = LinearRegression()
slr.fit(X, y)
y_pred = slr.predict(X)
print('Slope: %.3f' % slr.coef_[0])
"""

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

data = pd.read_csv("Position_Salaries.csv")
x = data.iloc[:, 1:2].values
y = data.iloc[:, 2].values

from sklearn.preprocessing import StandardScaler

sc_x = StandardScaler()
sc_y = StandardScaler()
x = sc_x.fit_transform(x)
y = sc_y.fit_transform(np.reshape(y, (10, 1)))

from sklearn.svm import SVR

regressor = SVR(kernel='rbf')
regressor.fit(x, y)

y_pred = sc_y.inverse_transform(regressor.predict(sc_x.transform([[6.5]])))

plt.scatter(x, y, color='red')
plt.plot(x, regressor.predict(x), color='blue')
plt.title('truth or bluff (SVR)')
plt.xlabel('position level')
plt.ylabel('salary')
plt.show()
Example #46
with model.session:
    model.train(trnX_L=trnX_L,
                trnXs_L=trnXs_L,
                trnY_L=trnY_L,
                trnX_U=trnX_U,
                trnXs_U=trnXs_U,
                valX_L=valX_L,
                valXs_L=valXs_L,
                valY_L=valY_L,
                valX_U=valX_U,
                valXs_U=valXs_U)
    model.saver.save(model.session, save_uri)

    ## property prediction performance
    tstY_hat = scaler_Y.inverse_transform(model.predict(tstX))

    for j in range(dim_y):
        print([j, mean_absolute_error(tstY[:, j], tstY_hat[:, j])])

    ## unconditional generation
    for t in range(10):
        smi = model.sampling_unconditional()
        print([t, smi, get_property(smi)])

    ## conditional generation (e.g. MolWt=250)
    yid = 0
    ytarget = 250.
    ytarget_transform = (ytarget - scaler_Y.mean_[yid]) / np.sqrt(
        scaler_Y.var_[yid])
Example #47
labelencoder = LabelEncoder()
X[:, 2] = labelencoder.fit_transform(X[:, 2])
Z[:, 2] = labelencoder.fit_transform(Z[:, 2])

onehotencoder = OneHotEncoder(categorical_features=[2])
X = onehotencoder.fit_transform(X).toarray()
Z = onehotencoder.fit_transform(Z).toarray()

from sklearn.preprocessing import StandardScaler
fsx = StandardScaler()
fsy = StandardScaler()
X = fsx.fit_transform(X)
Z = fsx.transform(Z)
y = fsy.fit_transform(y)

from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=10, criterion='mse')
regressor = regressor.fit(X, y)

y_pred = regressor.predict(Z)
y_pred = fsy.inverse_transform(y_pred)
new_dataset = test_dataset.drop([
    'Item_Weight', 'Item_Visibility', 'Item_Fat_Content', 'Item_Type',
    'Item_MRP', 'Outlet_Establishment_Year', 'Outlet_Size',
    'Outlet_Location_Type', 'Outlet_Type'
],
                                axis=1)
new_dataset['Item_Outlet_Sales'] = y_final

new_dataset.to_csv('C:/Users/bhupe/Desktop/Big Mart/Prt.csv', index=False)
Example #48
class GaussianProcessRegression(BaseEstimator, RegressorMixin, StanCacheMixin):
    def __init__(self,
                 n_jobs=-1,
                 warmup=1000,
                 samples_per_chain=1000,
                 n_chains=4,
                 normalize=True,
                 max_samples_mem=500):
        BaseEstimator.__init__(self)
        StanCacheMixin.__init__(self, MODEL_DIR)

        self.stan_model, self.predict_model = self._load_compiled_models()

        # The control parameters for NUTS, most are left as default
        control = {
            "metric": "diag_e",  # Type of mass matrix (diag_e default)
            "stepsize_jitter": 0.05,  # Slight randomization of stepsizes
            "adapt_engaged": True,
            "adapt_gamma": 0.05,  # Regularization scale
            "adapt_delta": 0.8,  # Target acceptance probability (.8 default)
            "adapt_kappa": 0.75,  # Relaxation exponent
            "adapt_t0": 10,  # Adaptation iteration offset
            "adapt_init_buffer": 75,  # First fast adapt period
            "adapt_term_buffer": 50,  # Last fast adapt period
            "adapt_window": 25,  # First slow adapt period
            "max_treedepth": 10,  # N_leapfrog ~ 2**max_treedepth
        }

        self.stan_fitting_kwargs = {
            "chains": n_chains,
            "iter": samples_per_chain + warmup,
            "warmup": warmup,
            "init": "random",
            "init_r": 1.0,
            "n_jobs": n_jobs,
            "control": control
        }

        self._fit_results = None
        self._fit_X = None
        self.normalize = normalize
        self.max_samples_mem = max_samples_mem

        if normalize:
            self._y_ss = StandardScaler(with_mean=True)
            self._X_ss = StandardScaler()
        return

    def _posterior(self, X, **stan_fitting_kwargs):
        N, M = X.shape
        Xt = self._fit_X
        Nt = Xt.shape[0]

        if self.normalize:
            X = self._X_ss.transform(X)

        y0, alpha, rho, nu, f, sigma = self._get_param_posterior()

        # Ensure we don't use an excessive amount of memory
        mem_samples = (len(y0) * 8 * N**2) / 1e6
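        # rough estimate: (number of posterior draws) x 8 bytes per float64
        # x N**2 values per draw, converted to MB; draws are thinned by ss
        # below so the predictive sampling stays under max_samples_mem MB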
        ss = int(1 + (mem_samples // self.max_samples_mem))  # Subsample
        K = len(y0[::ss])

        data = {
            "Nt": Nt,
            "N": N,
            "M": M,
            "K": K,
            "X": X,
            "Xt": Xt,
            "alpha": alpha[::ss],
            "rho": rho[::ss],
            "nu": nu[::ss],
            "sigma": sigma[::ss],
            "f": f[::ss],
            "y0": y0[::ss]
        }
        fit_kwargs = self._setup_predict_kwargs(data, stan_fitting_kwargs)
        fit_kwargs["iter"] = 1
        fit_kwargs["chains"] = 1

        predictions = self.predict_model.sampling(**fit_kwargs,
                                                  algorithm="Fixed_param")
        y_samples = predictions.extract("y_samples")["y_samples"][0, ...]
        y_hat = predictions.extract("y_hat")["y_hat"].ravel()

        if self.normalize:
            y_samples = np.vstack(
                [self._y_ss.inverse_transform(y_s) for y_s in y_samples])
            y_hat = self._y_ss.inverse_transform(y_hat)
        return y_hat, y_samples

    def _get_param_posterior(self):
        if self._fit_results is None:
            raise NotFittedError("Model isn't fit!")
        df = self._fit_results.to_dataframe()

        y0 = df.loc[:, "y0"].to_numpy()
        alpha = df.loc[:, "alpha"].to_numpy()
        rho = df.loc[:, "rho"].to_numpy()
        nu = df.loc[:, "nu"].to_numpy()
        sigma = df.loc[:, "sigma"].to_numpy()

        f = df.loc[:, [c for c in df.columns if c[:2] == "f["]].to_numpy()
        return y0, alpha, rho, nu, f, sigma

    def fit(self, X, y, sample_weight=None, **stan_fitting_kwargs):
        if sample_weight is not None:
            raise NotImplementedError("sampling weighting is not implemented.")
        N, M = X.shape

        if self.normalize:
            y = self._y_ss.fit_transform(y)
            X = self._X_ss.fit_transform(X)

        y = y.ravel()
        data = {"N": N, "M": M, "X": X, "y": y}

        pars = ["y0", "alpha", "rho", "nu", "sigma", "f"]

        stan_fitting_kwargs.update({"pars": pars})
        fit_kwargs = self._setup_predict_kwargs(data, stan_fitting_kwargs)
        self._fit_results = self.stan_model.sampling(**fit_kwargs)
        self._fit_X = X

        print(
            self._fit_results.stansummary(
                pars=["y0", "alpha", "rho", "nu", "sigma"],
                probs=[0.1, 0.5, 0.9]))
        return

    def predict(self, X, ret_posterior=False, **stan_fitting_kwargs):
        y_hat, y_samples = self._posterior(X, **stan_fitting_kwargs)
        if ret_posterior:
            return y_hat, y_samples
        return y_hat

    def plot_posterior_params(self, show=False):
        """
        A helper method to plot the posterior parameter distribution.
        Will raise an error if .fit hasn't been called.
        """
        param_df = self._fit_results.to_dataframe()
        col_names = ["y0", "alpha", "rho", "nu", "sigma"]
        var_names = [
            "$y_0$", "$\\alpha$", "$\\rho$", "$\\mathsf{log}_{10}(\\nu)$",
            "$\\sigma$"
        ]

        param_df.loc[:, "nu"] = np.log10(param_df.loc[:, "nu"])
        param_df = param_df.loc[:, col_names]
        param_df = param_df.rename(
            {frm: to
             for frm, to in zip(col_names, var_names)}, axis=1)

        fig, ax = plt.subplots(1, 1)
        ax.set_title(
            "Parameter Posterior Marginals: "
            "$y \\sim \\mathcal{T}(\\nu, y_0 + \mathcal{GP}(\\alpha, \\rho), "
            "\\sigma)$")
        sns.boxplot(data=param_df.melt(value_name="Posterior Samples",
                                       var_name="Parameter"),
                    x="Parameter",
                    y="Posterior Samples",
                    ax=ax)
        if show:
            plt.show()
        return fig, ax
clf = LogisticRegression()
scaler = StandardScaler()

# create a linear model with LogisticRegression
model = LinearModel(clf)

# fit the classifier on MEG data
X = scaler.fit_transform(meg_data)
model.fit(X, labels)

# Extract and plot spatial filters and spatial patterns
for name, coef in (('patterns', model.patterns_), ('filters', model.filters_)):
    # We fitted the linear model onto Z-scored data. To make the filters
    # interpretable, we must reverse this normalization step
    coef = scaler.inverse_transform([coef])[0]
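    # (for a default StandardScaler, inverse_transform is coef * scaler.scale_ + scaler.mean_)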

    # The data was vectorized to fit a single model across all time points and
    # all channels. We thus reshape it:
    coef = coef.reshape(len(meg_epochs.ch_names), -1)

    # Plot
    evoked = EvokedArray(coef, meg_epochs.info, tmin=epochs.tmin)
    evoked.plot_topomap(title='MEG %s' % name)

###############################################################################
# Let's do the same on EEG data using a scikit-learn pipeline

X = epochs.pick_types(meg=False, eeg=True)
y = epochs.events[:, 2]
Beispiel #50
0
# Feature Scaling
# we don't split into separate training and test sets here
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)   # fit the scaler on X and scale it in one step
y = sc_y.fit_transform(y)

# Fitting the Regression Model to the dataset
from sklearn.svm import SVR
regressor = SVR(kernel = 'rbf')
regressor.fit(X, y)

# Predicting a new result
# we also want the prediction back on the original salary scale
y_pred = sc_y.inverse_transform(regressor.predict(sc_X.transform(np.array([[6.5]]))))  # transform() expects a 2-D array, hence the double brackets for a single value
 
# Visualising the SVR results
plt.scatter(X, y, color = 'red')
plt.plot(X, regressor.predict(X), color = 'blue')
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

# Visualising the SVR results (for higher resolution and smoother curve)
X_grid = np.arange(min(X), max(X), 0.01) # choice of 0.01 instead of 0.1 step because the data is feature scaled
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(X, y, color = 'red')
plt.plot(X_grid, regressor.predict(X_grid), color = 'blue')
plt.title('Truth or Bluff (SVR)')
def main(argv):
    dbscan_heuristic_mode = False
    dpgmm_mode = False
    do_plot_clusters = False
    do_dump_clusters = False
    try:
        opts, args = getopt.getopt(argv,"hegdp")
    except getopt.GetoptError:
        print('elviz_cluster.py [-h] [-e] [-g] [-d] [-p]')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('elviz_cluster.py [-h] [-e] [-g] [-d] [-p]')
            print('  -h = help, -e = run dbscan' +
                  ' epsilon heuristic plot generation code')
            print('  -g = use a DPGMM for clustering')
            print('  -p = plot the clusters to a PDF file')
            print('  -d = dump the clusters to a text file')
            sys.exit()
        elif opt == '-e':
            dbscan_heuristic_mode = True
        elif opt == '-g':
            dpgmm_mode = True
        elif opt == '-p':
            do_plot_clusters = True
        elif opt == '-d':
            do_dump_clusters = True

    [elviz_data, combined_df] = read_pickle_or_CSVs(DATA_PICKLE, RAW_DATA_DIR)

    # Setup plotting limits
    print("determining plotting limits")
    limits = {"x": [combined_df['Average fold'].min(), MAX_AVG_FOLD],
              "y": [combined_df['Reference GC'].min(), combined_df['Reference GC'].max()]}
    # The x upper limit uses the fixed MAX_AVG_FOLD rather than the data maximum:
    # limits["x"] = [combined_df['Average fold'].min(), combined_df['Average fold'].max()]

    print("normalizing data prior to clustering")
    # fit the scaler on the combined data so that every file is scaled (and
    # later inverse-transformed) with the same parameters
    scaler = StandardScaler().fit(combined_df[CLUSTER_COLUMNS])

    if dbscan_heuristic_mode:
        print("making DBSCAN heuristic plots")
        dbscan_heuristic(elviz_data, scaler)
        os.sys.exit()

    print("serially processing files")
    for filename in elviz_data.keys():
        pdf_filename = filename.replace("csv", "pdf")
        # skip if the PDF already exists
        if os.path.isfile(RESULTS_DIR + pdf_filename):
            print("skipping file %s" % filename)
            continue
        print("processing file %s" % filename)

        df = elviz_data[filename]

        # create a multipage PDF for storing the plots
        with PdfPages(RESULTS_DIR + pdf_filename) as pdf:
            # find unique values of taxonomy columns
            dfgb = df.groupby(['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species'])
            for key in dfgb.indices.keys():
                idx = dfgb.indices[key]
                tax_rows = df.iloc[idx]
                if len(tax_rows) < MIN_ROWS:
                    continue
                # normalize all dimensions to be used in clustering, e.g. GC, coverage, rpk
                # reuse the scaler we created from all of the data for the transform
                tax_rows_cluster_columns = scaler.transform(tax_rows[CLUSTER_COLUMNS])

                if not dpgmm_mode:
                    db = DBSCAN(eps=EPS, min_samples=MIN_SAMPLES)
                    db.fit(tax_rows_cluster_columns)

                    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
                    core_samples_mask[db.core_sample_indices_] = True
                    labels = db.labels_
                else:
                    db = mixture.DPGMM(n_components=DPGMM_N_COMPONENTS, n_iter=100,
                                       covariance_type='full', alpha=100, verbose=0)
                    db.fit(tax_rows_cluster_columns)
                    Y_ = db.predict(tax_rows_cluster_columns)
                    for i, (mean, covar) in enumerate(zip(
                        db.means_, db._get_covars())):
                        if not np.any(Y_ == i):
                            continue
                        #plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)
                    labels = Y_
                    core_samples_mask = np.zeros_like(labels, dtype=bool)
                    core_samples_mask[:] = True
                            
                #print(labels)
                #print(type(labels))

                # number of clusters in labels, ignoring noise if present.
                n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

                if n_clusters_ < 1:
                    continue

                #print(tax_rows_cluster_columns)

                title = ', '.join(key)
                if do_plot_clusters:
                    plot_clusters(pdf, scaler.inverse_transform(tax_rows_cluster_columns),
                              title, labels, core_samples_mask, limits)
                if do_dump_clusters:
                    dump_clusters(filename, key, labels, tax_rows[CONTIG_COLUMN])
Beispiel #52
0
est = ensemble.RandomForestRegressor()
gs = GridSearchCV(est,
                  cv=10,
                  param_grid=hyper_params,
                  verbose=2,
                  n_jobs=n_jobs,
                  scoring='r2')

t0 = time.time()
gs.fit(x_train, y_train.ravel())
runtime = time.time() - t0
print("Complexity and bandwidth selected and model fitted in %.6f s" % runtime)
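
# Note: the scores below are reported in the original target units; both the
# targets and the predictions are mapped back with sc_y.inverse_transform.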

train_score_mse = mean_squared_error(
    sc_y.inverse_transform(y_train),
    sc_y.inverse_transform(gs.predict(x_train)))
train_score_mae = mean_absolute_error(
    sc_y.inverse_transform(y_train),
    sc_y.inverse_transform(gs.predict(x_train)))
train_score_evs = explained_variance_score(
    sc_y.inverse_transform(y_train),
    sc_y.inverse_transform(gs.predict(x_train)))
train_score_me = max_error(sc_y.inverse_transform(y_train),
                           sc_y.inverse_transform(gs.predict(x_train)))
train_score_r2 = r2_score(sc_y.inverse_transform(y_train),
                          sc_y.inverse_transform(gs.predict(x_train)))

test_score_mse = mean_squared_error(sc_y.inverse_transform(y_test),
                                    sc_y.inverse_transform(gs.predict(x_test)))
test_score_mae = mean_absolute_error(
    sc_y.inverse_transform(y_test),
    sc_y.inverse_transform(gs.predict(x_test)))
Beispiel #53
0
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y)

# Fitting SVR to the dataset
from sklearn.svm import SVR
regressor = SVR(kernel = 'rbf')
regressor.fit(X, y)

# Predicting a new result
# the model was trained on scaled data, so scale the query and map the
# prediction back to the original salary scale
y_pred = regressor.predict(sc_X.transform(np.array([[6.5]])))
y_pred = sc_y.inverse_transform(y_pred)

# Visualising the SVR results
plt.scatter(X, y, color = 'red')
plt.plot(X, regressor.predict(X), color = 'blue')
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

# Visualising the SVR results (for higher resolution and smoother curve)
X_grid = np.arange(min(X), max(X), 0.01) # choice of 0.01 instead of 0.1 step because the data is feature scaled
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(X, y, color = 'red')
plt.plot(X_grid, regressor.predict(X_grid), color = 'blue')
plt.title('Truth or Bluff (SVR)')
Beispiel #54
0
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"""

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y.reshape(-1, 1))
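# StandardScaler expects a 2-D array, hence the reshape of the 1-D target into a single column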

# Fitting SVR to the dataset
from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
regressor.fit(X, y)

# Predicting a new result
y_pred = sc_y.inverse_transform(
    regressor.predict(sc_X.transform(np.array([[6.5]]))))

# Visualising the SVR results
plt.scatter(X, y, color='red')
plt.plot(X, regressor.predict(X), color='blue')
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

# Visualising the SVR results (for higher resolution and smoother curve)
X_grid = np.arange(
    min(X), max(X), 0.01
)  # choice of 0.01 instead of 0.1 step because the data is feature scaled
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(X, y, color='red')
Beispiel #55
0
class BayesLinearRegression(BaseEstimator, RegressorMixin, StanCacheMixin):
    def __init__(self,
                 n_jobs=-1,
                 warmup=1000,
                 samples_per_chain=1000,
                 n_chains=4,
                 normalize=True,
                 max_samples_mem=500):
        """
        An interface to the following stan model

        y0 ~ cauchy(0, 1);
        nu ~ cauchy(0, 1);
        sigma ~ normal(0, 1);  // half-normal
        lam ~ exponential(1);
        theta ~ normal(0, lam);
        y ~ student_t(nu, y0 + Q * theta, sigma);

        params:
          n_jobs: Number of cores to use
          warmup: Number of warmup iterations for HMC, roughly analogous
              to a burn-in period.
          samples_per_chain: Number of samples to draw per chain
          n_chains: Number of chains (should run at least 2)
          normalize: Whether to normalize the data before feeding it
              to stan.  This is necessary as the priors in the model
              are fixed.
          max_samples_mem: A parameter to prevent blowing up all the
              memory when sampling the posterior predictive.
        """
        BaseEstimator.__init__(self)
        StanCacheMixin.__init__(self, MODEL_DIR)

        self.stan_model, self.predict_model = self._load_compiled_models()

        self.stan_fitting_kwargs = {
            "chains": n_chains,
            "iter_sampling": samples_per_chain,
            "iter_warmup": warmup,
            "inits": 1,
            "metric": "diag_e",
            "adapt_delta": 0.8
        }

        self._fit_results = None
        self.normalize = normalize
        self.max_samples_mem = max_samples_mem

        if normalize:
            self._y_ss = StandardScaler()
            self._X_ss = StandardScaler()
        return

    def get_results(self, params=None, results_obj=None):
        if results_obj is None:
            results_obj = self._fit_results

        param_df = results_obj.get_drawset(params)
        param_df = param_df.rename(
            {
                param: "[".join(param.split(".")) + "]"
                for param in param_df.columns if "." in param
            },
            axis="columns")
        return param_df

    def extract_ary(self, param, results_obj=None):
        param_df = self.get_results([param], results_obj)
        return param_df[param].to_numpy()

    def _posterior(self, X, **stan_fitting_kwargs):
        N, M = X.shape

        if self.normalize:
            X = self._X_ss.transform(X)

        y0, beta, sigma, nu = self._get_param_posterior()

        # Ensure we don't use an excessive amount of memory
        # TODO: max_samples_mem is a massive underestimate of
        # TODO: the amount of memory used, why?
        mem_samples = len(y0) * 8 * N / 1e6
        ss = int(1 + (mem_samples // self.max_samples_mem))  # Subsample
        K = len(y0[::ss])

        data = {
            "N": N,
            "M": M,
            "K": K,
            "beta": beta[::ss],
            "y0": y0[::ss],
            "sigma": sigma[::ss],
            "X": X,
            "nu": nu[::ss]
        }
        fit_kwargs = stan_fitting_kwargs
        fit_kwargs["iter_sampling"] = 1
        fit_kwargs["data"] = data
        fit_kwargs["fixed_param"] = True

        predictions = self.predict_model.sample(**fit_kwargs)
        y_samples = self.extract_ary("y", predictions)[0, ...]
        y_hat = self.extract_ary("y_hat", predictions).ravel()

        if self.normalize:
            y_samples = np.vstack(
                [self._y_ss.inverse_transform(y_s) for y_s in y_samples])
            y_hat = self._y_ss.inverse_transform(y_hat)
        return y_hat, y_samples

    def _get_param_posterior(self):
        if self._fit_results is None:
            raise NotFittedError("Model isn't fit!")
        df = self.get_results()

        M = sum(c[:4] == "beta" for c in df.columns)

        y0 = df.loc[:, "y0"].to_numpy()
        beta = df.loc[:, [f"beta[{j}]" for j in range(1, M + 1)]].to_numpy()
        sigma = df.loc[:, "sigma"].to_numpy()
        nu = df.loc[:, "nu"].to_numpy()

        return y0, beta, sigma, nu

    def fit(self, X, y, sample_weight=None, **stan_fitting_kwargs):
        """
        "Fit" the model, that is, sample from the posterior.

        params:
            X (n_examples, m_features): Regressors
            y (n_examples): The targets
            sample_weight: NotImplemented
            stan_fitting_kwargs: To be passed to pystan's .sampling method
        """
        if sample_weight is not None:
            raise NotImplementedError("sampling weighting is not implemented.")
        N, M = X.shape

        if self.normalize:
            y = self._y_ss.fit_transform(y)
            X = self._X_ss.fit_transform(X)

        y = y.ravel()
        data = {"N": N, "M": M, "X": X, "y": y}

        fit_kwargs = self._setup_predict_kwargs(data, stan_fitting_kwargs)
        self._fit_results = self.stan_model.sample(**fit_kwargs)
        print(self._fit_results.summary())
        print(self._fit_results.diagnose())
        print(self._fit_results)
        return

    def predict(self, X, ret_posterior=False, **stan_fitting_kwargs):
        """
        Produce samples from the predictive distribution.  This can be
        used for either prior predictive checks or for posterior
        predictions.

        params:
            X (n_examples, m_features): Regressors
            ret_posterior: Whether or not to return all the
                posterior samples.  If false, we only return the
                posterior mean, which is dramatically faster.
            stan_fitting_kwargs: kwargs for pystan's sampling method.

        returns:
            y_hat (n_examples), y_samples (k_samples, n_examples) -- (if
                ret_posterior=True)
            y_hat (n_examples) -- (otherwise)
        """
        y0, beta, _, _ = self._get_param_posterior()
        y0_mean = np.mean(y0)
        beta_mean = np.mean(beta, axis=0)
        if self.normalize:
            y_hat = y0_mean + self._X_ss.transform(X) @ beta_mean
            y_hat = self._y_ss.inverse_transform(y_hat)
        else:
            y_hat = y0_mean + X @ beta_mean

        if ret_posterior:
            y_hat, y_samples = self._posterior(X, **stan_fitting_kwargs)
            return y_hat, y_samples
        else:
            return y_hat

    def plot_posterior_params(self, show=False):
        """
        A helper method to plot the posterior parameter distribution.
        Will raise an error if .fit hasn't been called.
        """
        param_df = self.get_results()
        M = sum([c[:4] == "beta" for c in param_df.columns])
        col_names = (["y0", "sigma", "nu"] +
                     [f"beta[{j}]" for j in range(1, M + 1)])
        var_names = (["$y_0$", "$\\sigma$", "$\\mathsf{log}_{10}(\\nu)$"] +
                     ["$\\beta_{{{}}}$".format(j) for j in range(1, M + 1)])

        param_df.loc[:, "nu"] = np.log10(param_df.loc[:, "nu"])
        param_df = param_df.rename(
            {frm: to
             for frm, to in zip(col_names, var_names)}, axis=1)

        param_df = param_df.loc[:, var_names]

        fig, ax = plt.subplots(1, 1)
        ax.set_title("Parameter Posterior Marginals: "
                     "$y \\sim \\mathcal{T}(\\nu, y_0 + X\\beta, \\sigma)$")
        sns.boxplot(data=param_df.melt(value_name="Posterior Samples",
                                       var_name="Parameter"),
                    x="Parameter",
                    y="Posterior Samples",
                    ax=ax)
        if show:
            plt.show()
        return fig, ax
Beispiel #56
0
    # https://www.programcreek.com/python/example/99828/xgboost.DMatrix
    #xgb_model = xgb.train(xgb_params,
    #                      dtrain=xgb.DMatrix(xtrain, ytrain),
    #                      evals=(xgb.DMatrix(xtest, ytest),"Valid")),
    #                      xgb_model = xgb_estimator)

    est = DecisionTreeRegressor()
    gs  = GridSearchCV(est, cv=10, param_grid=hyper_params, verbose=2, n_jobs=n_jobs, scoring='r2')

    t0 = time.time()
    gs.fit(x_train, y_train)
    runtime = time.time() - t0
    print("Complexity and bandwidth selected and model fitted in %.6f s" % runtime)

    train_score_mse = mean_squared_error(sc_y.inverse_transform(y_train), sc_y.inverse_transform(gs.predict(x_train)))
    train_score_mae = mean_absolute_error(sc_y.inverse_transform(y_train),sc_y.inverse_transform(gs.predict(x_train)))
    train_score_evs = explained_variance_score(sc_y.inverse_transform(y_train), sc_y.inverse_transform(gs.predict(x_train)))
    #train_score_me  = max_error(sc_y.inverse_transform(y_train), sc_y.inverse_transform(gs.predict(x_train)))
    #train_score_msle = mean_squared_log_error(sc_y.inverse_transform(y_train), sc_y.inverse_transform(gs.predict(x_train)))

    test_score_mse = mean_squared_error(sc_y.inverse_transform(y_test), sc_y.inverse_transform(gs.predict(x_test)))
    test_score_mae = mean_absolute_error(sc_y.inverse_transform(y_test), sc_y.inverse_transform(gs.predict(x_test)))
    test_score_evs = explained_variance_score(sc_y.inverse_transform(y_test), sc_y.inverse_transform(gs.predict(x_test)))
    #test_score_me  = max_error(sc_y.inverse_transform(y_test), sc_y.inverse_transform(gs.predict(x_test)))
    #test_score_msle = mean_squared_log_error(sc_y.inverse_transform(y_test), sc_y.inverse_transform(gs.predict(x_test)))
    test_score_r2  = r2_score(sc_y.inverse_transform(y_test), sc_y.inverse_transform(gs.predict(x_test)))

    print("The model performance for testing set")
    print("--------------------------------------")
    print('MAE is {}'.format(test_score_mae))
	X_std = sc_x.fit_transform(X)
	y_std = sc_y.fit_transform(y)
	#y_std = sc_y.fit_transform(y[:, np.newaxis]).flatten()
	lr = LinearRegressionGD()
	lr.fit(X_std, y_std)

	fig = plt.figure()
	sns.reset_orig()
	plt.plot(range(1, lr.n_iter+1), lr.get_cost())
	plt.ylabel('SSE')
	plt.xlabel('Epoch')
	fig.savefig('linearRegressionGD.pdf')

	fig = plt.figure()
	lin_regplot(X_std, y_std, lr)
	plt.xlabel('Average number of rooms [RM] (standardized)')
	plt.ylabel('Price in $1000s [MEDV] (standardized)')
	fig.savefig('regression.pdf')

	# inverse transform
	num_rooms_std = sc_x.transform([[5.0]])
	price_std = lr.predict(num_rooms_std)
	print('price in $1000: %.3f' % (sc_y.inverse_transform(price_std)[0][0]))

	# estimating coefficient of regression model via scikit-learn
	from sklearn.linear_model import LinearRegression
	slr = LinearRegression()
	slr.fit(X, y)
	print('Slope: %.3f' % slr.coef_[0])
	print('Intercept: %.3f' % slr.intercept_)
Beispiel #58
0
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y)

# Fitting SVR to the dataset
from sklearn.svm import SVR
# kernel='rbf' is already the default; written out explicitly as a reminder
regressor = SVR(kernel='rbf')
regressor.fit(X, y)

# Predicting a new result
# the model was trained on scaled data, so scale the query and map the
# prediction back to the original salary scale
y_pred = regressor.predict(sc_X.transform(np.array([[6.5]])))
y_pred = sc_y.inverse_transform(y_pred)

# Visualising the SVR results
plt.scatter(X, y, color='red')
plt.plot(X, regressor.predict(X), color='blue')
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

# Visualising the SVR results (for higher resolution and smoother curve)
X_grid = np.arange(
    min(X), max(X), 0.01
)  # choice of 0.01 instead of 0.1 step because the data is feature scaled
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(X, y, color='red')
Beispiel #59
0
data_thr['preds'] = pd.Series(preds).astype("category")

color_key = ["red", "blue", "yellow", "grey", "black", "purple", "pink",
             "brown", "green", "orange"]  # Spectral9
color_key = color_key[:len(set(preds))+1]

covs = np.array([np.array(gmm.distributions[m].parameters[1])
                 for m in range(len(gmm.distributions))])
means = np.array([np.array(gmm.distributions[m].parameters[0])
                  for m in range(len(gmm.distributions))])

# transform the covariances back to the scale of the non-standardized data:
covs = np.array([np.dot(np.diag(np.sqrt(scaler.var_)),
                        np.dot(covs[j], np.diag(np.sqrt(scaler.var_))))
                 for j in range(covs.shape[0])])
means = np.array([scaler.inverse_transform(means[j].reshape(1, -1)).T
                  for j in range(means.shape[0])])
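
# Why the back-transform above works (a minimal self-contained check, separate
# from the data in this script): if Z = (X - mu) / s columnwise, then
# X = Z * s + mu and Cov(X) = diag(s) @ Cov(Z) @ diag(s), with
# s = sqrt(scaler.var_) (identical to scaler.scale_).
from sklearn.preprocessing import StandardScaler as _StdScaler

_rng = np.random.RandomState(0)
_X = _rng.randn(500, 3) * [2.0, 5.0, 0.5] + [1.0, -3.0, 10.0]
_sc = _StdScaler().fit(_X)
_S = np.diag(np.sqrt(_sc.var_))
_back = _S @ np.cov(_sc.transform(_X), rowvar=False) @ _S
assert np.allclose(_back, np.cov(_X, rowvar=False))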


# # uncomment  to show interactive probas:
# p = plot_probas(data_thr, probs)
# plt.show()

# p = interactive_img_ds(data_thr, 'rateCA', 'rate')
# # waiting for InteractiveImage -> html


# # pair plots with predicted classes and ellipses:
# p = scatter_matrix(data_thr, spread=False, covs=covs, means=means,
#                    color_key=color_key)
# html = file_html(p, CDN, "pomegranate weighted gmm with 3 components")
Beispiel #60
0
def clustes_ert():

    train_x= pd.read_csv(params.SAMPLE_PATH + 'train_x.csv')
    train_y = pd.read_csv(params.SAMPLE_PATH + 'train_y.csv')
    validation_x= pd.read_csv(params.SAMPLE_PATH + 'validation_x.csv')
    validation_y = pd.read_csv(params.SAMPLE_PATH + 'validation_y.csv')
    predict_x = pd.read_csv(params.SAMPLE_PATH + 'predict_x.csv')

    # standardize the data
    ss_x = StandardScaler()
    ss_y = StandardScaler()
    train_x = ss_x.fit_transform(train_x)
    train_y = ss_y.fit_transform(train_y)
    train_x = pd.DataFrame(train_x)
    train_y = pd.DataFrame(train_y)
    validation_x = pd.DataFrame(ss_x.transform(validation_x))
    validation_y = pd.DataFrame(ss_y.transform(validation_y))
    predict_x = pd.DataFrame(ss_x.transform(predict_x))
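    # the validation and prediction sets are transformed with the scalers
    # fitted on the training data, so every split shares the training scale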

    clusters_label = shop_clusters()

    train_x['clusters_label'] = clusters_label
    train_y['clusters_label'] = clusters_label
    validation_x['clusters_label'] = clusters_label
    validation_y['clusters_label'] = clusters_label
    predict_x['clusters_label'] = clusters_label
    validation_x['iid'] = pd.Series(np.arange(1, 2001))
    predict_x['iid'] = pd.Series(np.arange(1, 2001))

    #train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.25, random_state=33)
    result_validation = []
    result_predict = []
    for i in range(4):
        cluster_x = train_x[train_x['clusters_label'] == i]
        cluster_y = train_y[train_y['clusters_label']==i]
        cluster_x = cluster_x.drop('clusters_label', axis=1)
        cluster_y = cluster_y.drop('clusters_label', axis=1)

        x_validation = validation_x[validation_x['clusters_label'] == i]
        x_validation_iid = x_validation['iid']
        x_validation = x_validation.drop(['clusters_label','iid'], axis=1)
        y_validation = validation_y[validation_y['clusters_label'] == i]
        y_validation = y_validation.drop('clusters_label', axis=1)

        x_predict = predict_x[predict_x['clusters_label'] == i]
        x_predict_iid = x_predict['iid']
        x_predict = x_predict.drop(['clusters_label','iid'], axis=1)

        y_validation_predict = ert(cluster_x.values, cluster_y.values, x_validation.values)
        y_validation_predict = pd.DataFrame(y_validation_predict)

        y_predict = ert(x_validation.values, y_validation.values, x_predict.values)
        y_predict = pd.DataFrame(y_predict)

        y_validation_predict['iid'] = np.array(x_validation_iid)
        y_predict['iid'] = np.array(x_predict_iid)

        result_validation.append(y_validation_predict)
        result_predict.append(y_predict)

    result_validation = pd.concat(result_validation)
    result_validation.index = np.arange(result_validation.shape[0])
    # re-order the rows by iid
    result_validation = result_validation.sort_values(by='iid',ascending=True)
    result_validation = result_validation.drop('iid', axis=1)
    result_validation = (ss_y.inverse_transform(result_validation)).astype(int)
    # evaluate model performance
    validation_y = validation_y.drop('clusters_label', axis=1)
    print("offline error is:", model_value.value_mode(result_validation, validation_y))  # offline (validation) error

    result_predict = pd.concat(result_predict)
    result_predict.index = np.arange(result_predict.shape[0])
    # re-order the rows by iid
    result_predict = result_predict.sort_values(by='iid',ascending=True)
    result_predict = result_predict.drop('iid', axis=1)
    result_predict = pd.DataFrame((ss_y.inverse_transform(result_predict)).astype(int))


    predict = pd.DataFrame(np.arange(1, result_predict.shape[0]+1), columns=['iid'])
    predict = predict.join(result_predict)
    predict = pd.merge(predict, predict, on='iid')

    if (not os.path.exists(params.OUTPUT_PATH)):
        os.mkdir(params.OUTPUT_PATH)
    predict.to_csv(params.OUTPUT_PATH + 'result_clusters_and_ert_by_three_weeks.csv', index=False, header=False)

    print(predict)