Example #1
def PLSCrossValidation(n_components, trainSet, validationSet):
  pls = PLSRegression(n_components=n_components)
  pls.fit(trainSet[predictorList], trainSet['Apps'])
  predictPls = pls.predict(validationSet[predictorList])
  different = predictPls.flatten() - validationSet['Apps']  # flatten() gives a 1-D array; .flat is only an iterator
  error_rate = np.mean(different ** 2)
  return error_rate
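A minimal sweep sketch using the helper above, assuming trainSet, validationSet, and predictorList are already defined by the surrounding script (the 'Apps' column suggests ISLR College-style data); the component range is illustrative:

errors = [PLSCrossValidation(m, trainSet, validationSet) for m in range(1, 11)]
best_m = 1 + int(np.argmin(errors))  # component count with the lowest validation MSE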
Example #2
	def fit(self, predictors, predictands, locations, log=False, **kwargs):

		self.locations = locations
		self.models = []
		self.n = predictors['n']

		id = 0
		for location in locations:
			X = extract_n_by_n(predictors, location, **kwargs)
			Y = predictands[:,id]

			if log:
				Y = np.log(Y)

			#pca = PCA(n_components='mle', whiten=True)
			model = PLSRegression(n_components=2)
			
			model = model.fit(X,Y)
			#components = pca.components_
			#pca.components_ = components
			
			self.models.append(model)
			print "pls: ", location, model.score(X, Y), model.x_loadings_.shape, np.argmax(model.x_loadings_, axis=0)

			id += 1
Example #3
def build_model(X, y):
	# gbr = GradientBoostingRegressor(learning_rate= 0.03, n_estimators=2000, max_depth=8, subsample=0.9)
	# rf = RandomForestRegressor(n_estimators=200)
	# lr = LinearRegression(fit_intercept=True)
	# knr = KNeighborsRegressor(n_neighbors=10, weights='uniform')
	# svr = SVR(C=5.0, kernel='linear')
	pls = PLSRegression(n_components=35)
	return pls.fit(X, y)
Example #4
def Training(df,seed, yratio, xratio, index = 1):
	snp_matrix = np.array(df.values)
	xdim, ydim = snp_matrix.shape

	ydimlist = list(range(0, ydim))  # materialize as lists so random.shuffle works on Python 3
	xdimlist = list(range(0, xdim))

	random.seed(seed)
	random.shuffle(ydimlist) # shuffle the individuals
	random.shuffle(xdimlist) # shuffle the SNPs	
	accuracy = 0

	snp_matrix_shuffle = np.copy(snp_matrix[:, ydimlist])
	snp_matrix_shuffle = np.copy(snp_matrix_shuffle[xdimlist, :])  # shuffle rows of the already column-shuffled copy, not the original
	snp_matrix_train = snp_matrix_shuffle[:,0:int(ydim*yratio)]
	snp_matrix_test = snp_matrix_shuffle[:,int(ydim*yratio):]

	snp_matrix_train_x = snp_matrix_train[0:int(xdim*xratio),:]
	snp_matrix_test_x = snp_matrix_test[0:int(xdim*xratio),:]

	for i in range(int(xdim*xratio), xdim):
		snp_matrix_train_y = snp_matrix_train[i,:]
		snp_matrix_test_y = snp_matrix_test[i,:]
		if index != 7:
			if index == 1:
				clf = AdaBoostClassifier(n_estimators= 100)
			elif index == 2:
				clf = RandomForestClassifier(n_estimators=100)
			elif index == 3:
				clf = linear_model.LogisticRegression(C=1e5)
			elif index == 4:
				clf = svm.SVC(kernel = 'rbf')
			elif index == 5:
				clf = svm.SVC(kernel = 'poly')
			else:
				clf = svm.SVC(kernel = 'linear')
			clf = clf.fit(snp_matrix_train_x.T, snp_matrix_train_y)
			Y_pred = clf.predict(snp_matrix_test_x.T)
			prediction = snp_matrix_test_y - Y_pred
			wrong = np.count_nonzero(prediction)
			tmp = 1 - (wrong + 0.0) / len(prediction)
			print(tmp)
			accuracy += tmp

	accuracy = accuracy / (xdim - int(xdim*xratio))

	if index == 7:
		pls2 = PLSRegression(n_components = 50, scale=False, max_iter=1000)
		snp_matrix_train_y = snp_matrix_train[int(xdim*xratio):,:]
		pls2.fit(snp_matrix_train_x.T,snp_matrix_train_y.T)
		snp_matrix_test_x = snp_matrix_test[0:int(xdim*xratio),:]
		snp_matrix_test_y = snp_matrix_test[int(xdim*xratio):,:]		
		Y_pred = transform(pls2.predict(snp_matrix_test_x.T))
		prediction = snp_matrix_test_y - Y_pred.T
		xdim, ydim = prediction.shape
		wrong = np.count_nonzero(prediction)
		accuracy = 1 - wrong / (xdim * ydim + 0.0)
	return accuracy
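A hedged call sketch for Training above, assuming df is a SNP DataFrame (individuals in columns) and that the transform() helper used by the index == 7 branch is defined elsewhere in the original script:

accuracy = Training(df, seed=42, yratio=0.8, xratio=0.9, index=7)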
Example #5
def fit(predictors, predictands, log=False, **kwargs):
	
	model = PLSRegression(n_components=2)
	try:
		model.fit(predictors, predictands)
	except Exception:  # catch fitting failures and signal them with None
		return None

	return model
Example #6
def get_correlations(param, spec, wave):
    '''Returns correlations between spec and params by wavelengths'''
    # using PLS
    pls = PLSRegression(10)
    pls.fit(spec, param)
    
    # get correlations (coef_ rescaled by the per-feature standard deviations;
    # x_std_ / y_std_ exist only in older scikit-learn releases)
    nparam = param.shape[1]
    cor = pls.coef_ * np.asarray([pls.x_std_] * nparam).T
    cor /= np.tile(pls.y_std_, (cor.shape[0], 1))

    return cor
Example #7
class PLSPredictor:
    def __init__(self):
        self.pls2 = PLSRegression(n_components=2,
                                  scale=True,
                                  max_iter=500,
                                  tol=1e-06,
                                  copy=True)

    def predict(self, values):
        return self.pls2.predict(values)  # return the predictions instead of discarding them

    def train(self, measured_values, screen_points):
        self.pls2.fit(measured_values, screen_points)
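A short usage sketch for the wrapper above; measured_values, screen_points, and new_measurements are placeholder arrays, not names from the original:

predictor = PLSPredictor()
predictor.train(measured_values, screen_points)  # fit screen coordinates to sensor readings
estimated = predictor.predict(new_measurements)  # works once predict() returns its result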
Example #8
def do_pls(X, Y):
    pls2 = PLSRegression(n_components=2)
    pls2.fit(X,Y)
    out = pls2.transform(X)
    print(out)
    print(out.shape)

    plt.title("PLS2")
    plt.xlabel("PL1")
    plt.ylabel("PL2")
    plt.grid()
    plt.scatter(out[:, 0], out[:, 1], c=Y, cmap='viridis')
    plt.savefig('pls.png', dpi=125)
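A hedged driver for do_pls, using synthetic data so the scatter is reproducible; all names below are illustrative:

import numpy as np
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 5))                        # 100 samples, 5 features
Y = 2.0 * X[:, 0] + rng.normal(scale=0.1, size=100)  # Y driven mainly by the first feature
do_pls(X, Y)                                         # prints the (100, 2) score matrix and writes pls.png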
Example #9
def pls_approach():
    from sklearn.cross_decomposition import PLSRegression

    (X, Y), cities = pull_xy_data()

    pls = PLSRegression()
    pls.fit(X, Y)

    plsX, plsY = pls.transform(X, Y)

    plot(plsX, cities, ["Lat01", "Lat02", "Lat03"], ellipse_sigma=1)

    return "OK What Now?"
Example #10
    def __one_pls(self, cat):

        np.seterr(all='raise')

        lcat = np.zeros(self.train_set['labels'].size)

        lcat[self.train_set['labels'] != cat] = -1
        lcat[self.train_set['labels'] == cat] = +1

        pls = PLSRegression(n_components=2, scale=False)

        pls.fit(self.train_set['data'], lcat)

        return pls
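__one_pls above fits one +1/-1 PLS scorer per class, a common PLS-DA one-vs-rest setup. A hedged sketch of combining such per-class models at prediction time, assuming models maps each label to its fitted PLSRegression (this helper is not from the original):

import numpy as np

def predict_one_vs_rest(models, X):
    labels = sorted(models)
    scores = np.column_stack([models[c].predict(X).ravel() for c in labels])
    return np.asarray(labels)[np.argmax(scores, axis=1)]  # the largest +1-vs--1 response wins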
Example #11
def fit_base_model(classifiers, fully, dummyY, trainx, testx):
    """ Takes a list of classifiers and/or PLS regression and
    does dimension reduction by returning the predictions of the classifiers
    or first two scores of the PLS regression on bootstrapped subsamples of
    the data."""

    trainProbs = []
    testProbs = []

    iterations = 0
    for clf in classifiers:
        for i in range(clf[1]):
            iterations += 1
            print(iterations)
            print(clf[0])
            train_rows = np.random.choice(trainx.shape[0],
                                          round(trainx.shape[0] * base_prop),  # base_prop: subsample fraction defined at module scope
                                          True)
            oob_rows = list(set(range(trainx.shape[0])) - set(train_rows))
            print(len(train_rows))
            print(len(oob_rows))
            x = trainx[train_rows, :]
            if clf[0] == 'PLS':
                y = dummyY[train_rows, :]
                mod = PLSRegression().fit(x, y)
                trainscores = mod.transform(trainx)
                testscores = mod.transform(testx)
                trainProbs.append(trainscores[:, 0])
                trainProbs.append(trainscores[:, 1])
                testProbs.append(testscores[:, 0])
                testProbs.append(testscores[:, 1])
            else:
                y = fully[train_rows]
                print('\t Fitting model...')
                mod = clf[0].fit(x, y)
                print('\t Predicting training results...')
                tpreds = mod.predict_proba(trainx)
                trainProbs.append(list(tpreds[:, 1]))
                print('\t Predicting test results...')
                testProbs.append(list(mod.predict_proba(testx)[:, 1]))
                print('\t OOB score: ' + str(log_loss(fully[oob_rows],
                                                      tpreds[oob_rows, :])))
    return trainProbs, testProbs
Example #12
def pls_regr(x, y):
    from sklearn.cross_decomposition import PLSRegression
    n = len(x[0])
    if n < 2:
        raise TypeError
    score = -999999999999
    pls = None
    '''
    for i in range(3, n):
        pls2 = PLSRegression(n_components=i)
        pls2.fit(x,y)
        cscore = pls2.score(x, y)
        #print i, cscore 
        if cscore > score:
            pls = pls2
            score = cscore
    '''
    pls = PLSRegression(n_components=5)
    pls.fit(x,y)
    return pls
Example #13
def train_PLSR(x_filename, y_filename, model_filename, n):
    """
    Train a PLSR model and save it to the model_filename.
    X and Y matrices are read from x_filename and y_filename.
    The no. of PLSR components is given by n. 
    """
    X = loadMatrix(x_filename)[0].todense()
    Y = loadMatrix(y_filename)[0].todense()
    if X.shape[0] != Y.shape[0]:
        sys.stderr.write("X and Y must have equal number of rows!\n")
        raise ValueError
    sys.stderr.write("Learning PLSR...")
    startTime = time.time()
    pls2 = PLSRegression(copy=True, max_iter=10000, n_components=n, scale=True, tol=1e-06)
    pls2.fit(X, Y)  
    with open(model_filename, 'wb') as model:  # binary mode: pickle.dump writes bytes
        pickle.dump(pls2, model, 1)
    endTime = time.time()
    sys.stderr.write(" took %ss\n" % str(round(endTime - startTime, 2)))
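A hedged counterpart to train_PLSR that reloads the pickled model and applies it; the function name and the dense-matrix assumption are illustrative, with loadMatrix taken from the same context as above:

def predict_PLSR(model_filename, x_filename):
    X = loadMatrix(x_filename)[0].todense()
    with open(model_filename, 'rb') as model:
        pls2 = pickle.load(model)
    return pls2.predict(X)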
Example #14
def lex_function_learning(class_name, hyper_vec):

	#pls2 = KernelRidge( kernel = "rbf", gamma= 100)
	#pls2 = KernelRidge( )
	pls2 = PLSRegression(n_components=50, max_iter=5000)

	X = extract_postive_features(train_dataset[class_name][0], train_dataset[class_name][1])

	Y = []

	for hypo_vec in X:
		sub = hyper_vec - hypo_vec
		Y.append(sub)  # target = difference vector (hypernym_vector - hyponym_vector)
		#Y.append(hyper_vec)  # target = hypernym vector

	pls2.fit(X, Y)
	train_acc = pls2.score(X, Y)
	print("class =", class_name, "train len =", len(X))

	return pls2, train_acc, len(X)
Example #15
def reduce_PLS(dataframe):
    PLS_file = "data/pls_structure.pickle"
    selectedcolumn = [x for x in dataframe.columns if x not in ["id", "click", "device_id", "device_ip"]]
    X = np.array(dataframe[selectedcolumn])
    y = np.array(dataframe["click"])
    if os.path.exists(PLS_file):
        stand_PLS = pickle.load(open(PLS_file, 'rb'))
        print("PLS structure is loaded.")
    else:
        stand_PLS = PLSRegression(n_components=10, scale=True)
        stand_PLS.fit(X, y[:, np.newaxis])
        stand_PLS.y_scores_ = None  # drop per-sample scores to keep the pickle small
        stand_PLS.x_scores_ = None
        pickle.dump(stand_PLS, open(PLS_file, "wb"))
        print("PLS transform structure is stored.")
    T = stand_PLS.transform(X)
    print("PLS transformation is performed.")
    return T
Example #16
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
X_atual = sc_X.transform(X_atual)

sc_y = StandardScaler()
y = y.reshape(-1, 1)
sc_y.fit(y)
y_train = y_train.reshape(-1, 1)
y_train = sc_y.transform(y_train)
y_test = y_test.reshape(-1, 1)
y_test = sc_y.transform(y_test)

# Linear regression via PLS
from sklearn.cross_decomposition import PLSRegression
pls2 = PLSRegression(n_components=3, scale=False)
pls2.fit(X_train, y_train)

# Predicting the Test set results
y_pred = pls2.predict(X_test)
y_pred_atual = pls2.predict(X_atual)

# error metrics
mean_squared_error(y_test, y_pred)
r2_score(y_test, y_pred)

df_atual = df[df['atletas.rodada_id'] == CURRENT_ROUND]
df_atual = df_atual[['atletas.apelido', 'atletas.clube.id.full.name', 'atletas.nome', 'atletas.posicao_id', \
                             'atletas.rodada_id', 'atletas.preco_num','atletas.pontos_num_sum_last5', 'atletas.media_num' ]]
df_atual['pred_score'] = y_pred_atual
df_atual.to_csv('predictions/predict-PLS.csv', encoding='utf-8')
# plt.show()

Example #17
#Partial Least Squares Regression
from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import scale

X_train_scaled = scale(X_train)
X_test_scaled = scale(X_test)

#Performing Cross_Validation for PLS
mse = []
n = len(X_train_scaled)
kf_10 = cross_validation.KFold(n, n_folds=10, shuffle=True, random_state=0)  # pre-0.18 scikit-learn API; newer releases use sklearn.model_selection

for i in np.arange(1,17):
    plsregr = PLSRegression(n_components=i, scale=False)
    plsregr.fit(X_train_scaled,y_train)
    score = -1*cross_validation.cross_val_score(plsregr, X_train_scaled, y_train, cv=kf_10, scoring='mean_squared_error').mean()
    mse.append(score)

plt.plot(np.arange(1,17), np.array(mse), '-v')
plt.title("PLS: MSE vs. Principal Components")
plt.xlabel('Number of principal components in PLS regression')
plt.ylabel('MSE')
plt.xlim((-0.2, 17.2))

#Based on the plot, 12 principal components minimized MSE
plsregr_test = PLSRegression(n_components=12, scale=False)
plsregr_test.fit(X_train_scaled, y_train)
MSE_PLS = np.mean((plsregr_test.predict(X_test_scaled) - y_test) ** 2)
# print "Mean Squared Error: ", MSE_PLS
Example #18
def bestpls(vipMatrix, X, Y, V):

    ###########################
    #bestR2 = -10000
    #lv_best = 1
    #position = 1
    ###########################
    bestR2 = vipMatrix[0][1]
    lv_best = vipMatrix[0][3]
    position = 0
    ###########################

    #for i in range (len(vipMatrix)):
    #    print(vipMatrix[i])

    for entries in range (len(vipMatrix)):
        #print vipMatrix[entries][1], "=?=", bestR2 #############
        if vipMatrix[entries][1] > bestR2:   
            position = entries
            bestR2 = vipMatrix[entries][1]
            lv_best = vipMatrix[entries][3]

    #################################################################################################qq
    variables = []    
    for i in range (1, position): # not position + 1, as the vipMatrix[position] holds the next variable to be removed
        variables.append(vipMatrix[i][0])
    #print "VAR TO BE REMOVED: ", variables
    V_new_Indices = []
    for i in variables: # removed variable names in random order
        V_new_Indices.append(V.index(i))

    #if V == sorted(V):
    #    print "\nV ok!\n"

    # keep names == separate
    V_new = deepcopy(V)
    for i in variables:
        V_new.remove(i)
        
    X_new = []
    for i in range (len(X)):
        X_new.append([])

    variables_sent = [] ####
    for i in range (len(X)):
        for j in range (len(V)):
            if j not in V_new_Indices:
                #if V[j] not in variables_sent: ####
                #    variables_sent.append(V[j])####
                X_new[i].append(X[i][j])

    # epic test
    if not V_new == sorted(V_new):
        return base64.b64encode("tobulo"), [], [], 0
    #else:
    #    print "v_new ok!"


    #validity tests
    #for i in range (len (variables_sent)):
    #    if variables_sent[i] == V_new[i]:
    #        print "ok", i
    #print "var: ", len(V), "selected: ", len(V_new), "data (var) init length: ", len(X[0]), "data (var) now length: ", len(X_new[0])
    """ 
    # PREVIOUS
    variables = []    
    for i in range (1, position):
        variables.append(vipMatrix[i][0])

    V_new = deepcopy(V)
    for i in variables:
        V_new.remove(i) ################ remove by index??? CHECK!!!!

    X_new = []
    for i in range (len(X)):
        X_new.append([])

    for i in range (len(X)):
        for j in range (len(V_new)): ####### HERE ALSO
            X_new[i].append(X[i][j])
    """
    #################################################################################################qq
    #print V_new, "OOOO\n\n" #var names == cool
    #print "\n\nNumber of variables ", len(V_new), " and latent: ", lv_best
    #best_pls = PLSCanonical(n_components = lv_best)
    best_pls = PLSRegression(n_components = lv_best)
    best_pls.fit(X_new, Y)

    saveas = pickle.dumps(best_pls)
    encoded = base64.b64encode(saveas)	
	
    return encoded, X_new, V_new, lv_best
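A hedged round-trip sketch for the encoded model returned above, assuming the selection succeeded (the failure branch returns a sentinel string instead of a pickled model):

encoded, X_new, V_new, lv_best = bestpls(vipMatrix, X, Y, V)
best_pls = pickle.loads(base64.b64decode(encoded))  # invert b64encode(pickle.dumps(...))
y_pred = best_pls.predict(X_new)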
Example #19
'''
General Linear Model -- Elastic Net
'''
clf = linear_model.ElasticNet(alpha=0.2, l1_ratio=0.01)
clf.fit(x_scaled, y_scaled)
print(clf.coef_)

yvalid_scaled = clf.predict(xvalid_scaled)

err1= MAPE(y, scalery.inverse_transform(clf.predict(x_scaled)).reshape(-1,1))
err = MAPE(yvalid, scalery.inverse_transform(yvalid_scaled).reshape(-1,1))

'''
Partial Least Squares Regression
'''
from sklearn.cross_decomposition import PLSRegression
pls = PLSRegression(n_components=20)
pls.fit(x_scaled, y_scaled)
print(pls.coef_)

yvalid_scaled = pls.predict(xvalid_scaled)

err1= MAPE(y, scalery.inverse_transform(pls.predict(x_scaled)).reshape(-1,1))
err = MAPE(yvalid, scalery.inverse_transform(yvalid_scaled).reshape(-1,1))

from sklearn.decomposition import PCA
reduced_data = PCA(n_components=2).fit_transform(xtrain_minmax)

pca = PCA(n_components=2)
pca.fit(xtrain_minmax)
print(pca.explained_variance_ratio_)
Example #20
def partial_least_squares(X_train, y_train, X_pred, store_settings, mod_params=None, metric=None):

    lr = PLSRegression(n_components=1, max_iter=1000, tol=1e-04)
    return lr.fit(X_train, y_train).predict(X_pred)
Example #21
def plot_pcr_vs_pls():
    rng = np.random.RandomState(0)
    n_samples = 500
    cov = [[3, 3], [3, 4]]
    X = rng.multivariate_normal(mean=[0, 0], cov=cov, size=n_samples)
    pca = PCA(n_components=2).fit(X)

    plt.scatter(X[:, 0], X[:, 1], alpha=.3, label='samples')
    for i, (comp,
            var) in enumerate(zip(pca.components_, pca.explained_variance_)):
        comp = comp * var  # scale component by its variance explanation power
        plt.plot([0, comp[0]], [0, comp[1]],
                 label=f"Component {i}",
                 linewidth=5,
                 color=f"C{i + 2}")
    plt.gca().set(aspect='equal',
                  title="2-dimensional dataset with principal components",
                  xlabel='first feature',
                  ylabel='second feature')
    plt.legend()
    plt.show()

    y = X.dot(pca.components_[1]) + rng.normal(size=n_samples) / 2

    fig, axes = plt.subplots(1, 2, figsize=(10, 3))

    axes[0].scatter(X.dot(pca.components_[0]), y, alpha=.3)
    axes[0].set(xlabel='Projected data onto first PCA component', ylabel='y')
    axes[1].scatter(X.dot(pca.components_[1]), y, alpha=.3)
    axes[1].set(xlabel='Projected data onto second PCA component', ylabel='y')
    plt.tight_layout()
    plt.show()

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    pcr = make_pipeline(StandardScaler(), PCA(n_components=1),
                        LinearRegression())
    pcr.fit(X_train, y_train)
    pca = pcr.named_steps['pca']  # retrieve the PCA step of the pipeline

    pls = PLSRegression(n_components=1)
    pls.fit(X_train, y_train)

    fig, axes = plt.subplots(1, 2, figsize=(10, 3))
    axes[0].scatter(pca.transform(X_test),
                    y_test,
                    alpha=.3,
                    label='ground truth')
    axes[0].scatter(pca.transform(X_test),
                    pcr.predict(X_test),
                    alpha=.3,
                    label='predictions')
    axes[0].set(xlabel='Projected data onto first PCA component',
                ylabel='y',
                title='PCR / PCA')
    axes[0].legend()
    axes[1].scatter(pls.transform(X_test),
                    y_test,
                    alpha=.3,
                    label='ground truth')
    axes[1].scatter(pls.transform(X_test),
                    pls.predict(X_test),
                    alpha=.3,
                    label='predictions')
    axes[1].set(xlabel='Projected data onto first PLS component',
                ylabel='y',
                title='PLS')
    axes[1].legend()
    plt.tight_layout()
    plt.show()

    print(f"PCR r-squared {pcr.score(X_test, y_test):.3f}")
    print(f"PLS r-squared {pls.score(X_test, y_test):.3f}")

    pca_2 = make_pipeline(PCA(n_components=2), LinearRegression())
    pca_2.fit(X_train, y_train)
    print(f"PCR r-squared with 2 components {pca_2.score(X_test, y_test):.3f}")
Example #22
fold_number = min(fold_number, len(autoscaled_y_train))

autoscaled_y_train = pd.Series(autoscaled_y_train)
y_test = pd.Series(y_test)
autoscaled_x_train = pd.DataFrame(autoscaled_x_train)
autoscaled_x_test = pd.DataFrame(autoscaled_x_test)

plt.rcParams['font.size'] = 18  # font size for axis labels, tick labels, etc.
for method in regression_methods:
    print(method)
    if method == 'pls':  # Partial Least Squares
        pls_components = np.arange(1, min(np.linalg.matrix_rank(autoscaled_x_train) + 1, max_pls_component_number + 1), 1)
        r2all = list()
        r2cvall = list()
        for pls_component in pls_components:
            pls_model_in_cv = PLSRegression(n_components=pls_component)
            pls_model_in_cv.fit(autoscaled_x_train, autoscaled_y_train)
            calculated_y_in_cv = np.ndarray.flatten(pls_model_in_cv.predict(autoscaled_x_train))
            estimated_y_in_cv = np.ndarray.flatten(
                model_selection.cross_val_predict(pls_model_in_cv, autoscaled_x_train, autoscaled_y_train, cv=fold_number))
    
            """
            plt.figure(figsize=figure.figaspect(1))
            plt.scatter( y, estimated_y_in_cv)
            plt.xlabel("Actual Y")
            plt.ylabel("Calculated Y")
            plt.show()
            """
            r2all.append(float(1 - sum((autoscaled_y_train - calculated_y_in_cv) ** 2) / sum(autoscaled_y_train ** 2)))
            r2cvall.append(float(1 - sum((autoscaled_y_train - estimated_y_in_cv) ** 2) / sum(autoscaled_y_train ** 2)))
        plt.plot(pls_components, r2all, 'bo-')
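A hedged continuation of the sweep above that refits at the component count maximizing the cross-validated r2; the selection rule is illustrative, not from the original:

optimal_component = pls_components[int(np.argmax(r2cvall))]
final_pls = PLSRegression(n_components=int(optimal_component))
final_pls.fit(autoscaled_x_train, autoscaled_y_train)
estimated_y_test = final_pls.predict(autoscaled_x_test)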
Example #23
    for j in range(len(random_split)):
        test = random_split[j]
        training_list = random_split[0:j] + random_split[j +
                                                         1:len(random_split)]
        training = pd.concat(training_list)

        X_train = training.drop(
            training.columns[dsloader.RESPONSE_COLUMN_INDEX_NO_PCA], axis=1)
        Y_train = training.iloc[:,
                                dsloader.RESPONSE_COLUMN_INDEX_NO_PCA].values
        X_test = test.drop(test.columns[dsloader.RESPONSE_COLUMN_INDEX_NO_PCA],
                           axis=1)
        Y_test = label_encoder.fit_transform(
            test.iloc[:, dsloader.RESPONSE_COLUMN_INDEX_NO_PCA].apply(
                transform_numeric_to_y))
        model = PLSRegression(n_components=f_num_candidate)
        model.fit(X_train, Y_train)
        predictions = model.predict(X_test)
        predictions = label_encoder.fit_transform(
            np.array([
                transform_numeric_to_y(prediction)
                for prediction in predictions
            ]))

        current_auc += roc_auc_score(Y_test, predictions)

    current_auc /= len(random_split)
    print("step {}: {} - {}".format(i + 1, f_num_candidate, current_auc))
    results_df.iloc[0, i] = f_num_candidate
    results_df.iloc[1, i] = current_auc
    i += 1
Example #24
class VM_Process2_시뮬레이터:
    metric = 0

    def __init__(self, A, d, C, F, p_lambda, p_VM, p_ACT, seed):
        self.pls = PLSRegression(n_components=6,
                                 scale=False,
                                 max_iter=50000,
                                 copy=True)
        np.random.seed(seed)
        self.A = A
        self.d = d
        self.C = C
        self.F = F
        self.p_lambda = p_lambda
        self.p_VM = p_VM
        self.p_ACT = p_ACT
        self.real_ACT = []  # actual values reflecting Process-1

    def sampling_up(self):
        # u1 = np.random.normal(0.4, np.sqrt(0.2))
        # u2 = np.random.normal(0.6, np.sqrt(0.2))
        u1 = np.random.normal(0.2, np.sqrt(0.1))
        u2 = np.random.normal(0.1, np.sqrt(0.05))
        u = np.array([u1, u2])
        return u

    def sampling_vp(self):
        v1 = np.random.normal(-0.4, np.sqrt(0.2))
        v2 = 2 * v1
        v3 = np.random.uniform(0.2, 0.6)
        v4 = 3 * v3
        v5 = np.random.uniform(0, 0.4)

        v = np.array([v1, v2, v3, v4, v5])
        return v

    def sampling_ep(self):
        e1 = np.random.normal(0, np.sqrt(0.05))
        e2 = np.random.normal(0, np.sqrt(0.1))
        e = np.array([e1, e2])
        return e

    def sampling(self,
                 k,
                 uk=np.array([0, 0]),
                 vp=np.array([0, 0, 0, 0, 0]),
                 ep=np.array([0, 0]),
                 p_VM=np.array([0, 0]),
                 p_ACT=np.array([0, 0]),
                 isInit=True):
        u1 = uk[0]
        u2 = uk[1]
        u = uk

        v1 = vp[0]
        v2 = vp[1]
        v3 = vp[2]
        v4 = vp[3]
        v5 = vp[4]

        v = vp
        e = ep

        k1 = k % 150
        k2 = k
        eta_k = np.array([[k1], [k2]])

        if isInit == True:
            e = np.array([0, 0])  # DoE samples actual values, so there is no noise term e
            fp = p_ACT  # DoE uses the Process-1 ACT values
        else:
            fp = p_VM  # VM uses the Process-1 VM values

        psi = np.array([u1, u2, v1, v2, v3, v4, v5, k1, k2])

        if fp is not None:  # if Process-1 inputs are available
            # unclear why this was done; perhaps to derive R2R results, or to learn y as a parameter; needs verification
            if k % 10 == 0:
                f = p_ACT
            else:
                f = p_VM
            if isInit == True:
                f = p_ACT
            psi = np.r_[psi, f]
            # computed for both VM and DoE cases
            y = u.dot(self.A) + v.dot(self.C) + np.sum(
                eta_k * self.d, axis=0) + f.dot(self.F) + e
            if isInit == False:  # compute the actual ACT values separately from the VM values
                #print('f.dot(self.F) : ', f.dot(self.F), 'p_ACT.dot(self.F) : ', p_ACT.dot(self.F))
                temp = u.dot(self.A) + v.dot(self.C) + np.sum(
                    eta_k * self.d, axis=0) + p_ACT.dot(self.F) + e
                self.real_ACT.append(np.array([temp[0], temp[1]]))
        else:  # no Process-1 inputs; use when only Process-2 VM is wanted going forward
            y = u.dot(self.A) + v.dot(self.C) + np.sum(eta_k * self.d,
                                                       axis=0) + e

        rows = np.r_[psi, y]

        idx_end = len(rows)
        idx_start = idx_end - 2
        return idx_start, idx_end, rows

    def pls_update(self, V, Y):
        self.pls.fit(V, Y)
        return self.pls

    def setDoE_Mean(self, DoE_Mean):
        self.DoE_Mean = DoE_Mean

    def getDoE_Mean(self):
        return self.DoE_Mean

    def setPlsWindow(self, PlsWindow):
        self.PlsWindow = PlsWindow

    def getPlsWindow(self):
        return self.PlsWindow

    def DoE_Run(self, lamda_PLS, Z, M, f):
        N = Z * M
        DoE_Queue = []

        for k in range(1, N + 1):  # k = 1, 2, ..., N
            if f is not None:
                fp = f[k - 1, 0:2]
            else:
                fp = None
            idx_start, idx_end, result = self.sampling(k, self.sampling_up(),
                                                       self.sampling_vp(),
                                                       self.sampling_ep(),
                                                       None, fp, True)
            DoE_Queue.append(result)

        initplsWindow = DoE_Queue.copy()
        npPlsWindow = np.array(initplsWindow)

        plsWindow = []

        # Process-1's lamda_PLS weighting arrives already applied, so it must not be applied again here
        for z in np.arange(0, Z):
            if f is not None:
                npPlsWindow[z * M:(z + 1) * M - 1, 0:idx_start -
                            2] = lamda_PLS * npPlsWindow[z * M:(z + 1) * M - 1,
                                                         0:idx_start - 2]
                npPlsWindow[z * M:(z + 1) * M - 1, idx_start -
                            2:idx_start] = self.p_lambda * npPlsWindow[
                                z * M:(z + 1) * M - 1, idx_start - 2:idx_start]
                npPlsWindow[z * M:(z + 1) * M - 1,
                            idx_start:idx_end] = lamda_PLS * (npPlsWindow[
                                z * M:(z + 1) * M - 1, idx_start:idx_end])
            else:
                npPlsWindow[z * M:(z + 1) * M - 1,
                            0:idx_start] = lamda_PLS * npPlsWindow[
                                z * M:(z + 1) * M - 1, 0:idx_start]
                npPlsWindow[z * M:(z + 1) * M - 1,
                            idx_start:idx_end] = lamda_PLS * (npPlsWindow[
                                z * M:(z + 1) * M - 1, idx_start:idx_end])

        for i in range(len(npPlsWindow)):
            plsWindow.append(npPlsWindow[i])

        npDoE_Queue = np.array(plsWindow)
        DoE_Mean = np.mean(npDoE_Queue, axis=0)

        plsModelData = npDoE_Queue - DoE_Mean
        V0 = plsModelData[:, 0:idx_start]
        Y0 = plsModelData[:, idx_start:idx_end]

        pls = self.pls_update(V0, Y0)

        y_prd = pls.predict(V0) + DoE_Mean[idx_start:idx_end]
        y_act = npDoE_Queue[:, idx_start:idx_end]

        #print("Init DoE VM Mean squared error: %.4f" % metrics.mean_squared_error(y_act[:,1:2], y_prd[:,1:2]))
        #print("Init DoE VM r2 score: %.4f" % metrics.r2_score(y_act[:,1:2], y_prd[:,1:2]))
        #print("pls : ", pls.coef_)

        self.setDoE_Mean(DoE_Mean)
        self.setPlsWindow(plsWindow)
        # self.plt_show1(N, y_act[:,0:1], y_prd[:,0:1])

    def VM_Run(self, lamda_PLS, Z, M):
        N = Z * M

        ## V0, Y0 Mean Center
        DoE_Mean = self.getDoE_Mean()
        idx_end = len(DoE_Mean)
        idx_start = idx_end - 2
        meanVz = DoE_Mean[0:idx_start]
        meanYz = DoE_Mean[idx_start:idx_end]

        M_Queue = []
        ez_Queue = []
        mape_Queue = []
        ez_Queue.append([0, 0])
        y_act = []
        y_prd = []
        VM_Output = []
        ACT_Output = []

        plsWindow = self.getPlsWindow()

        for z in np.arange(0, Z):
            for k in np.arange(z * M + 1, ((z + 1) * M) + 1):
                if self.p_VM[k - 1] is not None:
                    idx_start, idx_end, result = self.sampling(
                        k, self.sampling_up(), self.sampling_vp(),
                        self.sampling_ep(), self.p_VM[k - 1],
                        self.p_ACT[k - 1], False)
                else:
                    idx_start, idx_end, result = self.sampling(
                        k, self.sampling_up(), self.sampling_vp(),
                        self.sampling_ep(), None, None, False)
                psiK = result[0:idx_start]
                psiKStar = psiK - meanVz
                y_predK = self.pls.predict(psiKStar.reshape(
                    1, idx_start)) + meanYz
                rows = np.r_[result, y_predK.reshape(2, )]
                M_Queue.append(rows)

                y_prd.append(rows[idx_end:idx_end + 2])
                y_act.append(rows[idx_start:idx_end])

            del plsWindow[0:M]

            ez = M_Queue[M - 1][idx_start:idx_end] - M_Queue[
                M - 1][idx_end:idx_end + 2]
            ez_Queue.append(ez)

            if z == 0:
                ez = np.array([0, 0])
            npVM_Queue = np.array(M_Queue)
            npACT_Queue = np.array(M_Queue)

            # for i in range(M):  # build VM_Output without applying the lamda_pls weight
            #     if i == M - 1:
            #         temp = npM_Queue[i:i + 1, idx_start:idx_end]
            #     else:
            #         temp = npM_Queue[i:i + 1, idx_end:idx_end + 2]
            #     VM_Output.append(np.array([temp[0, 0], temp[0, 1]]))

            # Process-1's lamda_PLS weighting arrives already applied, so it must not be applied again here
            if self.p_VM[z - 1] is not None:
                npVM_Queue[0:M - 1, 0:idx_start -
                           2] = lamda_PLS * npVM_Queue[0:M - 1,
                                                       0:idx_start - 2]
                npVM_Queue[0:M - 1, idx_start -
                           2:idx_start] = self.p_lambda * npVM_Queue[
                               0:M - 1, idx_start - 2:idx_start]
                npVM_Queue[0:M - 1, idx_start:idx_end] = lamda_PLS * (
                    npVM_Queue[0:M - 1, idx_end:idx_end + 2] + 0.5 * ez
                )  # + 0.5 * ez
                npVM_Queue = npVM_Queue[:, 0:idx_end]

                npACT_Queue[0:M - 1, 0:idx_start -
                            2] = lamda_PLS * npACT_Queue[0:M - 1,
                                                         0:idx_start - 2]
                npACT_Queue[0:M - 1, idx_start -
                            2:idx_start] = lamda_PLS * npACT_Queue[0:M - 1,
                                                                   idx_start -
                                                                   2:idx_start]
                npACT_Queue[0:M - 1,
                            idx_start:idx_end] = lamda_PLS * npACT_Queue[
                                0:M - 1, idx_start:idx_end]
                npACT_Queue = npACT_Queue[:, 0:idx_end]  ## VM values collected over idx_start:idx_end
            else:
                npVM_Queue[0:M - 1,
                           0:idx_start] = lamda_PLS * npVM_Queue[0:M - 1,
                                                                 0:idx_start]
                npVM_Queue[0:M - 1, idx_start:idx_end] = lamda_PLS * (
                    npVM_Queue[0:M - 1, idx_end:idx_end + 2] + 0.5 * ez
                )  # + 0.5 * ez
                npVM_Queue = npVM_Queue[:, 0:idx_end]

                npACT_Queue[0:M - 1,
                            0:idx_start] = lamda_PLS * npACT_Queue[0:M - 1,
                                                                   0:idx_start]
                npACT_Queue[0:M - 1,
                            idx_start:idx_end] = lamda_PLS * npACT_Queue[
                                0:M - 1, idx_start:idx_end]
                npACT_Queue = npACT_Queue[:, 0:idx_end]  ## VM values collected over idx_start:idx_end

            for i in range(M):  # build VM_Output with the lamda_pls weight applied, to ease the next computation
                if i == M - 1:
                    temp = npACT_Queue[i:i + 1, idx_start:idx_end]
                else:
                    temp = npVM_Queue[i:i + 1, idx_start:idx_end]
                VM_Output.append(np.array([temp[0, 0], temp[0, 1]]))
                temp = npACT_Queue[i:i + 1, idx_start:idx_end]
                ACT_Output.append(np.array([temp[0, 0], temp[0, 1]]))

            for i in range(M):
                plsWindow.append(npVM_Queue[i])

            M_Mean = np.mean(plsWindow, axis=0)
            meanVz = M_Mean[0:idx_start]
            meanYz = M_Mean[idx_start:idx_end]

            plsModelData = plsWindow - M_Mean
            V = plsModelData[:, 0:idx_start]
            Y = plsModelData[:, idx_start:idx_end]

            self.pls_update(V, Y)

            del M_Queue[0:M]

        #y_act = np.array(y_act)

        y_act = np.array(self.real_ACT)
        y_prd = np.array(y_prd)
        ez_all_run = y_act - y_prd

        self.metric = metrics.explained_variance_score(y_act[:, 1:2],
                                                       y_prd[:, 1:2])
        print("VM Mean squared error: %.4f" %
              metrics.mean_squared_error(y_act[:, 1:2], y_prd[:, 1:2]))
        print("explained_variance_score: %.4f" % self.metric)
        print("VM r2 score: %.4f" %
              metrics.r2_score(y_act[:, 1:2], y_prd[:, 1:2]))
        #print("pls : ", self.pls.coef_)
        ez_run = np.array(ez_Queue)

        VM_Output = np.array(VM_Output)
        ACT_Output = np.array(ACT_Output)

        return VM_Output, ACT_Output, ez_run, y_act, y_prd, ez_all_run
Example #25
import time
import pickle
import sys

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import cross_val_score

n = sys.argv[1]
start = time.time()
with open(f"{n}.pickle", "rb") as f:
    datas = pickle.load(f)

models = {}

models["LASSO"] = LassoCV(max_iter=10000, cv=5, n_jobs=-1)
models["RIDGE"] = RidgeCV(cv=5)
models["EN"] = ElasticNetCV(max_iter=10000, cv=5, n_jobs=-1)
models["PLS20"] = PLSRegression(n_components=20, scale=False)

results = {}

for key in models.keys():
    model = models[key]
    results[key] = {"metrics": {}, "data": {}}
    for i in range(len(datas)):
        data = datas[i]
        q2 = cross_val_score(model,
                             data['train_X'],
                             data['train_Y'],
                             cv=5,
                             scoring='r2').mean()
        model.fit(data["train_X"], data["train_Y"])
        predict_Y = model.predict(data["test_X"])
Example #26
    def pls(self, x, y, param_info):
        pls = PLSRegression(n_components=param_info.pls_compnum)
        #pls = PLSRegression(n_components=param_info.pls_compnum, max_iter=1000000)
        #pls = PLSSVD(n_components=param_info.pls_compnum)
        self.learned_pls = pls.fit(x, y)
Example #27
    # scale all samples according to training set
    scaler = preprocessing.MinMaxScaler().fit(train_hydrogens)
    train_hydrogens_normalized = scaler.transform(train_hydrogens)
    test_hydrogens_normalized = scaler.transform(test_hydrogens)

    # one hot encode training labels for plsda
    train_labels_one_hot = []
    for i in np.ravel(train_labels):
        if i == 0:
            train_labels_one_hot.append([1, 0])
        else:
            train_labels_one_hot.append([0, 1])
    train_labels_one_hot = np.array(train_labels_one_hot)

    plsda = PLSRegression(n_components=30, scale=False)
    plsda.fit(train_hydrogens_normalized, train_labels_one_hot)

    test_pred_ = plsda.predict(test_hydrogens_normalized)

    test_pred = np.array([np.argmax(x) for x in test_pred_]).reshape(-1, 1)

    cm = confusion_matrix(test_labels, test_pred)

    auroc = roc_auc_score(test_labels, test_pred_[:, 1])
    auroc_folds.append(auroc)

    precision, recall, thresh = precision_recall_curve(test_labels,
                                                       test_pred_[:, 1])
    aupr = auc(recall, precision)
    aupr_folds.append(aupr)
Example #28
    def stacklearning(self):
        class sparseNorm(BaseEstimator, TransformerMixin):
            def __init__(self):
                pass

            def fit(self, X, y=None):
                return self

            def transform(self, X):
                from sklearn import preprocessing
                Y = preprocessing.normalize(sp.sparse.csc_matrix(X.values))
                return Y
        fm = sgd.FMRegression(
            n_iter=4743,
            init_stdev=0.1,
            rank=100,
            l2_reg_w=0,
            l2_reg_V=0,
            step_size=0.1,
        )
        fm = sgd.FMRegression(
            n_iter=9943,
            init_stdev=0.1,
            rank=219,
            l2_reg_w=0,
            l2_reg_V=0.06454,
            step_size=0.1,
        )
        pipe = make_pipeline(sparseNorm(), fm)
        calcACC(pipe, X=X2)

        xgb = xgboost.XGBRegressor(
                    n_estimators=100,
                    max_depth=7,
                    gamma=0,
                    colsample_bytree=0.1
                )
        lgbm = LGBMRegressor(
            boosting_type='gbdt', num_leaves=367,
            learning_rate=0.06,feature_fraction=0.14,
            max_depth=28, min_data_in_leaf=8
        )
        rgf = RGFRegressor(
            max_leaf=1211, algorithm="RGF", test_interval=100,
            loss="LS", verbose=False, l2=0.93,
            min_samples_leaf=2
        )
        rf = RandomForestRegressor(
            max_depth=20, random_state=0,
            n_estimators=56,min_samples_split=2,
            max_features=0.21
        )
        rf = RandomForestRegressor()
        ext = ExtraTreesRegressor(
            n_estimators=384,max_features= 2228,
            min_samples_split= 0.01,max_depth= 856,
            min_samples_leaf= 1
        )
        svr = SVR(
            gamma=9.5367431640625e-07,
            epsilon=0.0009765625,
            C= 2048.0
        )

        #test combination
        desNew = make_pipeline(extdescriptorNew(),rf)
        morNew = make_pipeline(extMorganNew(),rf)
        kotNew = make_pipeline(extklekotaTothNew(),rf)
        macNew = make_pipeline(extMACCSNew(),rf)

        desMac = make_pipeline(extDescriptorMACCS(),rf)
        morMac = make_pipeline(extMorganMACCS(),rf)
        kotMac = make_pipeline(extKlekotaTothMACCS(),rf)

        morKotNew = make_pipeline(extMorganKlekotaTothNew(),rf)
        des = make_pipeline(extOnlyDescriptor(),rf)
        mor = make_pipeline(extOnlyMorgan(),rf)
        kot = make_pipeline(extOnlyklekotaToth(),rf)
        mac = make_pipeline(extOnlyMACCS(),rf)
        all = make_pipeline(extAll(),rf)
        allwithoutNew = make_pipeline(extAllwithoutNew(),rf)
        allwithoutMaccs = make_pipeline(extAllwithoutMaccs(),rf)
        allwithoutDes = make_pipeline(extAllwithoutDescriptor(),rf)

        testDic = {
            "Desc+New": desNew, "Mor+New": morNew, "kot+New": kotNew, "MACCS+New": macNew,
            "Des+MAC": desMac, "Morgan+Maccs": morMac, "Kot+MACCS": kotMac, "mor+kot+New": morKotNew,
            "descriptor": des, "morgan": mor, "kot": kot, "MACCS": mac, "All": all,
            "All without new": allwithoutNew, "All without MACCS": allwithoutMaccs,
            "All without Des": allwithoutDes}

        #10fold
        cv = KFold(n_splits=10, shuffle=True, random_state=0)

        #Fingerprinttest
        resultDic={}
        resultDic2={}
        for name,model in testDic.items():
            #model = StackingRegressor(regressors=[name], meta_regressor=rf,verbose=1)
            #calcACC(model,X=X,y=y2,name=name)

            Scores = cross_validate(model, X2, y2, cv=cv,scoring=myScoreFunc)
            RMSETmp = Scores['test_RMSE'].mean()
            CORRTmP = Scores['test_Correlation coefficient'].mean()
            resultDic.update({name:[RMSETmp,CORRTmP]})
            print(name,RMSETmp,CORRTmP)

        #stacking
        alldata = make_pipeline(extAll())
        # random forest
        #1.1546 0.70905
        stack = StackingRegressor(regressors=[alldata], meta_regressor=rf,verbose=1)

        # Light Gradient boosting
        # 1.160732 0.703776
        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=lgbm,verbose=1)

        # XGboost
        # 1.1839805 0.689571
        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=xgb,verbose=1)

        # Regularized greedily forest
        # 1.17050 0.6992
        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=rgf,verbose=1)

        #pls 22.808047774809697 0.6410026452910016 i=4
        for i in np.arange(3,11,1):
            pls = PLSRegression(n_components=i)
            testmodel = StackingRegressor(regressors=[alldata], meta_regressor=pls,verbose=0)
            calcACC(testmodel)
        pls = PLSRegression(n_components=4)

        #SVR
        svr = SVR(gamma=9.5367431640625/10000000,C=1559.4918100725592,
                  epsilon=0.0009765625,)
        svr = SVR(kernel='rbf',gamma=9.5367431640625e-07,epsilon=0.0009765625,C=2048.0)

        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=svr, verbose=1)
        calcACC(svr)

        #Extratree  1.157420824123527 0.7061010221224269
        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=ext, verbose=1)
        calcACC(testmodel)

        #k-NN
        nbrs = KNeighborsRegressor(3)

        ##Linear regressions
        #Stochastic Gradient Descent
        sgd = SGDRegressor(max_iter=1000)
        # Ridge
        for i in [1,10,100,1000]:
            ridge = Ridge(alpha=i)
            calcACC(ridge)
        ridge = Ridge(alpha=45.50940042350705)
        calcACC(ridge)
        # multiple linear
        lin = make_pipeline(forlinear(),LinearRegression(n_jobs=-1))
        calcACC(lin)



        #stacking
        #0.69
        testmodel = StackingRegressor(regressors=[alldata,nbrs,all], meta_regressor=rf,verbose=1)
        #1.1532 0.70926
        testmodel = StackingRegressor(regressors=[alldata,nbrs,all,xgb,lgbm,rgf], meta_regressor=rf,
                              verbose=1)
        #1.16420 0.7041
        testmodel = StackingRegressor(regressors=[alldata,alldata,all], meta_regressor=rf,verbose=1)
        #1.16379 0.7044
        stack1 = StackingRegressor(regressors=[alldata,nbrs,all,xgb,lgbm,rgf], meta_regressor=rf,verbose=1)
        testmodel  = StackingRegressor(regressors=[alldata,stack1,stack1], meta_regressor=rf,verbose=1)
        #1.1535496740699531 0.7108839199109559
        pcaFeature = make_pipeline(extPCA())
        testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf]
                                      ,meta_regressor=rf,verbose=1)
        #1.181801005432221 0.6889745579620922
        testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf]
                                      ,meta_regressor=lgbm,verbose=1)
        #0.70613
        testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf,ext]
                                      ,meta_regressor=xgb,verbose=1)
        #0.71641717
        testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf,ext]
                                      ,meta_regressor=rf,verbose=1)
        #0.7146922
        testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,ridge,rf,xgb,lgbm,rgf,ext]
                                      ,meta_regressor=rf,verbose=1)

        #new features
        pcaFeature = make_pipeline(extPCA())

        #old
        pipe1 = make_pipeline(extMACCS(), rf)
        pipe2 = make_pipeline(extMorgan(), rf)
        pipe3 = make_pipeline(extDescriptor(), rf)

        pipe4 = make_pipeline(extPCA(), rgf)
        pipe7 =make_pipeline(extDescriptor(), rgf)
        pipe8 =make_pipeline(extDescriptor(), rgf)

        xgb = xgboost.XGBRegressor()
        nbrs = KNeighborsRegressor(2)
        svr = SVR(gamma='auto',kernel='linear')

        pls = PLSRegression(n_components=4)

        extMACCSdata = make_pipeline(extMACCS())

        nbrsPipe = make_pipeline(extMorgan(), nbrs)
        pipe6 = make_pipeline(extMACCS(), rgf)
        alldata = make_pipeline(extAll())
        ave = extAverage()
        withoutdesc =  make_pipeline(extMACCS())

        meta = RandomForestRegressor(max_depth=20, random_state=0, n_estimators=400)
        #stack1 = StackingRegressor(regressors=[rgf, nbrs, alldata], meta_regressor=rgf, verbose=1)

        #0.70
        stack = StackingRegressor(regressors=[pipe1,pipe2,pipe3,xgb,lgbm,rgf,rf], meta_regressor=ave, verbose=1)

        #stack2 = StackingRegressor(regressors=[stack1,nbrs, svr,pls,rgf], meta_regressor=lgbm, verbose=1)

        #0.69######################
        stack1 = StackingRegressor(regressors=[pipe1,pipe2,pipe3], meta_regressor=rf, verbose=1)
        #0.70
        stack2 = StackingRegressor(regressors=[stack1,alldata,rgf,lgbm,xgb], meta_regressor=rf,verbose=1)

        #0.71
        stack3 = StackingRegressor(regressors=[stack2,pipe1], meta_regressor=ave, verbose=1)
        ###########################
        ###########################
        stack1 = StackingRegressor(regressors=[pipe1,pipe2,pipe3], meta_regressor=rf, verbose=1)
        stack2 = StackingRegressor(regressors=[stack1,withoutdesc,lgbm,rgf], meta_regressor=rf,verbose=1)
        stack3 = StackingRegressor(regressors=[stack2,pipe1,xgb], meta_regressor=ave, verbose=1)
        ###########################

        #stackingwithknn
        stack1 = StackingRegressor(regressors=[pipe1,pipe2,pipe3], meta_regressor=rf, verbose=1)
        stack2 = StackingRegressor(regressors=[stack1,nbrs,pipe1], meta_regressor=rf, verbose=1)


        #stack3 = StackingRegressor(regressors=[rgf, nbrs, alldata], meta_regressor=ave, verbose=1)

        cv = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
        cv = KFold(n_splits=10, shuffle=True, random_state=0)
        St1Scores = cross_validate(stack1,X,y,cv=cv)
        St1Scores['test_score'].mean()**(1/2)

        St2Scores = cross_validate(stack2,X,y,cv=cv)
        St2Scores['test_score'].mean()**(1/2)

        St3Scores = cross_validate(stack3,X,y,cv=cv)
        St3Scores['test_score'].mean()**(1/2)

        stackScore = cross_validate(stack, X, y, cv=cv)
        stackScore['test_score'].mean()**(1/2)

        lgbmScores =cross_validate(lgbm,X,y,cv=cv)
        lgbmScores['test_score'].mean()**(1/2)

        rgfScores = cross_validate(rgf,X,y,cv=cv)
        rgfScores['test_score'].mean()**(1/2)

        RFScores = cross_validate(rf,X,y,cv=cv)
        RFScores['test_score'].mean()**(1/2)

        scores = cross_validate(stack2,X,y,cv=cv)
        scores['test_score'].mean()**(1/2)
        print("R^2 Score: %0.2f (+/- %0.2f) [%s]" % (scores['test_score'].mean(), scores['test_score'].std(), 'stacking'))

        stack3.fit(X, y)
        y_pred = stack3.predict(X_train)
        y_val = stack3.predict(X_test)
        #stack3.score(X_train, y_train)
        exX = preprocess(extractDf, changeList)
        valy =  (10 **(stack3.predict(exX))).tolist()
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))

        stack1.fit(X, y)
        valy =  (10 **(stack1.predict(exX))).tolist()

        sgd.fit(X,y)
        valy =  (10 **(sgd.predict(exX))).tolist()

        rgfpipe = make_pipeline(extMACCS(), rf)
        rgf.fit(X,y)
        valy =  (10 **(rgf.predict(exX))).tolist()

        nbrs.fit(X,y)
        valy =  (10 **(nbrs.predict(exX))).tolist()

        pipe = make_pipeline(extMACCS(), rf)
        pipe.fit(X,y)
        valy =  (10 **(pipe.predict(exX))).tolist()


        rf.fit(X, y)
        y_pred = rf.predict(X_train)
        y_val = rf.predict(X_test)
        exX = preprocess(extractDf, changeList)
        valy =  (10 **(rf.predict(exX))).tolist()
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))

        lgbm.fit(X, y)
        #y_pred = pipe1.predict(X_train)
        #y_val = pipe1.predict(X_test)
        exX = preprocess(extractDf, changeList)
        valy =  (10 **(lgbm.predict(exX))).tolist()
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))
Example #29
def pls_thing(scenario_data, xcols, ycols, titlestr):
    #PLS Summary Stats
    pls = PLSRegression(n_components=3)
    pls.fit(scenario_data[xcols], scenario_data[ycols])
    k = 0
    transformed_x_full = pls.transform(scenario_data[xcols])
    y = scenario_data[ycols]

    results = pd.DataFrame(columns=('Case Label', 'Explained Variance Ratio',
                                    'RegressionCoefs', 'Regression R^2',
                                    'SpearmanCorr', 'SpearmanPvalue',
                                    'Loadings', 'X Weights', 'X Loadings',
                                    'X Scores'))

    if isinstance(titlestr, list):
        titlestr = ' '.join(titlestr)

    #Linear fits for each individual component
    for c in range(np.shape(pls.x_weights_)[1]):
        x_transformed_1pc = transformed_x_full[:, k].reshape(-1, 1)
        lr = linear_model.LinearRegression(fit_intercept=True, normalize=True)  # normalize= exists only in older scikit-learn; newer releases expect pre-scaled inputs
        lr.fit(x_transformed_1pc, y)
        print('Regression Coefs', lr.coef_)
        print('R^2', lr.score(x_transformed_1pc, y))
        print('Spearman: ', scipy.stats.spearmanr(x_transformed_1pc, y))
        print('Component: ', c)
        results.loc[len(results)] = np.nan
        results.loc[len(results) - 1,
                    'Case Label'] = titlestr + ' Component ' + str(k)
        #        results.loc[len(results)-1,'Explained Variance Ratio'] = pls.explained_variance_ratio_[k]
        results.set_value(len(results) - 1, 'RegressionCoefs', lr.coef_)  # DataFrame.set_value was removed in pandas 1.0; newer code would use .at
        results.loc[len(results) - 1,
                    'Regression R^2'] = lr.score(x_transformed_1pc, y)
        results.loc[len(results) - 1, 'SpearmanCorr'] = scipy.stats.spearmanr(
            x_transformed_1pc, y)[0]
        results.loc[len(results) - 1,
                    'SpearmanPvalue'] = scipy.stats.spearmanr(
                        x_transformed_1pc, y)[1]
        results.set_value(len(results) - 1, 'X Weights', pls.x_weights_[:, k])
        results.set_value(
            len(results) - 1, 'X Loadings', pls.x_loadings_[:, k])
        results.set_value(len(results) - 1, 'X Scores', pls.x_scores_[:, k])

        plt.plot(x_transformed_1pc, y, '*')
        plt.xlabel('Component ' + str(k))
        plt.ylabel('Performance')
        plt.title('PLS ' + titlestr)
        plt.show()
        k += 1
        print(results)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.set_title("PLS PC0 vs PC1 vs Performance " + ' '.join(cs), fontsize=14)
    ax.set_xlabel("PC0", fontsize=12)
    ax.set_ylabel("PC1", fontsize=12)
    ax.scatter(transformed_x_full[:, 0],
               transformed_x_full[:, 1],
               s=100,
               c=y,
               marker='*',
               cmap=cm.bwr)
    plt.show()

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(transformed_x_full[:, 0],
               transformed_x_full[:, 1],
               transformed_x_full[:, 2],
               s=100,
               c=y,
               marker='*',
               cmap=cm.bwr)
    ax.set_title("PLS PC0 vs PC1 vs PC2 vs Performance " + ' '.join(cs),
                 fontsize=14)
    ax.set_xlabel("PC0", fontsize=12)
    ax.set_ylabel("PC1", fontsize=12)
    ax.set_zlabel("PC2", fontsize=12)
    plt.show()

    print(results)
    return results
Example #30
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
for c, i, target_name in zip("rb", target_names, target_names):
    ax.scatter(X_r[y == i, 0], X_r[y == i, 1], X_r[y == i, 2], c=c)
ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
ax.set_zlabel('Z Label')
plt.axis('equal')
ax.set_xlim([-1000,4000])
ax.set_ylim([-1000,4000])
ax.set_zlim([-1000,4000])

plt.show()

# part b
PLS1 = PLS(n_components=3)
number_map = {"M": 0, "B": 1}
numeric_y = np.array([number_map[v] for v in y])  # list comprehension; np.array(map(...)) breaks on Python 3
result = PLS1.fit_transform(x, numeric_y)
X_r = result[0]
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
for c, i, target_name in zip("rb", target_names, target_names):
    ax.scatter(X_r[y == i, 0], X_r[y == i, 1], X_r[y == i, 2], c=c)
ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
ax.set_zlabel('Z Label')
plt.axis('equal')

plt.show()
Example #31
def make_plots(m,data,colors,names,groundtruth=None,waves=None,sample_size=10,ux=0,
               remove_mean=False,log_x=False,ylim=(0.3,1),res_out='',title=None):
    inds_sup_train = np.random.choice(data['X'].shape[0],size=sample_size)
    inds_sup_valid = np.random.choice(data['X_valid'].shape[0],size=sample_size)
    inds_train_x = np.random.choice(data['X_'].shape[0],size=sample_size)
    inds_train_y = np.random.choice(data['_y'].shape[0],size=sample_size)

    y = np.hstack([data['y'],1-data['y'].sum(axis=1,keepdims=True)])
    y_valid = np.hstack([data['y_valid'],1-data['y_valid'].sum(axis=1,keepdims=True)])
    y_corners = np.vstack((np.eye(data['y'].shape[1]),np.zeros(data['y'].shape[1]))).astype('float32')

    simplex = []
    for point in product(*([np.linspace(0,1,50)]*y.shape[1])):
        if np.sum(point) == 1:
            simplex += [point]
    simplex = np.asarray(simplex).astype('float32')
    simplex = simplex[:,:-1]

    if waves is None:
        waves = np.arange(data['X'].shape[1])

    if remove_mean:
        _ux = ux
    else:
        _ux = 0

    if log_x:
        f = lambda x: np.exp(x)
    else:
        f = lambda x: x

    if ylim is not None:
        force_ylim = True

    pls_XY = PLSRegression(n_components=8,scale=False)
    pls_XY.fit(data['X'],y)
    pred_train_pls = pls_XY.predict(data['X'])
    pred_train_pls = (pred_train_pls.T/np.sum(pred_train_pls,axis=1)).T
    pred_valid_pls = pls_XY.predict(data['X_valid'])
    pred_valid_pls = (pred_valid_pls.T/np.sum(pred_valid_pls,axis=1)).T
    score_pred_train_pls = KL(pred_train_pls,y)
    score_pred_valid_pls = KL(pred_valid_pls,y_valid)

    pls_YX = PLSRegression(n_components=min(8,y.shape[1]),scale=False)
    pls_YX.fit(y,data['X'])
    gen_train_pls = pls_YX.predict(y)
    gen_valid_pls = pls_YX.predict(y_valid)
    score_gen_train_pls = L2(gen_train_pls,data['X'])
    score_gen_valid_pls = L2(gen_valid_pls,data['X_valid'])

    pred_train = m.predict(x=data['X'],deterministic=True)
    pred_train = np.hstack([pred_train,1-pred_train.sum(axis=1,keepdims=True)])
    score_pred_train = KL(pred_train,y)
    pred_valid = m.predict(x=data['X_valid'],deterministic=True)
    pred_valid = np.hstack([pred_valid,1-pred_valid.sum(axis=1,keepdims=True)])
    score_pred_valid = KL(pred_valid,y_valid)

    if m.model_type in [1,2]:
        z2_train = m.getZ2(x=data['X'],y=data['y'],deterministic=True)
        z2_valid = m.getZ2(x=data['X_valid'],y=data['y_valid'],deterministic=True)
        z2_train_mean = z2_train.mean(axis=0)
        z2_valid_mean = z2_valid.mean(axis=0)
        z2_gen_train = z2_train_mean*np.ones_like(z2_train).astype('float32')
        z2_gen_valid = z2_valid_mean*np.ones_like(z2_valid).astype('float32')
        z2_gen_manifold = z2_valid_mean*np.ones((simplex.shape[0],z2_valid.shape[1])).astype('float32')
        z2_gen_endmembers = z2_train_mean*np.ones((y_corners.shape[0],z2_train.shape[1])).astype('float32')
        gen_train = f(_ux + m.generate(y=data['y'][inds_sup_train],z2=z2_gen_train[inds_sup_train],deterministic=True))  # true by default for non-variational, variational default is False
        gen_valid = f(_ux + m.generate(y=data['y_valid'][inds_sup_valid],z2=z2_gen_valid[inds_sup_valid],deterministic=True))
        manifold = f(_ux + m.generate(y=simplex,z2=z2_gen_manifold,deterministic=True))
        endmembers = f(_ux + m.generate(y=y_corners,z2=z2_gen_endmembers,deterministic=True))
        if m.variational:
            endmembers_dists = []
            for idx_c, c in enumerate(y_corners):
                endmembers_dist = [f(_ux + m.generate(y=np.atleast_2d(c),z2=z2_gen_endmembers[idx_c:idx_c+1],deterministic=False)).squeeze() for i in range(sample_size)]
                endmembers_dists += [np.asarray(endmembers_dist)]
    else:
        gen_train = f(_ux + m.generate(y=data['y'][inds_sup_train],deterministic=True))  # true by default for non-variational, variational default is False
        gen_valid = f(_ux + m.generate(y=data['y_valid'][inds_sup_valid],deterministic=True))
        manifold = f(_ux + m.generate(y=simplex,deterministic=True))
        endmembers = f(_ux + m.generate(y=y_corners,deterministic=True))
        if m.variational:
            endmembers_dists = []
            for idx_c, c in enumerate(y_corners):
                endmembers_dist = [f(_ux + m.generate(y=np.atleast_2d(c),deterministic=False)).squeeze() for i in range(sample_size)]
                endmembers_dists += [np.asarray(endmembers_dist)]
    recon_train = f(_ux + m.generate(x=data['X_'][inds_train_x],deterministic=True))
    recon_sup_valid = f(_ux + m.generate(x=data['X_valid'][inds_sup_valid],deterministic=True))

    fs = 24
    fs_tick = 18

    # change xticks to be names
    p = 100
    plt.plot(p*y[inds_sup_train][0],'k',lw=2,label='Ground Truth')
    ssdgm_label = 'SSDGM ({:.3f})'.format(score_pred_train)
    plt.plot(p*pred_train[inds_sup_train][0],'r-.',lw=2,label=ssdgm_label)
    pls_label = 'PLS ({:.3f})'.format(score_pred_train_pls)
    plt.plot(p*pred_train_pls[inds_sup_train][0],'b-.',lw=2,label=pls_label)
    plt.plot(p*y[inds_sup_train].T,'k',lw=2)
    plt.plot(p*pred_train[inds_sup_train].T,'r-.',lw=2)
    plt.plot(p*pred_train_pls[inds_sup_train].T,'b-.',lw=2)
    plt.title('Predicting Composition - Training Error', fontsize=fs)
    plt.ylabel('Composition (%)', fontsize=fs)
    ax = plt.gca()
    ax.set_ylim((0,1*p))
    ax.set_xticks(np.arange(y.shape[1]))
    ax.set_xticklabels(names, fontsize=fs)
    ax.tick_params(axis='x',direction='out',top='off',length=10,labelsize=fs_tick)
    lgd = plt.legend(loc='center left',bbox_to_anchor=(1, 0.5))
    ax = plt.gca()
    plt.savefig(res_out+'/comp_train.png',additional_artists=[lgd],bbox_inches='tight')
    plt.close()

    plt.plot(p*y_valid[inds_sup_valid][0],'k',lw=2,label='Ground Truth')
    ssdgm_label = 'SSDGM ({:.3f})'.format(score_pred_valid)
    plt.plot(p*pred_valid[inds_sup_valid][0],'r-.',lw=2,label=ssdgm_label)
    pls_label = 'PLS ({:.3f})'.format(score_pred_valid_pls)
    plt.plot(p*pred_valid_pls[inds_sup_valid][0],'b-.',lw=2,label=pls_label)
    plt.plot(p*y_valid[inds_sup_valid].T,'k',lw=2)
    plt.plot(p*pred_valid[inds_sup_valid].T,'r-.',lw=2)
    plt.plot(p*pred_valid_pls[inds_sup_valid].T,'b-.',lw=2)
    plt.title('Predicting Composition - Validation Error', fontsize=fs)
    plt.ylabel('Composition (%)', fontsize=fs)
    ax = plt.gca()
    ax.set_ylim((0,1*p))
    ax.set_xticks(np.arange(y.shape[1]))
    ax.set_xticklabels(names, fontsize=fs)
    ax.tick_params(axis='x',direction='out',top='off',length=10,labelsize=fs_tick)
    lgd = plt.legend(loc='center left',bbox_to_anchor=(1, 0.5))
    ax = plt.gca()
    plt.savefig(res_out+'/comp_valid.png',additional_artists=[lgd],bbox_inches='tight')
    plt.close()

    plt.plot(waves,f(_ux+data['X'][inds_sup_train]).T,'k')
    plt.plot(waves,gen_train.T,'r-.')
    plt.title('Generating Spectra - Training Error', fontsize=fs)
    plt.xlabel('Channels', fontsize=fs)
    plt.ylabel('Intensities', fontsize=fs)
    plt.tick_params(axis='both', which='major', labelsize=fs_tick)
    if force_ylim:
        plt.gca().set_ylim(ylim)
    plt.savefig(res_out+'/genspectra_train.png')
    plt.close()

    plt.plot(waves,f(_ux+data['X_valid'][inds_sup_valid]).T,'k')
    plt.plot(waves,gen_valid.T,'r-.')
    plt.title('Generating Spectra - Validation Error', fontsize=fs)
    plt.xlabel('Channels', fontsize=fs)
    plt.ylabel('Intensities', fontsize=fs)
    plt.tick_params(axis='both', which='major', labelsize=fs_tick)
    if force_ylim:
        plt.gca().set_ylim(ylim)
    plt.savefig(res_out+'/genspectra_valid.png')
    plt.close()

    if m.variational:
        for endmember, color, name in zip(endmembers,colors,names):
            plt.plot(waves,endmember,color=color,lw=2,label=name)
        for endmember_dist, color in zip(endmembers_dists,colors):
            plt.plot(waves,endmember_dist.T,'-.',color=color,lw=1)
        plt.title('Generating Endmembers with Distributions', fontsize=fs)
        plt.xlabel('Channels', fontsize=fs)
        plt.ylabel('Intensities', fontsize=fs)
        plt.tick_params(axis='both', which='major', labelsize=fs_tick)
        lgd = plt.legend(loc='center left',bbox_to_anchor=(1, 0.5))
        ax = plt.gca()
        if force_ylim:
            ax.set_ylim(ylim)
        plt.savefig(res_out+'/endmembers_dist.png',additional_artists=[lgd],bbox_inches='tight')
        plt.close()

    for endmember, color, name in zip(endmembers,colors,names):
        plt.plot(waves,endmember,color=color,lw=2,label=name)
    plt.title('Generating Endmembers', fontsize=fs)
    plt.xlabel('Channels', fontsize=fs)
    plt.ylabel('Intensities', fontsize=fs)
    plt.tick_params(axis='both', which='major', labelsize=fs_tick)
    lgd = plt.legend(loc='center left',bbox_to_anchor=(1, 0.5))
    if m.variational:
        plt.gca().set_ylim(ax.get_ylim())
    if force_ylim:
        plt.gca().set_ylim(ylim)
    plt.savefig(res_out+'/endmembers_means.png',additional_artists=[lgd],bbox_inches='tight')
    plt.close()

    for endmember, color, name in zip(endmembers,colors,names):
        plt.plot(waves,endmember,color=color,lw=2,label=name)
    for endmember, color, name in zip(groundtruth,colors,names):
        plt.plot(waves,endmember[:len(waves)],color=color,lw=6,alpha=0.4)
    score_gen_endmembers = L2(endmembers,groundtruth[:,:len(waves)])
    if title is None:
        plt.title('Generating Endmembers with Ground Truth ({:.3f})'.format(score_gen_endmembers), fontsize=fs)
    else:
        plt.title(title+' ({:.3f})'.format(score_gen_endmembers), fontsize=fs)
    plt.xlabel('Channels', fontsize=fs)
    plt.ylabel('Intensities', fontsize=fs)
    plt.tick_params(axis='both', which='major', labelsize=fs_tick)
    lgd = plt.legend(loc='lower right', fontsize=fs)
    # lgd = plt.legend(loc='center left',bbox_to_anchor=(1, 0.5))
    if m.variational:
        plt.gca().set_ylim(ax.get_ylim())
    if force_ylim:
        plt.gca().set_ylim(ylim)
    plt.savefig(res_out+'/endmembers_means_with_groundtruth.png',additional_artists=[lgd],bbox_inches='tight')
    plt.close()

    plt.plot(waves,manifold.T,color='lightgray',lw=1,alpha=0.1)
    for endmember, color, name in zip(groundtruth,colors,names):
        plt.plot(waves,endmember[:len(waves)],color=color,lw=6,alpha=1.0)
    plt.title('Spectral Manifold', fontsize=fs)
    plt.xlabel('Channels', fontsize=fs)
    plt.ylabel('Intensities', fontsize=fs)
    plt.tick_params(axis='both', which='major', labelsize=fs_tick)
    lgd = plt.legend(loc='center left',bbox_to_anchor=(1, 0.5))
    if m.variational:
        plt.gca().set_ylim(ax.get_ylim())
    if force_ylim:
        plt.gca().set_ylim(ylim)
    plt.savefig(res_out+'/manifold.png',bbox_inches='tight')
    plt.close()

    plt.plot(waves,f(_ux+data['X_'][inds_train_x]).T,'k')
    plt.plot(waves,recon_train.T,'r-.')
    plt.title('Reconstructing Spectra - Training Error', fontsize=fs)
    plt.xlabel('Channels', fontsize=fs)
    plt.ylabel('Intensities', fontsize=fs)
    plt.tick_params(axis='both', which='major', labelsize=fs_tick)
    if force_ylim:
        plt.gca().set_ylim(ylim)
    plt.savefig(res_out+'/recon_train.png')
    plt.close()

    plt.plot(waves,f(_ux+data['X_valid'][inds_sup_valid]).T,'k')
    plt.plot(waves,recon_sup_valid.T,'r-.')
    plt.title('Reconstructing Spectra - Validation Error', fontsize=fs)
    plt.xlabel('Channels', fontsize=fs)
    plt.ylabel('Intensities', fontsize=fs)
    plt.tick_params(axis='both', which='major', labelsize=fs_tick)
    if force_ylim:
        plt.gca().set_ylim(ylim)
    plt.savefig(res_out+'/recon_valid.png')
    plt.close()

    if m.model_type in [1,2]:
        # need to use vertical lines to denote edges of datasets
        # write dataset i in middle of range on xlabel
        for i in range(z2_train.shape[1]):
            plt.plot(z2_train[:,i],'r-.')
            plt.title('Nuisance Variable '+str(i)+' - Training', fontsize=fs)
            plt.tick_params(axis='both', which='major', labelsize=fs_tick)
            plt.savefig(res_out+'/nuisance_train_'+str(i)+'.png')
            plt.close()

            plt.plot(z2_valid[:,i],'r-.')
            ax = plt.gca()
            ylim = ax.get_ylim()
            # should make this general if possible
            plt.plot([1866,1866],[-5,5],'k--')
            plt.plot([1866+1742,1866+1742],[-5,5],'k--')
            # plt.plot([1866+1742+1746,1866+1742+1746],[-5,5],'k--')
            ax.set_ylim(ylim)
            plt.title('Nuisance Variable '+str(i)+' - Validation', fontsize=fs)
            plt.tick_params(axis='both', which='major', labelsize=fs_tick)
            plt.savefig(res_out+'/nuisance_valid_'+str(i)+'.png')
            plt.close()
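
make_plots calls KL and L2 scoring helpers that are defined elsewhere in the source. A minimal sketch of what they plausibly compute (mean KL divergence between predicted and true compositions, and mean Euclidean distance between predicted and true spectra); the argument order and KL direction are assumptions, not taken from the original:

import numpy as np

def KL(P, Q, eps=1e-12):
    # mean KL divergence between rows of Q (reference) and P (prediction);
    # direction is an assumption
    P = np.clip(P, eps, None)
    Q = np.clip(Q, eps, None)
    return np.mean(np.sum(Q * np.log(Q / P), axis=1))

def L2(A, B):
    # mean Euclidean distance between corresponding rows of A and B
    return np.mean(np.linalg.norm(np.asarray(A) - np.asarray(B), axis=1))
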
Exemple #32
0

(Xtrain, ytrain) = loadData(xtrainpath, ytrainpath)
(Xtest, ytest) = loadData(xtestpath, ytestpath)

#trim off background and scale
ytrain=ytrain[:,1:]
#ytrain=scale(ytrain)
Xtrain=standardize(Xtrain)

#trim off background and scale
ytest = ytest[:,1:]
#ytest = scale(ytest)
Xtest = standardize(Xtest)

pls = PLSRegression(n_components=10)
pls.fit(Xtrain, ytrain)
y_pls = pls.predict(Xtest)
print 1 + pls.score(Xtest, ytest)


pls_rmse = [sqrt(mean_squared_error(ytest[:, i], y_pls[:, i])) for i in range(4)]

fig = plt.figure(figsize=(20,10))

ax1 = fig.add_subplot(241)
ax1.plot(y_pls[:,0], c='r', label='PLS Fit')
Exemple #33
0
import os
import numpy as np
from matplotlib import pyplot as plt
from sklearn.cross_decomposition import PLSRegression
from sklearn import metrics

os.chdir("D:/01. CLASS/Machine Learning/")
pls = PLSRegression(n_components=6, scale=False, max_iter=50000, copy=True)
init_lamda_PLS = 1
lamda_PLS = 1

Tgt = np.array([0, 50])
A_p1 = np.array([[0.5, -0.2], [0.25, 0.15]])
d_p1 = np.array([[0.1, 0], [0.05, 0]])
C_p1 = np.transpose(
    np.array([[0, 0.5, 0.05, 0, 0.15, 0], [0.085, 0, 0.025, 0.2, 0, 0]]))

sample_init_EP = []
sample_vm_EP = []
sample_init_VP = []
sample_vm_VP = []

np.random.seed(1000000)  # 4

I = np.identity(2)

# L1_SC = 0.55
# L2_SC = 0.75

L1_SC = 0.45
L2_SC = 0.35
Exemple #34
0
    def __init__(self, dataset):
        pitchRoll = dataset[:, [2, 3]]
        motorPos = dataset[:, [0, 1]]

        self.polyModelLeft = PLSRegression(n_components=2)
        self.polyModelLeft.fit(pitchRoll, motorPos)
Exemple #35
0
dms = pd.get_dummies(data[['League', 'Division', 'NewLeague']])

# Prepare the data
y = data['Salary']
x_ = data.drop(['Salary', 'League', 'Division', 'NewLeague'],
               axis=1).astype('float64')
x = pd.concat([x_, dms[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)

# Split into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42)

# Regression model and parameter grid
pls_model_setup = PLSRegression(scale=True, max_iter=5000, n_components=5)
param_grid = {'n_components': range(1, 20)}

# Tune parameters with GridSearchCV and fit the model
gsearch = GridSearchCV(pls_model_setup, param_grid)
pls_model = gsearch.fit(x_train, y_train)

# Print the coefficients
print('Partial Least Squares Regression coefficients:',
      pls_model.best_estimator_.coef_)

# Predict on the test set
pls_prediction = pls_model.predict(x_test)

# Compute R2 and mean squared error
pls_r2 = r2_score(y_test, pls_prediction)
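
After the grid search above, the selected component count and its cross-validated score can be read straight off the fitted GridSearchCV object:

print('Best n_components:', pls_model.best_params_['n_components'])
print('Best CV score:', pls_model.best_score_)
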
Exemple #36
0
def VIP(X, Y, H, NumDes):
    from sklearn.cross_decomposition import PLSRegression
    import numpy as np
    from sklearn.cross_validation import KFold
    import PCM_workflow as PW

    print('############## VIP is being processed ###############')
    M = list(X.keys())

    H_VIP, X_VIP, Y_VIP, HArray = {}, {}, {}, {}
    NumDesVIP = np.zeros((13, 6), dtype=int)
    for kk in M:
        Xtrain, Ytrain = X[kk], Y
        kf = KFold(len(Ytrain), 10, indices=True, shuffle=True, random_state=1)
        HH = H[kk]
        nrow, ncol = np.shape(Xtrain)

        ArrayYpredCV, Q2, RMSE_CV, OptimalPC = PW.CV_Processing(
            Xtrain, Ytrain, kf)

        plsmodel = PLSRegression(n_components=OptimalPC)
        plsmodel.fit(Xtrain, Ytrain)
        x_scores = plsmodel.x_scores_
        x_weighted = plsmodel.x_weights_
        m, p = nrow, ncol
        m, h = np.shape(x_scores)
        p, h = np.shape(x_weighted)
        X_S, X_W = x_scores, x_weighted

        co = []
        for i in range(h):
            corr = np.corrcoef(np.squeeze(Ytrain), X_S[:, i])
            co.append(corr[0][1]**2)
        s = sum(co)
        vip = []
        for j in range(p):
            d = []
            for k in range(h):
                d.append(co[k] * X_W[j, k]**2)
            q = sum(d)
            vip.append(np.sqrt(p * q / s))

        idx_keep = [idx for idx, val in enumerate(vip) if val >= 1]

        idxDes = NumDes[int(kk[6:]) - 1, :]
        L, P, LxP, LxL, PxP = [], [], [], [], []
        for idx in idx_keep:
            if idx >= 0 and idx < np.sum(idxDes[0:1]):
                L.append(idx)
            elif idx >= np.sum(idxDes[0:1]) and idx < np.sum(idxDes[0:2]):
                P.append(idx)
            elif idx >= np.sum(idxDes[0:2]) and idx < np.sum(idxDes[0:3]):
                LxP.append(idx)
            elif idx >= np.sum(idxDes[0:3]) and idx < np.sum(idxDes[0:4]):
                LxL.append(idx)
            elif idx >= np.sum(idxDes[0:4]) and idx < np.sum(idxDes):
                PxP.append(idx)

        NVIP = np.array(
            [len(L),
             len(P),
             len(LxP),
             len(LxL),
             len(PxP),
             len(idx_keep)])
        NumDesVIP[int(kk[6:]) - 1, :] = NumDesVIP[int(kk[6:]) - 1, :] + NVIP

        hvip = np.array(HH)[idx_keep]
        vvip = np.array(vip)[idx_keep]
        H_VIP[kk] = hvip
        X_VIP[kk] = Xtrain[:, idx_keep]
        Y_VIP = Ytrain

        hvip = np.reshape(hvip, (len(hvip), 1))
        vvip = np.reshape(vvip, (len(vvip), 1))

        HArray[kk] = np.append(hvip, vvip, axis=1)

    return X_VIP, Y_VIP, H_VIP, HArray, NumDesVIP
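
The inner loops above implement the standard VIP (Variable Importance in Projection) score. With $p$ predictors, $h$ latent variables, x-weights $w_{jk}$, and $s_k = \mathrm{corr}(y, t_k)^2$ the variance of $y$ explained by score $t_k$, the code computes

$$\mathrm{VIP}_j = \sqrt{\frac{p \sum_{k=1}^{h} s_k\, w_{jk}^2}{\sum_{k=1}^{h} s_k}},$$

and keeps the descriptors with $\mathrm{VIP}_j \ge 1$, the conventional cutoff.
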
Exemple #37
0
#%% PCA, SVD, PLS
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA, TruncatedSVD

pca = PCA(n_components=8)
pca_feats = [3, 5, 10, 14, 18, 19, 22, 23, 25, 26, 27]

train_pca_df = pd.DataFrame([])
test_pca_df = pd.DataFrame([])
for feat in pca_feats:
    feat_label = "F" + str(feat)
    train_pca_df[feat_label] = train_features[feat_label]
    test_pca_df[feat_label] = test_features[feat_label]

pls = PLSRegression(n_components=8)  # This works good for the log reg model
pls.fit(train_pca_df, train_y)
train_feats_pls = pd.DataFrame(pls.transform(train_pca_df),
                               index=train_features.index)
test_feats_pls = pd.DataFrame(pls.transform(test_pca_df),
                              index=test_features.index)

#%% Replace pca feats with new feats
for feat in pca_feats:
    feat_label = "F" + str(feat)
    train_features = train_features.drop([feat_label], axis=1)
    test_features = test_features.drop([feat_label], axis=1)
train_features = pd.concat([train_features, train_feats_pls], axis=1)
test_features = pd.concat([test_features, test_feats_pls], axis=1)

#%% Logistic Regression on the initial features
#correct not accurate
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.svm import SVC
import numpy as np
import pandas as pd
from sklearn.cross_decomposition import PLSRegression
from sklearn.cross_decomposition import PLSCanonical
df=pd.read_csv('newdata.csv')
x=df.drop(['tag'],axis=1)
y=df.drop(['kx','ky','kz','wa','wb','wc','wd','we','wf'],axis=1)
X_train , X_test , Y_train , Y_test = train_test_split(x,y , random_state=5)

plsr=PLSRegression()
plsr.fit(X_train,Y_train)

plsc=PLSCanonical()
plsc.fit(X_train,Y_train)

print (plsr.score(X_test,Y_test))
print (plsc.score(X_test,Y_test))
Exemple #39
0
    def generate(self, input=None):   
        dso = input
        
        _experiment_test = self.config.get('experiment_test')
        _experiment_control = self.config.get('experiment_control')
                
        data = dso.data
        
        plsr = PLSRegression(n_components=self.config.get('number_of_components'), scale=self.config.get('autoscale')) #, algorithm=self.config.get('algorithm'))
        Y = np.array([0 if c == _experiment_control else 1 for c in dso.classes[0] ])

        plsr.fit(data, Y) # Transpose it, as vars need to be along the top
        
        # Build scores into a dso no_of_samples x no_of_principal_components
        scored = DataSet(size=(len(plsr.x_scores_),len(plsr.x_scores_[0])))  
        scored.labels[0] = input.labels[0]
        scored.classes[0] = input.classes[0]

        for n,s in enumerate(plsr.x_scores_.T):
            scored.data[:,n] = s
            scored.labels[1][n] = 'Latent Variable %d' % (n+1) #, plsr.y_weights_[0][n])
                
        # PLS-DA regions; mean +- 95% confidence in each axis for each cluster
        figure_regions = []
        cw_x = defaultdict(list)
        cw_y = defaultdict(list)
            
        for c in list(cw_x.keys()):
            # Calculate mean point
            cx = np.mean( cw_x[c] )
            cy = np.mean( cw_y[c] )
            
            # Calculate 95% CI
            rx = np.std( cw_x[c] ) *2 # 2sd = 95% #1.95 * ( / srn) # 1.95 * SEM => 95% confidence
            ry = np.std( cw_y[c] ) *2 #1.95 * ( / srn)

            figure_regions.append( 
                (c, cx, cy, rx, ry)
            )

        
            
        # Label up the top 50 (the values are retained; just for clarity)
        wmx = np.amax( np.absolute( plsr.x_weights_), axis=1 )
        dso_z = list(zip( dso.scales[1], dso.entities[1], dso.labels[1] ))
        dso_z = sorted( zip( dso_z, wmx ), key=lambda x: x[1])[-50:] # Top 50
        dso_z = [x for x, wmx in dso_z ]    

        weightsd = DataSet(size=plsr.x_weights_.T.shape)
        weightsd.data = plsr.x_weights_.T
        weightsd.scales[1] = input.scales[1]

        dso_lv = {}
        for n in range(0, plsr.x_weights_.shape[1] ):
            lvd =  DataSet( size=(1, input.shape[1] ) )
            lvd.entities[1] = input.entities[1]
            lvd.labels[1] = input.labels[1]
            lvd.scales[1] = input.scales[1]
            lvd.data = plsr.x_weights_[:,n:n+1].T
            dso_lv['lv%s' % (n+1)] = lvd
            weightsd.labels[0][n] = "Weights on LV %s" % (n+1)
            weightsd.classes[0][n] = "LV %s" % (n+1)
                    
        return dict(list({
            'dso': dso,
            'scores':scored,
            'weights':weightsd,
            #'figure_data': figure_data,
            #'figure_regions': figure_regions,
            'y_weights': plsr.y_weights_,
            'x_weights': plsr.x_weights_,
        }.items()) + list(dso_lv.items()) )
Exemple #40
0
        'param': {
            'n_estimators': range_t
        }
    },
    'SVR': {
        'name': 'SVR',
        'model': SVR(),
        'param': {
            'gamma': range_g,
            'C': range_c,
            'epsilon': range_e
        }
    },
    'PLS': {
        'name': 'PLS',
        'model': PLSRegression(),
        'param': {
            'n_components': range_p
        }
    },
    'GPR': {
        'name': 'GPR',
        'model': GaussianProcessRegressor(kernel=kernel),
        'param': {
            'n_restarts_optimizer': range_o
        }
    },
}

key = 'RF'  # 'RR' 'EN', 'LASSO', 'kNN', 'RF', 'GB', 'SVR', 'PLS', 'GPR'
name = model_param[key]['name']
Exemple #41
0
def run_full_caltarget_test(cal_spectra,
                            cal_labels,
                            cal_names,
                            transformed_test,
                            pls_comps=[10]):
    samples, comps = ct.load_data(norm=3, masked=True)
    org_samples = np.copy(samples)
    org_comps = np.copy(comps)

    elements = [
        'SiO2', 'TiO2', 'Al2O3', 'FeOT', 'MnO', 'MgO', 'CaO', 'Na2O', 'K2O'
    ]
    for n_comps in pls_comps:
        mars_preds = []
        ct_preds = []
        gt_preds = []
        for e, elem in enumerate(elements):
            if verbose:
                print '-----------------------'
                print elem
            for transformer in transformed_test.keys():
                print transformer
                targets = transformed_test[transformer]
                for t, target in enumerate(targets):
                    # Remove caltargets
                    if (comps['Name'] == cal_names[t]).any():
                        ind = np.argwhere(comps['Name'] == cal_names[t])[0, 0]
                        comps = np.delete(org_comps, ind, 0)
                        samples = np.delete(org_samples, ind, 0)
                    model = PLSRegression(n_components=n_comps, scale=False)
                    model.fit(samples, comps[elem])
                    lab_pred = model.predict(cal_spectra[0][t][None])
                    mars_pred = model.predict(cal_spectra[1][t][None])
                    trans_pred = model.predict(target)
                    gt = cal_labels[t, e]
                    score = (norm(mars_pred - gt, ord=1) -
                             norm(trans_pred - gt, ord=1))
                    mars_preds.append(mars_pred[0][0])
                    ct_preds.append(trans_pred[0][0])
                    gt_preds.append(gt)
                    if verbose:
                        print cal_names[t]
                        print 'Ground truth: %.4f' % gt
                        print 'Lab target: %.4f' % lab_pred
                        print 'Mars target: %.4f' % mars_pred
                        print 'Transformed Mars: %.4f' % trans_pred
                        print 'Score: %.4f' % score
                        print
        pred_shape = (len(elements), len(targets))
        ct_preds = np.array(ct_preds).reshape(pred_shape)
        gt_preds = np.array(gt_preds).reshape(pred_shape)
        mars_preds = np.array(mars_preds).reshape(pred_shape)
        print '-----------------------'
        print "Element\tMars\t\tCalTran\t\t%Gain/lost"
        for i, e in enumerate(elements):
            mars_rmsep = rmse(gt_preds[i, :], mars_preds[i, :])
            ct_rmsep = rmse(gt_preds[i, :], ct_preds[i, :])
            print e,
            print "\t%f" % round(mars_rmsep, 4),
            print "\t%f" % round(ct_rmsep, 4),
            print "\t%f" % round((mars_rmsep - ct_rmsep) * 100 / mars_rmsep, 4)
        print '-----------------------'
        print "Sample\tMars\t\tCalTran\t\t%Gain/lost"
        for i, n in enumerate(cal_names):
            mars_rmsep = rmse(gt_preds[:, i], mars_preds[:, i])
            ct_rmsep = rmse(gt_preds[:, i], ct_preds[:, i])
            print n,
            print "\t%f" % round(mars_rmsep, 4),
            print "\t%f" % round(ct_rmsep, 4),
            print "\t%f" % round((mars_rmsep - ct_rmsep) * 100 / mars_rmsep, 4)
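
run_full_caltarget_test relies on an rmse helper defined elsewhere in the source; a minimal version consistent with how it is called above:

import numpy as np

def rmse(y_true, y_pred):
    # root-mean-square error over corresponding entries
    return np.sqrt(np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2))
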
Exemple #42
0
def plsvip(X, Y, V, lat_var):
    attributes = len(X[0])

    if not lat_var:
        latent_variables = attributes
    else:
        latent_variables = lat_var
		
    num_instances = len(X)	
	
    attributes_gone = []

    min_att = -1	

    #start_time = time.time()
    #attr_time = time.time()
    #time_counter = 0
    while attributes>0: 
        #if (attributes +9) %10 ==0:
        #    print "total time: ", time.time() - start_time
        #    print "attr time: ", time.time() - attr_time
        #    attr_time = time.time()

        if (latent_variables == 0) or (latent_variables > attributes):	
            latent_variables = attributes	

        lv_best = best_latent_variable(X, Y, latent_variables, num_instances)
        #print "current best lv: ", lv_best, "num. attr. ", attributes ####
		
        #fin_pls = PLSCanonical(n_components = lv_best)
        fin_pls = PLSRegression(n_components = lv_best)
        fin_pls.fit(X, Y)


        currentR2 = fin_pls.score(X, Y)  

        #######################################w
        # alternative r2
        """
        meanY4r2 = numpy.mean(Y)
        predY = fin_pls.predict(X)
        RSS = 0
        for i in range (len(Y)):
            RSS +=  numpy.power (Y[i] - predY[i], 2)
        TSS = 0
        for i in range (len(Y)):
            TSS += numpy.power (Y[i] - meanY4r2, 2)
        
        alterR2 = 1 - (RSS/TSS)
        #print currentR2, "vs", alterR2
        """
        #######################################w
        
        min_vip = 1000

        if min_att ==-1:
            attributes_gone.append(["None", currentR2, attributes, lv_best])

        ##########################################r
        #threaded version
        """ 
        myThreads = []
        VIPcurrent = []
        for i in range (0,attributes):
            myThreads.append(enthread( target = get_vip, args = (fin_pls, lv_best, i, attributes_gone, attributes  )) )
        for i in range (0,attributes):
            VIPcurrent.append(myThreads[i].get())
      
        min_vip = min(VIPcurrent)
        min_att = VIPcurrent.index(min_vip)
        """ 
        # Working version
        #"""
        for i in range (0,attributes):
            VIPcurrent = get_vip (fin_pls, lv_best, i, attributes_gone, attributes  )
            if VIPcurrent< min_vip:
                min_vip = VIPcurrent
                min_att = i
        #"""
        ##########################################r
        if min_att >-1:
            attributes_gone.append([V[min_att], currentR2, attributes, lv_best]) ####### CURRENT : to BE popped, NOT already popped
        V.pop(min_att)

        for i in range (num_instances):
            X[i].pop(min_att)

        attributes -= 1		
    #print attributes_gone ####
    #time_counter +=1
    return attributes_gone
Exemple #43
0
    n_components = 75  # X_train[0].shape[1]

    for vid, Xt, yt in zip(subjId_val, X_val, y_val):
        levelOneTest = []
        levelOneTrain = []
        X_levelOne = []
        y_levelOne = []
        level0Classifier = []
        for tid, Xp, yp in zip(subjId_train, X_train, y_train):
            print "Predicting subject ", vid, "from subject ", tid
            y0 = np.zeros(yp.shape)
            y1 = np.ones(Xt.shape[0])
            X = np.vstack([Xp, Xt])
            yd = np.concatenate([y0, y1])

            pls = PLSRegression(n_components)
            Xp_t, Xp_v, yp_t, yp_v = tts(Xp.copy(), yp.copy(), train_size=0.9)
            yp_t = yp_t.astype(bool)
            yp_t_not = np.vstack((yp_t, ~yp_t)).T
            # print "yp_t_not ", yp_t_not.shape
            pls.fit(Xp_t, yp_t_not.astype(int))
            yp_new = pls.predict(Xp_t, copy=True)
            yp_pred = (yp_new[:, 0] > yp_new[:, 1]).astype(int)
            yp_t = yp_t.astype(int)
            # print y_new, y_pred, y_t
            error = ((yp_t - yp_pred) ** 2).sum()
            print "PLS Training error ", float(error) / yp_t.shape[0]
            yp_new = pls.predict(Xp_v, copy=True)
            yp_pred = (yp_new[:, 0] > yp_new[:, 1]).astype(int)
            # print y_new, y_pred, y_v
            # print ((y_v - y_pred) ** 2).sum(), y_v.shape[0]
Exemple #44
0
plt.xticks(())
plt.yticks(())
plt.show()

# #############################################################################
# PLS regression, with multivariate response, a.k.a. PLS2

n = 1000
q = 3
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
B = np.array([[1, 2] + [0] * (p - 2)] * q).T
# each Yj = 1*X1 + 2*X2 + noize
Y = np.dot(X, B) + np.random.normal(size=n * q).reshape((n, q)) + 5

pls2 = PLSRegression(n_components=3)
pls2.fit(X, Y)
print("True B (such that: Y = XB + Err)")
print(B)
# compare pls2.coef_ with B
print("Estimated B")
print(np.round(pls2.coef_, 1))
pls2.predict(X)

# PLS regression, with univariate response, a.k.a. PLS1

n = 1000
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
y = X[:, 0] + 2 * X[:, 1] + np.random.normal(size=n * 1) + 5
pls1 = PLSRegression(n_components=3)
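
The snippet breaks off after constructing pls1; the scikit-learn documentation example it follows continues by fitting the univariate model and inspecting the estimated coefficients (reconstructed here for completeness):

pls1.fit(X, y)
# note that the number of components exceeds 1 (the dimension of y)
print("Estimated betas")
print(np.round(pls1.coef_, 1))
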
Exemple #45
0
def main():
    args = parser.parse_args()

    # Model selection and input parameters
    model_name = {
        1: 'PLSR',
        2: 'LS-SVR',
        3: 'GPR',
        4: 'FCN',
        5: 'LSTM',
        6: 'GCN',
        7: 'MC-GCN',
        8: 'GC-LSTM'
    }
    print(model_name)
    model_select = list(input('Select models:'))

    # Initialize results
    results = {
        'adj': [],
        'r2': [],
        'rmse': [],
        'loss_hist': [],
        'prediction': []
    }
    # os.mkdir('Results')
    f = open('Results/params.txt', 'w+')
    f.write('Parameters setting:\n{}\n\n'.format(args.__dict__))

    # Load the data
    data = pd.read_excel('8号机磨煤机C_正常.xlsx',
                         index_col=0,
                         header=1,
                         nrows=args.length + 5001)
    data = data.iloc[5001:, :]

    # Split the data
    predict_variable = [3, 12, 15, 20, 23]
    y = data.iloc[:, predict_variable]
    X = data.drop(columns=y.columns)
    X_train, y_train = X.iloc[:int(args.length * args.train_size
                                   )], y.iloc[:int(args.length *
                                                   args.train_size)]
    X_test, y_test = X.iloc[int(args.length * args.train_size
                                ):], y.iloc[int(args.length *
                                                args.train_size):]

    # Export the data
    # X_train.to_csv('Results/X_train.csv', header=False, index=False)
    # X_test.to_csv('Results/X_test.csv', header=False, index=False)
    # y_train.to_csv('Results/y_train.csv', header=False, index=False)
    # y_test.to_csv('Results/y_test.csv', header=False, index=False)

    # Set random seeds
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Run multiple experiments
    for exp in range(args.n_exp):
        print('=====Experiment({}/{})====='.format(exp + 1, args.n_exp))
        f.write('=====Experiment({}/{})=====\n'.format(exp + 1, args.n_exp))
        results['adj'].append({})
        results['r2'].append({})
        results['rmse'].append({})
        results['loss_hist'].append({})
        results['prediction'].append({})

        # PLSR
        if '1' in model_select:
            flag = 1
            print('====={}====='.format(model_name[flag]))
            f.write('====={}=====\n'.format(model_name[flag]))

            # Train & test
            t1 = time.time()
            reg = PLSRegression(args.n_components).fit(X_train, y_train)
            t2 = time.time()
            y_pred = reg.predict(X_test)
            t3 = time.time()
            y_fit = reg.predict(X_train)
            print(reg.get_params())
            print('Time:\nFit: {:.3f}s Pred: {:.3f}s'.format(t2 - t1, t3 - t2))
            print('R2:\nFit: {} Pred: {}'.format(
                r2_score(y_train, y_fit, multioutput='raw_values'),
                r2_score(y_test, y_pred, multioutput='raw_values')))

            # Write to file
            f.write(str(reg.get_params()) + '\n')
            f.write('Time:\nFit: {:.3f}s Pred: {:.3f}s\n'.format(
                t2 - t1, t3 - t2))
            f.write('R2:\nFit: {} Pred: {}\n'.format(
                r2_score(y_train, y_fit, multioutput='raw_values'),
                r2_score(y_test, y_pred, multioutput='raw_values')))

            # Save results and model
            index = r2_rmse(y_test, y_pred, y.columns, f)
            results['r2'][-1].update({model_name[flag]: index[0]})
            results['rmse'][-1].update({model_name[flag]: index[1]})
            results['prediction'][-1].update({model_name[flag]: y_pred})
            joblib.dump(
                reg, 'Results/{}-{}.model'.format(model_name[flag], exp + 1))

        # LS-SVR
        if '2' in model_select:
            flag = 2
            print('====={}====='.format(model_name[flag]))
            f.write('====={}=====\n'.format(model_name[flag]))

            # Train & test
            t1 = time.time()
            reg = LssvrModel(args.c, args.sigma).fit(X_train, y_train)
            t2 = time.time()
            y_pred = reg.predict(X_test)
            t3 = time.time()
            y_fit = reg.predict(X_train)
            print(reg.get_params())
            print('Time:\nFit: {:.3f}s Pred: {:.3f}s'.format(t2 - t1, t3 - t2))
            print('R2:\nFit: {} Pred: {}'.format(
                r2_score(y_train, y_fit, multioutput='raw_values'),
                r2_score(y_test, y_pred, multioutput='raw_values')))

            # Write to file
            f.write(str(reg.get_params()) + '\n')
            f.write('Time:\nFit: {:.3f}s Pred: {:.3f}s\n'.format(
                t2 - t1, t3 - t2))
            f.write('R2:\nFit: {} Pred: {}\n'.format(
                r2_score(y_train, y_fit, multioutput='raw_values'),
                r2_score(y_test, y_pred, multioutput='raw_values')))

            # Save results and model
            index = r2_rmse(y_test, y_pred, y.columns, f)
            results['r2'][-1].update({model_name[flag]: index[0]})
            results['rmse'][-1].update({model_name[flag]: index[1]})
            results['prediction'][-1].update({model_name[flag]: y_pred})
            joblib.dump(
                reg, 'Results/{}-{}.model'.format(model_name[flag], exp + 1))

        # GPR
        if '3' in model_select:
            flag = 3
            print('====={}====='.format(model_name[flag]))
            f.write('====={}=====\n'.format(model_name[flag]))

            # Train & test
            t1 = time.time()
            kernel = DotProduct() * RBF(args.length_scale,
                                        (args.length_scale, args.length_scale))
            reg = GaussianProcessRegressor(kernel=kernel,
                                           alpha=args.alpha).fit(
                                               X_train, y_train)
            t2 = time.time()
            y_pred = reg.predict(X_test)
            t3 = time.time()
            y_fit = reg.predict(X_train)
            print(reg.get_params())
            print('Time:\nFit: {:.3f}s Pred: {:.3f}s'.format(t2 - t1, t3 - t2))
            print('R2:\nFit: {} Pred: {}'.format(
                r2_score(y_train, y_fit, multioutput='raw_values'),
                r2_score(y_test, y_pred, multioutput='raw_values')))

            # Write to file
            f.write(str(reg.get_params()) + '\n')
            f.write('Time:\nFit: {:.3f}s Pred: {:.3f}s\n'.format(
                t2 - t1, t3 - t2))
            f.write('R2:\nFit: {} Pred: {}\n'.format(
                r2_score(y_train, y_fit, multioutput='raw_values'),
                r2_score(y_test, y_pred, multioutput='raw_values')))

            # Save results and model
            index = r2_rmse(y_test, y_pred, y.columns, f)
            results['r2'][-1].update({model_name[flag]: index[0]})
            results['rmse'][-1].update({model_name[flag]: index[1]})
            results['prediction'][-1].update({model_name[flag]: y_pred})
            joblib.dump(
                reg, 'Results/{}-{}.model'.format(model_name[flag], exp + 1))

        # FCN
        if '4' in model_select:
            flag = 4
            print('====={}====='.format(model_name[flag]))
            f.write('====={}=====\n'.format(model_name[flag]))

            # Train & test
            t1 = time.time()
            reg = FcnModel(X_train.shape[1], y_train.shape[1],
                           (1024, 256, 256, 256), args.n_epoch,
                           args.batch_size, args.lr, args.weight_decay,
                           args.step_size, args.gamma).fit(X_train, y_train)
            t2 = time.time()
            y_pred = reg.predict(X_test)
            t3 = time.time()
            y_fit = reg.predict(X_train)
            print(reg.get_params())
            print('Time:\nFit: {:.3f}s Pred: {:.3f}s'.format(t2 - t1, t3 - t2))
            print('R2:\nFit: {} Pred: {}'.format(
                r2_score(y_train, y_fit, multioutput='raw_values'),
                r2_score(y_test, y_pred, multioutput='raw_values')))

            # Write to file
            f.write(str(reg.get_params()) + '\n')
            f.write('Time:\nFit: {:.3f}s Pred: {:.3f}s\n'.format(
                t2 - t1, t3 - t2))
            f.write('R2:\nFit: {} Pred: {}\n'.format(
                r2_score(y_train, y_fit, multioutput='raw_values'),
                r2_score(y_test, y_pred, multioutput='raw_values')))

            # Save results and model
            index = r2_rmse(y_test, y_pred, y.columns, f)
            results['r2'][-1].update({model_name[flag]: index[0]})
            results['rmse'][-1].update({model_name[flag]: index[1]})
            results['prediction'][-1].update({model_name[flag]: y_pred})
            results['loss_hist'][-1].update({model_name[flag]: reg.loss_hist})
            joblib.dump(
                reg, 'Results/{}-{}.model'.format(model_name[flag], exp + 1))

        # LSTM
        if '5' in model_select:
            flag = 5
            print('====={}====='.format(model_name[flag]))
            f.write('====={}=====\n'.format(model_name[flag]))

            # Train & test
            t1 = time.time()
            reg = LstmModel(X_train.shape[1], y_train.shape[1], (1024, ),
                            (256, 256, 256), args.seq_len, args.n_epoch,
                            args.batch_size, args.lr, args.weight_decay,
                            args.step_size, args.gamma).fit(X_train, y_train)
            t2 = time.time()
            y_pred = reg.predict(X_test)
            t3 = time.time()
            y_fit = reg.predict(X_train)
            print(reg.get_params())
            print('Time:\nFit: {:.3f}s Pred: {:.3f}s'.format(t2 - t1, t3 - t2))
            print('R2:\nFit: {} Pred: {}'.format(
                r2_score(y_train, y_fit, multioutput='raw_values'),
                r2_score(y_test, y_pred, multioutput='raw_values')))

            # Write to file
            f.write(str(reg.get_params()) + '\n')
            f.write('Time:\nFit: {:.3f}s Pred: {:.3f}s\n'.format(
                t2 - t1, t3 - t2))
            f.write('R2:\nFit: {} Pred: {}\n'.format(
                r2_score(y_train, y_fit, multioutput='raw_values'),
                r2_score(y_test, y_pred, multioutput='raw_values')))

            # Save results and model
            index = r2_rmse(y_test, y_pred, y.columns, f)
            results['r2'][-1].update({model_name[flag]: index[0]})
            results['rmse'][-1].update({model_name[flag]: index[1]})
            results['prediction'][-1].update({model_name[flag]: y_pred})
            results['loss_hist'][-1].update({model_name[flag]: reg.loss_hist})
            joblib.dump(
                reg, 'Results/{}-{}.model'.format(model_name[flag], exp + 1))

        # GCN
        if '6' in model_select:
            flag = 6
            print('====={}====='.format(model_name[flag]))
            f.write('====={}=====\n'.format(model_name[flag]))

            # Train & test
            t1 = time.time()
            reg = GcnModel(X_train.shape[1], y_train.shape[1], (1024, ),
                           (256, 256, 256), args.graph_reg, args.self_con,
                           args.n_epoch, args.batch_size, args.lr,
                           args.weight_decay, args.step_size,
                           args.gamma).fit(X_train, y_train)
            t2 = time.time()
            y_pred = reg.predict(X_test)
            t3 = time.time()
            y_fit = reg.predict(X_train)
            print(reg.get_params())
            print('Time:\nFit: {:.3f}s Pred: {:.3f}s'.format(t2 - t1, t3 - t2))
            print('R2:\nFit: {} Pred: {}'.format(
                r2_score(y_train, y_fit, multioutput='raw_values'),
                r2_score(y_test, y_pred, multioutput='raw_values')))

            # Write to file
            f.write(str(reg.get_params()) + '\n')
            f.write('Time:\nFit: {:.3f}s Pred: {:.3f}s\n'.format(
                t2 - t1, t3 - t2))
            f.write('R2:\nFit: {} Pred: {}\n'.format(
                r2_score(y_train, y_fit, multioutput='raw_values'),
                r2_score(y_test, y_pred, multioutput='raw_values')))

            # Save results and model
            index = r2_rmse(y_test, y_pred, y.columns, f)
            results['r2'][-1].update({model_name[flag]: index[0]})
            results['rmse'][-1].update({model_name[flag]: index[1]})
            results['prediction'][-1].update({model_name[flag]: y_pred})
            results['loss_hist'][-1].update({model_name[flag]: reg.loss_hist})
            joblib.dump(
                reg, 'Results/{}-{}.model'.format(model_name[flag], exp + 1))

        # MC-GCN
        if '7' in model_select:
            flag = 7
            print('====={}====='.format(model_name[flag]))
            f.write('====={}=====\n'.format(model_name[flag]))

            # Train & test
            t1 = time.time()
            reg = McgcnModel(X_train.shape[1], (1024, ), (256, ), (256, 256),
                             y_train.shape[1], args.graph_reg, args.self_con,
                             args.n_epoch, args.batch_size, args.lr,
                             args.weight_decay, args.step_size,
                             args.gamma).fit(X_train, y_train)
            t2 = time.time()
            y_pred = reg.predict(X_test)
            t3 = time.time()
            y_fit = reg.predict(X_train)
            print(reg.get_params())
            print('Time:\nFit: {:.3f}s Pred: {:.3f}s'.format(t2 - t1, t3 - t2))
            print('R2:\nFit: {} Pred: {}'.format(
                r2_score(y_train, y_fit, multioutput='raw_values'),
                r2_score(y_test, y_pred, multioutput='raw_values')))

            # Write to file
            f.write(str(reg.get_params()) + '\n')
            f.write('Time:\nFit: {:.3f}s Pred: {:.3f}s\n'.format(
                t2 - t1, t3 - t2))
            f.write('R2:\nFit: {} Pred: {}\n'.format(
                r2_score(y_train, y_fit, multioutput='raw_values'),
                r2_score(y_test, y_pred, multioutput='raw_values')))

            # Save results and model
            index = r2_rmse(y_test, y_pred, y.columns, f)
            results['adj'][-1].update({model_name[flag]: reg.adj})
            results['r2'][-1].update({model_name[flag]: index[0]})
            results['rmse'][-1].update({model_name[flag]: index[1]})
            results['prediction'][-1].update({model_name[flag]: y_pred})
            results['loss_hist'][-1].update({model_name[flag]: reg.loss_hist})
            joblib.dump(
                reg, 'Results/{}-{}.model'.format(model_name[flag], exp + 1))

        # GC-LSTM
        if '8' in model_select:
            flag = 8
            print('====={}====='.format(model_name[flag]))
            f.write('====={}=====\n'.format(model_name[flag]))

            # Train & test
            t1 = time.time()
            reg = GclstmModel(X_train.shape[1], (1024, ), (256, ), (256, 256),
                              y_train.shape[1], args.seq_len, args.graph_reg,
                              args.self_con, args.n_epoch, args.batch_size,
                              args.lr, args.weight_decay, args.step_size,
                              args.gamma).fit(X_train, y_train)
            t2 = time.time()
            y_pred = reg.predict(X_test)
            t3 = time.time()
            y_fit = reg.predict(X_train)
            print(reg.get_params())
            print('Time:\nFit: {:.3f}s Pred: {:.3f}s'.format(t2 - t1, t3 - t2))
            print('R2:\nFit: {} Pred: {}'.format(
                r2_score(y_train, y_fit, multioutput='raw_values'),
                r2_score(y_test, y_pred, multioutput='raw_values')))

            # Write to file
            f.write(str(reg.get_params()) + '\n')
            f.write('Time:\nFit: {:.3f}s Pred: {:.3f}s\n'.format(
                t2 - t1, t3 - t2))
            f.write('R2:\nFit: {} Pred: {}\n'.format(
                r2_score(y_train, y_fit, multioutput='raw_values'),
                r2_score(y_test, y_pred, multioutput='raw_values')))

            # Save results and model
            index = r2_rmse(y_test, y_pred, y.columns, f)
            results['adj'][-1].update({model_name[flag]: reg.adj})
            results['r2'][-1].update({model_name[flag]: index[0]})
            results['rmse'][-1].update({model_name[flag]: index[1]})
            results['prediction'][-1].update({model_name[flag]: y_pred})
            results['loss_hist'][-1].update({model_name[flag]: reg.loss_hist})
            joblib.dump(
                reg, 'Results/{}-{}.model'.format(model_name[flag], exp + 1))

    # Save results
    np.save('Results/results.npy', results)
    f.close()
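
Each model branch in main() repeats the same fit/time/predict/report sequence. A compact helper capturing that shared pattern (a sketch using the names from the snippet; reg is any estimator with fit/predict, f is the open results file):

import time
from sklearn.metrics import r2_score

def fit_and_report(reg, X_train, y_train, X_test, y_test, f):
    # fit, time, and report one model, mirroring the repeated blocks above
    t1 = time.time()
    reg.fit(X_train, y_train)
    t2 = time.time()
    y_pred = reg.predict(X_test)
    t3 = time.time()
    y_fit = reg.predict(X_train)
    timing = 'Time:\nFit: {:.3f}s Pred: {:.3f}s'.format(t2 - t1, t3 - t2)
    scores = 'R2:\nFit: {} Pred: {}'.format(
        r2_score(y_train, y_fit, multioutput='raw_values'),
        r2_score(y_test, y_pred, multioutput='raw_values'))
    print(timing)
    print(scores)
    f.write(timing + '\n' + scores + '\n')
    return reg, y_pred
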
Exemple #46
0
    if method_name[0:3] == 'jit':
        nn_model = NearestNeighbors(metric='euclidean')  # declare the k-NN model used for sample selection

# Autoscaling
autoscaled_x_train = (x_train - x_train.mean()) / x_train.std()
autoscaled_y_train = (y_train - y_train.mean()) / y_train.std()
autoscaled_x_test = (x_test - x_train.mean()) / x_train.std()

# Hyperparameter optimization and modeling
if method_name == 'pls' or method_name == 'mwpls':
    # Optimize the number of components by cross-validation
    components = []  # empty list collecting the candidate numbers of components
    r2_in_cv_all = []  # empty list collecting the cross-validated r2 for each number of components
    for component in range(1, min(np.linalg.matrix_rank(autoscaled_x_train), max_number_of_principal_components) + 1):
        # PLS
        model = PLSRegression(n_components=component)  # declare the PLS model
        estimated_y_in_cv = pd.DataFrame(cross_val_predict(model, autoscaled_x_train, autoscaled_y_train,
                                                           cv=fold_number))  # cross-validated predictions, converted to a DataFrame
        estimated_y_in_cv = estimated_y_in_cv * y_train.std() + y_train.mean()  # undo the autoscaling
        r2_in_cv = metrics.r2_score(y_train, estimated_y_in_cv)  # compute r2
        print(component, r2_in_cv)  # print the number of components and r2
        r2_in_cv_all.append(r2_in_cv)  # append r2
        components.append(component)  # append the number of components
    optimal_component_number = components[r2_in_cv_all.index(max(r2_in_cv_all))]  # optimal number of components
    # PLS
    model = PLSRegression(n_components=optimal_component_number)  # declare the model
    model.fit(autoscaled_x_train, autoscaled_y_train)  # build the model
elif method_name == 'svr' or method_name == 'mwsvr' or method_name == 'jitsvr':
    # Optimize gamma by maximizing the variance of the Gram matrix
    variance_of_gram_matrix = list()
    for svr_gamma in svr_gammas:
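
The snippet above is cut off mid-loop; a plausible continuation of this kind of gamma search (choose the gamma that maximizes the variance of the RBF Gram matrix over the candidate svr_gammas; this is an assumption, restated in full rather than recovered from the source):

import numpy as np
from scipy.spatial.distance import cdist

variance_of_gram_matrix = []
for svr_gamma in svr_gammas:
    # RBF Gram matrix of the autoscaled training data for this gamma
    gram_matrix = np.exp(-svr_gamma * cdist(autoscaled_x_train, autoscaled_x_train, 'sqeuclidean'))
    variance_of_gram_matrix.append(gram_matrix.var(ddof=1))
optimal_svr_gamma = svr_gammas[np.argmax(variance_of_gram_matrix)]
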
Exemple #47
0
        mean_absolute_error(y_test, y_pred_test)
    ]
    output.append(temp)
output = pd.DataFrame(output,
                      columns=[
                          'alpha', 'Train_R2', 'Train_MSE', 'Train_MAE',
                          'Test_R2', 'Test_MSE', 'Test_MAE'
                      ])
output.to_csv('ElasticNet.csv', index=False)
plots(y_test, y_pred_test)

#model 2:PLS
from sklearn.cross_decomposition import PLSRegression
output = []
for i in range(1, 12, 2):
    pls = PLSRegression(n_components=i, max_iter=10000)
    pls.fit(X_train_std, y_train)
    y_pred_train = pls.predict(X_train_std)
    y_pred_test = pls.predict(X_test_std)
    temp = [
        i,
        pls.score(X_train_std, y_train),
        mean_squared_error(y_train, y_pred_train),
        mean_absolute_error(y_train, y_pred_train),
        pls.score(X_test_std, y_test),
        mean_squared_error(y_test, y_pred_test),
        mean_absolute_error(y_test, y_pred_test)
    ]
    output.append(temp)
output = pd.DataFrame(output,
                      columns=[
Exemple #48
0
def pls_ds(A, B, n_components=1):
    model = PLSRegression(n_components=n_components, scale=False).fit(B, A)
    return model.coef_, model.predict(B)
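
pls_ds above performs a PLS-flavored direct standardization, mapping instrument B's spectra onto instrument A's scale. A minimal usage sketch with synthetic data (array shapes are illustrative assumptions):

import numpy as np

A = np.random.rand(20, 100)              # master-instrument spectra
B = A + 0.01 * np.random.randn(20, 100)  # slave-instrument spectra
coef, B_on_A_scale = pls_ds(A, B, n_components=3)
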
Exemple #49
0
    for i in range(5):
        plt.plot(nComponents,plsCanScores[i,:],lw=3)

    plt.xlim(1,np.amax(nComponents))
    plt.title('PLS Canonical accuracy')
    plt.xlabel('Number of components')
    plt.ylabel('accuracy')
    plt.legend (['LR','LDA','GNB','Linear SVM','rbf SVM'],loc='lower right')
    plt.grid(True)

if (0):
    #%% PLS Regression
    nComponents = np.arange(1,nClasses+1)
    plsRegScores = np.zeros((5, len(nComponents)))
    for i,n in enumerate(nComponents):
        plsReg = PLSRegression(n_components=n)
        plsReg.fit(Xtrain,Ytrain)
        XtrainT = plsReg.transform(Xtrain)
        XtestT = plsReg.transform(Xtest)
        plsRegScores[:,i] = util.classify(XtrainT,XtestT,labelsTrain,labelsTest)

    
    plsReg = PLSRegression(n_components=2)
    plsReg.fit(Xtrain,Ytrain)
    xt = plsReg.transform(Xtrain)
    fig = plt.figure()
    util.plotData(fig,xt,labelsTrain,classColors)
    plt.title('First 2 components of projected data')
    

    #%% Plot accuracies for PLSSVD 
Exemple #50
0
from sklearn.cross_decomposition import PLSRegression

import pandas as pd
import numpy as np

_experiment_test = config['experiment_test']
_experiment_control = config['experiment_control']

plsr = PLSRegression(n_components=config['number_of_components'], scale=config['autoscale']) #, algorithm=self.config.get('algorithm'))

# We need classes to do the classification; should check and raise an error
class_idx = input_data.index.names.index('Class')
classes = list( input_data.index.levels[ class_idx ] )

Y = input_data.index.labels[ class_idx ]

plsr.fit(input_data.values, Y)

# Build scores into a dso no_of_samples x no_of_principal_components
scores = pd.DataFrame(plsr.x_scores_)  
scores.index = input_data.index

scoresl =[]
for n,s in enumerate(plsr.x_scores_.T):
    scoresl.append( 'Latent Variable %d' % (n+1) ) #, plsr.y_weights_[0][n]) 
scores.columns = scoresl
    

weights = pd.DataFrame( plsr.x_weights_.T )
weights.columns = input_data.columns
Exemple #51
0
pcr_opt.fit(college_train_x, college_train_y)
reduced_college_test_x = pcr_opt.transform(college_test_x)
reduced_college_train_x = pcr_opt.transform(college_train_x)
lrm = LinearRegression()
lrm.fit(reduced_college_train_x, college_train_y)
print "\nPCR RMSE (M = " + str(opt_m) + ")"
print rmse(lrm, reduced_college_test_x, college_test_y)

#%% PLS
from sklearn.cross_decomposition import PLSRegression

pls_components = range(1, 18)

cv_pls = np.array([])
for m in pls_components:
    pls = PLSRegression(n_components=m)
    transformed_college_train_x = pls.fit_transform(college_train_x,
                                                    college_train_y)[0]
    pls_this_rmse = rmse_cv(LinearRegression(), transformed_college_train_x,
                            college_train_y).mean()
    cv_pls = np.append(cv_pls, pls_this_rmse)

min_m = pls_components[np.argmin(cv_pls)]
cv_pls = pd.Series(cv_pls, index=pls_components)
cv_pls.plot(title="PLSRegression Cross Validation")
plt.xlabel("Number of Components (M)")
plt.ylabel("Root Mean Square Error")
if show_plots_flag:
    plt.show()
Exemple #52
0
train = pd.read_csv('train.csv', index_col='id')
targets = pd.get_dummies(train.target)
train.drop('target', axis=1, inplace=True)
train = train.apply(np.log1p)

test = pd.read_csv('test.csv', index_col='id')
test = test.apply(np.log1p)

Xt, Xv, yt, yv = train_test_split(train, targets, test_size=0.2, random_state=27)

best = 10.

for n in range(5,16):
    
    clf = PLSRegression(n_components=n)
    clf.fit(Xt,yt)
    y_pred = clf.predict(Xv)
    loss = multiclass_log_loss(np.argmax(y_pred,axis=1),y_pred)
    if loss < best:
        n_best = n
        best = loss
        postfix = '(*)'
    else:
        postfix = ''
    print ('comps: {:02d}\tLoss:{:5.4f} {}'.format(n,loss,postfix))


clf = PLSRegression(n_components=n_best)  
clf.fit(train,targets)
y_pred = clf.predict(test)
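
Raw PLSRegression outputs are not probabilities (rows can be negative and need not sum to 1), so multiclass_log_loss presumably clips or normalizes internally. If it does not, a simple row-wise normalization before scoring would be (a sketch, not from the original):

import numpy as np

def to_probs(y_pred, eps=1e-15):
    p = np.clip(y_pred, eps, None)           # floor negative/zero outputs
    return p / p.sum(axis=1, keepdims=True)  # renormalize each row
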
Exemple #53
0
class PLS_DA(object):
    '''
	'''
    def __init__(self, n_comps=3, yIsDummyMatrix=False, scaleData=False):
        '''
		data contains n_samples, n_features
		Y - response
		'''
        self.comps = n_comps
        self.yIsDummyMatrix = yIsDummyMatrix
        self.plsr = PLSRegression(n_components=n_comps, scale=scaleData)

    def fit(self, X, Y):

        if self.yIsDummyMatrix:
            self.Ym = Y
        else:
            self.Ym = self.create_dummy_y(Y)
        if self.evaluate_data(X, self.Ym):
            self.plsr.fit(X, self.Ym)

    def get_scores(self, block='x'):
        '''
		'''
        if block == 'x':
            return self.plsr.x_scores_
        elif block == 'y':
            return self.plsr.y_scores_

    def get_weights(self, block='x'):
        '''
		'''
        if block == 'x':
            return self.plsr.x_weights_

    def get_loadings(self, block='x'):

        if block == 'x':
            return self.plsr.x_loadings_

    def get_squared_r(self, X, Y):
        '''
		'''
        return self.plsr.score(X, Y)

    def get_classes(self):
        '''
        Return the class labels in dummy-column order (set by create_dummy_y).
        '''
        return self.classOrder

    def get_dummy_Y(self):
        '''
		'''
        return self.Ym

    def evaluate_data(self, X, Y):
        '''
		'''
        if X.shape[0] != Y.shape[0]:
            print("Number of rows in X does not equal number of rows in Y")
            return False
        else:
            return True

    def create_dummy_y(self, Y):
        '''
		'''
        uniqueVals = np.unique(Y)
        nClasses = uniqueVals.size
        Ydummy = np.zeros((Y.shape[0], nClasses))
        for n, target in enumerate(Y):
            col = np.where(uniqueVals == target)
            Ydummy[n, col] = 1
        self.classOrder = uniqueVals.tolist()
        return Ydummy
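
A minimal usage sketch for the PLS_DA wrapper above (synthetic two-class data; shapes and label names are illustrative):

import numpy as np

X = np.random.rand(40, 10)
Y = np.array(['healthy'] * 20 + ['disease'] * 20)

plsda = PLS_DA(n_comps=2)
plsda.fit(X, Y)                  # Y is dummy-coded internally
scores = plsda.get_scores('x')   # (40, 2) latent-variable scores
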
Exemple #54
0
    # sfs_plot(sfs.get_metric_dict(), kind='std_dev')
    return sfs_fit


def find_maxfit(data):
    max_score = 100
    print(data["training scores"])
    return max(data["test scores"]), max(data["training scores"])


if __name__ == "__main__":
    x, y = get_data.get_data("mango", "as7262", int_time=150,
                             position=2,
                             led_current="25 mA")

    pls = PLSRegression(n_components=6)
    y = y['Total Chlorophyll (µg/cm2)']
    # x, y = get_data.get_data("mango", "as7262", int_time=150,
    #                          position=2, led="b'White'",
    #                          led_current="25 mA")
    # print(x.shape)
    # # pls_screen_as726x(x, y, n_comps=10)
    # print(type(x))
    poly = PolynomialFeatures(degree=1)
    x_trans = poly.fit_transform(x)
    # pls.fit(x_trans, y)
    # y_predict = pls.predict(x_trans)
    # print(mean_absolute_error(y, y_predict))
    # ham
    # n_comps = 6
    # regr = PLSRegression(n_components=n_comps)
Exemple #55
0
class Local_FWC_Simulator:
    def __init__(self, A, d, C, seed):
        self.pls = PLSRegression(n_components=6,
                                 scale=False,
                                 max_iter=50000,
                                 copy=True)
        np.random.seed(seed)
        self.A = A
        self.d = d
        self.C = C

    def sampling_up(self):
        u1 = np.random.normal(0.4, np.sqrt(0.2))
        u2 = np.random.normal(0.6, np.sqrt(0.2))
        u = np.array([u1, u2])
        return u

    def sampling_vp(self):
        v1 = np.random.normal(1, np.sqrt(0.2))
        v2 = 2 * v1
        v3 = np.random.uniform(0.2, 1.2)
        v4 = 3 * v3
        v5 = np.random.uniform(0, 0.4)
        v6 = np.random.normal(-0.6, np.sqrt(0.2))

        v = np.array([v1, v2, v3, v4, v5, v6])
        return v

    def sampling_ep(self):
        e1 = np.random.normal(0, np.sqrt(0.1))
        e2 = np.random.normal(0, np.sqrt(0.2))
        e = np.array([e1, e2])
        return e

    def sampling(self,
                 k,
                 uk=np.array([0, 0]),
                 vp=np.array([0, 0, 0, 0, 0, 0]),
                 ep=np.array([0, 0]),
                 isInit=True):
        u1 = uk[0]
        u2 = uk[1]
        u = uk

        v1 = vp[0]
        v2 = vp[1]
        v3 = vp[2]
        v4 = vp[3]
        v5 = vp[4]
        v6 = vp[5]

        v = vp
        e = ep

        if isInit:
            k1 = k % 100
            k2 = k % 200
            e = np.array([0, 0])
        else:
            k1 = k % 100  # entity #1 maintenance event when n = 100
            k2 = k % 200  # entity #1 maintenance event when n = 200
        eta_k = np.array([[k1], [k2]])

        psi = np.array([u1, u2, v1, v2, v3, v4, v5, v6, k1, k2])
        y = u.dot(self.A) + v.dot(self.C) + np.sum(eta_k * self.d, axis=0) + e
        rows = np.r_[psi, y]
        idx_end = len(rows)
        idx_start = idx_end - 2
        return idx_start, idx_end, rows

    def pls_update(self, V, Y):
        self.pls.fit(V, Y)
        return self.pls

    def setDoE_Mean(self, DoE_Mean):
        self.DoE_Mean = DoE_Mean

    def getDoE_Mean(self):
        return self.DoE_Mean

    def setPlsWindow(self, PlsWindow):
        self.PlsWindow = PlsWindow

    def getPlsWindow(self):
        return self.PlsWindow

    def plt_show1(self, n, y_act, y_prd):
        plt.plot(np.arange(n), y_act, 'rx--', y_prd, 'bx--', lw=2, ms=5, mew=2)
        plt.xticks(np.arange(0, n + 1, 50))
        plt.xlabel('Run No.')
        plt.ylabel('Actual and Predicted Response (y1)')

    def plt_show2(self, n, y1, y2):
        plt.figure()
        plt.plot(np.arange(n), y1, 'bx-', y2, 'gx--', lw=2, ms=5, mew=2)
        plt.xticks(np.arange(0, n + 1, 5))
        plt.yticks(np.arange(-1.2, 1.3, 0.2))
        plt.xlabel('Metrology Run No.(z)')
        plt.ylabel('e(z)')

    def DoE_Run(self, Z, M):
        N = Z * M
        DoE_Queue = []

        for k in range(1, N + 1):  # k = 1, 2, ..., N
            idx_start, idx_end, result = self.sampling(k, self.sampling_up(),
                                                       self.sampling_vp(),
                                                       self.sampling_ep(),
                                                       True)
            DoE_Queue.append(result)

        npPlsWindow = np.array(DoE_Queue)
        plsWindow = [row for row in npPlsWindow]

        npDoE_Queue = np.array(plsWindow)
        DoE_Mean = np.mean(npDoE_Queue, axis=0)

        plsModelData = npDoE_Queue - DoE_Mean
        V0 = plsModelData[:, 0:idx_start]
        Y0 = plsModelData[:, idx_start:idx_end]

        pls = self.pls_update(V0, Y0)

        print('Init VM Coefficients: \n', pls.coef_)

        y_prd = pls.predict(V0) + DoE_Mean[idx_start:idx_end]
        y_act = npDoE_Queue[:, idx_start:idx_end]

        print("Init DoE VM Mean squared error: %.3f" %
              metrics.mean_squared_error(y_act[:, 0:1], y_prd[:, 0:1]))
        print("Init DoE VM r2 score: %.3f" %
              metrics.r2_score(y_act[:, 0:1], y_prd[:, 0:1]))

        self.setDoE_Mean(DoE_Mean)
        self.setPlsWindow(plsWindow)
        # self.plt_show1(N, y_act[:,0:1], y_prd[:,0:1])

    def VM_Run(self, lamda_PLS, Z, M):
        N = Z * M

        ## V0, Y0 Mean Center
        DoE_Mean = self.getDoE_Mean()
        idx_end = len(DoE_Mean)
        idx_start = idx_end - 2
        meanVz = DoE_Mean[0:idx_start]
        meanYz = DoE_Mean[idx_start:idx_end]

        M_Queue = []
        ez_Queue = []
        ez_Queue.append([0, 0])
        y_act = []
        y_prd = []

        plsWindow = self.getPlsWindow()

        for z in np.arange(0, Z):
            for k in np.arange(z * M + 1, ((z + 1) * M) + 1):
                idx_start, idx_end, result = self.sampling(
                    k, self.sampling_up(), self.sampling_vp(),
                    self.sampling_ep(), False)
                psiK = result[0:idx_start]
                psiKStar = psiK - meanVz
                y_predK = self.pls.predict(psiKStar.reshape(
                    1, idx_start)) + meanYz
                rows = np.r_[result, y_predK.reshape(2, )]
                M_Queue.append(rows)

                y_prd.append(rows[idx_end:idx_end + 2])
                y_act.append(rows[idx_start:idx_end])

            del plsWindow[0:M]

            ez = M_Queue[M - 1][idx_start:idx_end] - M_Queue[
                M - 1][idx_end:idx_end + 2]
            print("ez : ", ez)
            ez_Queue.append(ez)

            if z == 0:
                ez = np.array([0, 0])  # no metrology feedback yet in the first window
            npM_Queue = np.array(M_Queue)
            npM_Queue[0:M - 1,
                      0:idx_start] = lamda_PLS * npM_Queue[0:M - 1,
                                                           0:idx_start]
            npM_Queue[0:M - 1, idx_start:idx_end] = lamda_PLS * (
                npM_Queue[0:M - 1, idx_end:idx_end + 2] + 0.5 * ez)
            npM_Queue = npM_Queue[:, 0:idx_end]

            for i in range(M):
                plsWindow.append(npM_Queue[i])

            M_Mean = np.mean(plsWindow, axis=0)
            meanVz = M_Mean[0:idx_start]
            meanYz = M_Mean[idx_start:idx_end]

            plsModelData = plsWindow - M_Mean
            V = plsModelData[:, 0:idx_start]
            Y = plsModelData[:, idx_start:idx_end]

            self.pls_update(V, Y)

            del M_Queue[0:M]

        y_act = np.array(y_act)
        y_prd = np.array(y_prd)

        self.plt_show1(N, y_act[:, 0:1], y_prd[:, 0:1])

        print("VM Mean squared error: %.3f" %
              metrics.mean_squared_error(y_act[:, 0:1], y_prd[:, 0:1]))
        print("VM r2 score: %.3f" %
              metrics.r2_score(y_act[:, 0:1], y_prd[:, 0:1]))
        ez_run = np.array(ez_Queue)
        self.plt_show2(Z + 1, ez_run[:, 0:1], ez_run[:, 1:2])
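The two methods above cover the whole simulation cycle, so a short driver is enough to exercise the class. The matrices below are assumptions chosen only to satisfy the shapes used in sampling() (A: 2x2, C: 6x2, d: 2x2); they are not values from the original experiment:

import numpy as np
import matplotlib.pyplot as plt

A = np.array([[1.0, 0.2],
              [0.3, 1.0]])
C = np.array([[0.5, -0.2],
              [0.1, 0.4],
              [-0.3, 0.2],
              [0.2, 0.1],
              [0.4, -0.1],
              [-0.2, 0.3]])
d = np.array([[0.01, 0.02],
              [0.005, 0.01]])

sim = Local_FWC_Simulator(A, d, C, seed=42)
sim.DoE_Run(Z=10, M=10)                # fit the initial PLS model on 100 DoE runs
sim.VM_Run(lamda_PLS=0.1, Z=10, M=10)  # moving-window VM with relevance weight 0.1
plt.show()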
for i in range(pca_wild_b.n_components):
    plt.figure()
    plt.bar(np.arange(np.shape(X_train_prepro)[1]), pca_wild_b.components_[i])
    if i == 0:
        plt.ylabel('1st component')
    elif i == 1:
        plt.ylabel('2nd component')
    else:
        plt.ylabel('3rd component')
    axis_c = plt.gca()
    axis_c.set_xticklabels(wild_boar_ddbb['header'][3:], fontsize=7)
    axis_c.set_xticks(axis_c.get_xticks() + 0.5)
    print("inside the loop")

# Select the number of components using CV (see the sketch after this block)
#%%
## PLSR
pls_wild_b = PLSRegression(n_components=3)
pls_wild_b.fit(X_train_prepro, Y_train)
X_train_pls_proj = pls_wild_b.transform(X_train_prepro)
print("loadings")

for i in range(pls_wild_b.n_components):
    plt.figure()
    plt.bar(np.arange(np.shape(X_train_prepro)[1]), pls_wild_b.x_loadings_[:, i])
    if i == 0:
        plt.ylabel('PLS 1st component')
    elif i == 1:
        plt.ylabel('PLS 2nd component')
    else:
        plt.ylabel('PLS 3rd component')
    axis_c = plt.gca()
    axis_c.set_xticklabels(wild_boar_ddbb['header'][3:], fontsize=7)
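The comment above ("Select the number of components using CV") is not implemented in this snippet; a minimal sketch of how it could be done, assuming X_train_prepro and Y_train as above and an arbitrary upper bound of 10 components:

from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import KFold, cross_val_score

def select_n_components(X, Y, max_comps=10, folds=5):
    """Return the component count with the best mean cross-validated R^2."""
    cv = KFold(n_splits=folds, shuffle=True, random_state=0)
    scores = [cross_val_score(PLSRegression(n_components=n), X, Y, cv=cv).mean()
              for n in range(1, max_comps + 1)]
    return scores.index(max(scores)) + 1

# n_best = select_n_components(X_train_prepro, Y_train)
# pls_wild_b = PLSRegression(n_components=n_best)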
class Global_FWC_P3_Simulator:

    def __init__(self, Tgt, A, d, C, F, seed):
        self.pls = PLSRegression(n_components=6, scale=False, max_iter=50000, copy=True)
        np.random.seed(seed)
        self.Tgt = Tgt
        self.A = A
        self.d = d
        self.C = C
        self.F = F

    def sampling_vp(self):
        v1 = np.random.normal(-0.4, np.sqrt(0.2))
        v2 = 2 * v1
        v3 = np.random.uniform(0.2, 0.6)
        v4 = 3 * v3
        v5 = np.random.uniform(0, 0.4)

        v = np.array([v1, v2, v3, v4, v5])
        return v

    def sampling_ep(self):
        e1 = np.random.normal(0, np.sqrt(0.05))
        e2 = np.random.normal(0, np.sqrt(0.1))
        e = np.array([e1, e2])
        return e

    def sampling(self, k, uk=np.array([0, 0]), vp=np.array([0, 0, 0, 0, 0]),
                 ep=np.array([0, 0]), fp=np.array([0, 0]), isInit=True):
        u1 = uk[0]
        u2 = uk[1]
        u = uk

        v1 = vp[0]
        v2 = vp[1]
        v3 = vp[2]
        v4 = vp[3]
        v5 = vp[4]

        v = vp
        e = ep

        k1 = k
        k2 = k
        eta_k = np.array([[k1], [k2]])

        psi = np.array([u1, u2, v1, v2, v3, v4, v5, k1, k2])

        if fp is not None:
            psi = np.r_[psi, fp]
            f = fp
            y = u.dot(self.A) + v.dot(self.C) + np.sum(eta_k * self.d, axis=0) + f.dot(self.F) + e
        else:
            y = u.dot(self.A) + v.dot(self.C) + np.sum(eta_k * self.d, axis=0) + e

        rows = np.r_[psi, y]

        idx_end = len(rows)
        idx_start = idx_end - 2
        return idx_start, idx_end, rows  # start/end indices of the y block, plus the full sample row

    def pls_update(self, V, Y):
        self.pls.fit(V, Y)
        return self.pls

    def setDoE_Mean(self, DoE_Mean):
        self.DoE_Mean = DoE_Mean

    def getDoE_Mean(self):
        return self.DoE_Mean

    def setPlsWindow(self, PlsWindow):
        self.PlsWindow = PlsWindow

    def getPlsWindow(self):
        return self.PlsWindow

    def plt_show1(self, n, y_act):
        plt.figure()
        plt.plot(np.arange(1, n + 1), y_act, 'bx--', lw=2, ms=10, mew=2)
        plt.xticks(np.arange(0, n + 1, 20))
        plt.xlabel('Run No.')
        plt.ylabel('Actual Response (y2)')

    def plt_show2(self, n, y_act):
        plt.plot(np.arange(1, n + 1), y_act, 'ro-', lw=2, ms=5, mew=2)
        plt.xticks(np.arange(0, n + 1, 20))
        plt.xlabel('Run No.')
        plt.ylabel('Actual Response (y2)')

    def DoE_Run(self, lamda_PLS, dEWMA_Wgt1, dEWMA_Wgt2, Z, M, f, isR2R):
        N = Z * M
        I = np.identity(2)
        dEWMA_Wgt1 = dEWMA_Wgt1 * I
        dEWMA_Wgt2 = dEWMA_Wgt2 * I
        DoE_Queue = []

        sample_init_VP = []
        sample_init_EP = []

        for k in range(0, N + 1):
            sample_init_VP.append(self.sampling_vp())
            sample_init_EP.append(self.sampling_ep())
        vp_next = sample_init_VP[0]
        ep_next = sample_init_EP[0]
        ep_next = np.array([0, 0])  # overridden: measurement noise disabled during DoE

        for k in range(1, N + 1):  # k = 1, 2, ..., N
            if f is not None:
                fp = f[k - 1,0:2]
                p1_lamda_PLS = f[k - 1, 2:3]
                fp = p1_lamda_PLS * fp
                if k == 1:
                    uk_next = np.array([-51, -102])  # from the controller update formula
                    Dk_prev = np.array([-0.24, 26.4])
                    Kd_prev = np.array([0.024, 0.07])
                    # Dk_prev = np.array([-0.2, 20])
                    # Kd_prev = np.array([-0.02, 1])

            else:
                fp = None
                if k == 1:
                    uk_next = np.array([0, 0])  # from the controller update formula
                    Dk_prev = np.array([0, 0])
                    Kd_prev = np.array([0, 0])

            idx_start, idx_end, result = self.sampling(k, uk_next, vp_next, ep_next, fp, True)
            npResult = np.array(result)

            #================================== initVM-R2R Control =====================================
            uk = npResult[0:2]
            yk = npResult[idx_start:idx_end]

            Dk = (yk - uk.dot(self.A)).dot(dEWMA_Wgt1) + Dk_prev.dot(I - dEWMA_Wgt1)
            Kd = (yk - uk.dot(self.A) - Dk_prev).dot(dEWMA_Wgt2) + Kd_prev.dot(I - dEWMA_Wgt2)
            # Dk = (yk - uk.dot(self.A) - fp.dot(self.F)).dot(dEWMA_Wgt1) + Dk_prev.dot(I - dEWMA_Wgt1)
            # Kd = (yk - uk.dot(self.A) - fp.dot(self.F) - Dk_prev).dot(dEWMA_Wgt2) + Kd_prev.dot(I - dEWMA_Wgt2)

            Kd_prev = Kd
            Dk_prev = Dk

            if isR2R:
                uk_next = (self.Tgt - Dk - Kd).dot(np.linalg.inv(self.A))
                vp_next = sample_init_VP[k]

            else:
                if k % M == 0:
                    uk_next = (self.Tgt - Dk - Kd).dot(np.linalg.inv(self.A))
                    vp_next = sample_init_VP[k]
            ep_next = sample_init_EP[k]
            ep_next = np.array([0, 0])  # overridden: measurement noise disabled during DoE
            DoE_Queue.append(result)

        npPlsWindow = np.array(DoE_Queue)

        plsWindow = []

        #np.savetxt("output/npPlsWindow1.csv", npPlsWindow, delimiter=",", fmt="%s")

        if f is not None:
            for k in range(0, N):  # k = 0, 1, ..., N - 1
                p1_lamda_PLS = f[k, 2:3]
                if (k + 1) % M != 0:
                    npPlsWindow[k, idx_start - 2:idx_start] = p1_lamda_PLS * npPlsWindow[k, idx_start - 2:idx_start]

        for z in np.arange(0, Z):
            npPlsWindow[z * M:(z + 1) * M - 1, 0:idx_start] = lamda_PLS * npPlsWindow[z * M:(z + 1) * M - 1, 0:idx_start]
            npPlsWindow[z * M:(z + 1) * M - 1, idx_start:idx_end] = lamda_PLS * (npPlsWindow[z * M:(z + 1) * M - 1, idx_start:idx_end])

        for i in range(len(npPlsWindow)):
            plsWindow.append(npPlsWindow[i])

        #np.savetxt("output/npPlsWindow2.csv", npPlsWindow, delimiter=",", fmt="%s")

        npDoE_Queue = np.array(plsWindow)
        DoE_Mean = np.mean(npDoE_Queue, axis=0)

        plsModelData = npDoE_Queue - DoE_Mean
        V0 = plsModelData[:, 0:idx_start]
        Y0 = plsModelData[:, idx_start:idx_end]

        pls = self.pls_update(V0, Y0)

        # print('Init VM Coefficients: \n', pls.coef_)
        y_pred = pls.predict(V0) + DoE_Mean[idx_start:idx_end]
        y_act = npDoE_Queue[:, idx_start:idx_end]

        self.setDoE_Mean(DoE_Mean)
        self.setPlsWindow(plsWindow)
        # self.plt_show2(N, y_act[:, 1:2])
        # self.plt_show1(N, y_pred[:, 1:2])


    def VM_Run(self, lamda_PLS, dEWMA_Wgt1, dEWMA_Wgt2, Z, M, f, isR2R):
        N = Z * M
        I = np.identity(2)
        dEWMA_Wgt1 = dEWMA_Wgt1 * I
        dEWMA_Wgt2 = dEWMA_Wgt2 * I

        ## V0, Y0 Mean Center
        DoE_Mean = self.getDoE_Mean()
        idx_end = len(DoE_Mean)
        idx_start = idx_end - 2
        meanVz = DoE_Mean[0:idx_start]
        meanYz = DoE_Mean[idx_start:idx_end]
        yk = np.array([0, 0])

        Dk_prev = np.array([-0.24, 26.4])   # value at the 10th run
        Kd_prev = np.array([0.024, 0.07])   # value at the 10th run

        # Dk = np.array([0, 0])
        # Kd = np.array([0, 0])

        uk_next = np.array([-51, -102])  # from the controller update formula

        M_Queue = []
        ez_Queue = []
        ez_Queue.append([0, 0])
        y_act = []
        y_pred = []
        VM_Output = []

        plsWindow = self.getPlsWindow()

        sample_vm_VP = []
        sample_vm_EP = []

        for k in range(0, N + 1):
            sample_vm_VP.append(self.sampling_vp())
            sample_vm_EP.append(self.sampling_ep())
        vp_next = sample_vm_VP[0]
        ep_next = sample_vm_EP[0]

        for z in np.arange(0, Z):
            for k in np.arange(z * M + 1, ((z + 1) * M) + 1):
                if f is not None:
                    fp = f[k - 1, 0:2]
                else:
                    fp = None
                    if k == 1:
                        uk_next = np.array([0, 0])  # from the controller update formula
                        Dk_prev = np.array([0, 0])
                        Kd_prev = np.array([0, 0])

                # start/end indices of the y block, plus the full sample row
                idx_start, idx_end, result = self.sampling(k, uk_next, vp_next, ep_next, fp, False)
                psiK = result[0:idx_start]  # process parameter values
                psiKStar = psiK - meanVz  # parameter values minus their mean
                y_predK = self.pls.predict(psiKStar.reshape(1, idx_start)) + meanYz   # prediction plus the mean
                rows = np.r_[result, y_predK.reshape(2, )]   # actuals plus the two predictions; at run M the actual and VM values can be compared

                y_pred.append(rows[idx_end:idx_end + 2])  # predicted (VM) values
                y_act.append(rows[idx_start:idx_end])     # actual (simulated) values

                # ================================== VM + R2R Control =====================================
                if k % M != 0:   # use the predicted (VM) value
                    yk = rows[idx_end:idx_end + 2]
                else:
                    yk = rows[idx_start:idx_end]    # use the actual value
                    e1 = np.absolute(rows[idx_start + 1:idx_end] - rows[idx_end + 1:idx_end + 2])
                uk = psiK[0:2]

                Dk = (yk - uk.dot(self.A)).dot(dEWMA_Wgt1) + Dk_prev.dot(I - dEWMA_Wgt1)
                Kd = (yk - uk.dot(self.A) - Dk_prev).dot(dEWMA_Wgt2) + Kd_prev.dot(I - dEWMA_Wgt2)

                Kd_prev = Kd
                Dk_prev = Dk

                if isR2R:
                    uk_next = (self.Tgt - Dk - Kd).dot(np.linalg.inv(self.A))
                    vp_next = sample_vm_VP[k]

                uk_next = uk_next.reshape(2, )
                ep_next = sample_vm_EP[k]

                M_Queue.append(rows)  # queue this run's full row for the model update

            del plsWindow[0:M]   # the oldest M runs drop out of the moving window

            if not isR2R:
                uk_next = (self.Tgt - Dk - Kd).dot(np.linalg.inv(self.A))
                vp_next = sample_vm_VP[k]

            # From here on the PLS model is updated: the VM values are already in rows, but the relevance weights are applied first.

            if z == 0:
                ez = 0  # no metrology feedback yet in the first window
            npM_Queue = np.array(M_Queue)  # rows: parameters + actuals + two predictions; at run M the actual and VM values are compared
            # M is the metrology period: rows 0..M-2 are VM-only runs, row M-1 is the measured run.
            # Columns up to idx_start hold the parameter values, which get the lamda_PLS weight below.

            for i in range(M):  # collect VM_Output; the lamda_PLS weight is not applied here
                if i == M - 1:
                    temp = npM_Queue[i:i + 1, idx_start:idx_end]
                else:
                    temp = npM_Queue[i:i + 1, idx_end:idx_end + 2]
                VM_Output.append(np.array([temp[0, 0], temp[0, 1]]))

            # emax = 5
            # lamda_PLS = 1 - e1/emax
            # if lamda_PLS <= 0:
            #     lamda_PLS = 0.1
            #
            # print("e1 : ", e1, "P2 lamda_PLS : ", lamda_PLS)

            if f is not None:
                p1_lamda_PLS = f[k - 1, 2:3]
                npM_Queue[0:M - 1, idx_start - 2:idx_start] = p1_lamda_PLS * npM_Queue[0:M - 1, idx_start - 2:idx_start]

            #np.savetxt("output/npM_Queue2.csv", npM_Queue, delimiter=",", fmt="%s")

            npM_Queue[0:M - 1, 0:idx_start] = lamda_PLS * npM_Queue[0:M - 1, 0:idx_start]
            # Columns idx_start:idx_end are overwritten with bias-corrected VM values so the model update uses the VM information.
            npM_Queue[0:M - 1, idx_start:idx_end] = lamda_PLS * (npM_Queue[0:M - 1, idx_end:idx_end + 2] + 0.5 * ez)
            # npM_Queue[0:M - 1, idx_start:idx_end] = lamda_PLS * (npM_Queue[0:M - 1, idx_end:idx_end + 2])  # variant without the 0.5 * ez correction
            npM_Queue = npM_Queue[:, 0:idx_end]  # now holds the weighted VM rows plus the actual run

            # for i in range(M):  # alternative: collect VM_Output here, without the lamda_PLS weight
            #     temp = npM_Queue[i:i + 1, idx_start:idx_end]
            #     VM_Output.append(np.array([temp[0, 0], temp[0, 1]]))

            for i in range(M):
                plsWindow.append(npM_Queue[i])  # push into the full moving window

            M_Mean = np.mean(plsWindow, axis=0)  # mean of the moving window
            meanVz = M_Mean[0:idx_start]   # parameter means
            meanYz = M_Mean[idx_start:idx_end]  # y means: per window, M-1 VM rows (weight lamda_PLS) and one actual row (weight 1)

            plsModelData = plsWindow - M_Mean   # mean-center the window
            V = plsModelData[:, 0:idx_start]    # predictors for the model
            Y = plsModelData[:, idx_start:idx_end]   # responses for the model

            self.pls_update(V, Y)
            ez = M_Queue[M - 1][idx_start:idx_end] - M_Queue[M - 1][idx_end:idx_end + 2]
            ez_Queue.append(ez)
            # print("ez : ", ez)

            del M_Queue[0:M]

        y_act = np.array(y_act)
        y_pred = np.array(y_pred)

        # print("VM Mean squared error: %.3f" % metrics.mean_squared_error(y_act[:,1:2], y_pred[:,1:2]))
        # print("VM r2 score: %.3f" % metrics.r2_score(y_act[:,1:2], y_pred[:,1:2]))
        return y_act[:, 1:2]
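As with the local simulator, a driver only needs matrices whose shapes match sampling(). Everything below (Tgt, A, d, C, F, the feedforward table f, and the weights) is an assumption for illustration, not the original configuration:

import numpy as np

Z, M = 20, 10
N = Z * M
Tgt = np.array([0.0, 0.0])
A = np.array([[1.0, 0.1],
              [0.2, 1.0]])  # must be invertible for the R2R update
C = np.random.uniform(-0.5, 0.5, (5, 2))
d = np.array([[0.01, 0.02],
              [0.005, 0.01]])
F = np.array([[0.3, 0.0],
              [0.0, 0.3]])

# f: per-run feedforward values (columns 0-1) plus a per-run PLS weight (column 2).
f = np.column_stack([np.random.normal(0.0, 0.1, (N, 2)), np.full(N, 0.1)])

sim = Global_FWC_P3_Simulator(Tgt, A, d, C, F, seed=1)
sim.DoE_Run(lamda_PLS=0.1, dEWMA_Wgt1=0.35, dEWMA_Wgt2=0.05, Z=Z, M=M, f=f, isR2R=True)
y2 = sim.VM_Run(lamda_PLS=0.1, dEWMA_Wgt1=0.35, dEWMA_Wgt2=0.05, Z=Z, M=M, f=f, isR2R=True)
print(y2.shape)  # (N, 1): trace of the actual y2 response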
Exemple #58
0
X = dataset["data"]
y = dataset["target"]

# Center each feature and scale the variance to be unitary
X = preprocessing.scale(X)

# Print the total variance (summed over columns; ~number of features after scaling)
print(numpy.var(X, 0).sum())

# Now reduce the data to 3 components with PCA
pca = PCA(3)
X2 = pca.fit_transform(X)
print(numpy.var(X2, 0).sum())

pls = PLSRegression(3)
pls.fit(X, y)
X2 = pls.transform(X)
print(numpy.var(X2, 0).sum())
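# Note: PCA directions maximize the retained variance of X alone, while PLS
# directions maximize covariance with y, so the variance kept by the PLS
# projection above is typically smaller even though it is more predictive.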

# Make predictions using an SVM with PCA and PLS
pca_error = 0
pls_error = 0
n_folds = 10

svc = LinearSVC()

for train_inds, test_inds in KFold(n_splits=n_folds).split(X):  # modern sklearn API
    X_train, X_test = X[train_inds], X[test_inds]
    y_train, y_test = y[train_inds], y[test_inds]

    # Assumed completion (the example is cut off here): score the SVM on the
    # PCA- and PLS-reduced features and accumulate the error rates.
    svc.fit(pca.transform(X_train), y_train)
    pca_error += 1 - svc.score(pca.transform(X_test), y_test)

    svc.fit(pls.transform(X_train), y_train)
    pls_error += 1 - svc.score(pls.transform(X_test), y_test)

print("Mean PCA error: %.3f" % (pca_error / n_folds))
print("Mean PLS error: %.3f" % (pls_error / n_folds))
Exemple #59
0
def get_best_estimator():
    """Hyperparameter optimization"""

    df = load_data()
    Y = df['Fitness']
    X = df[['Variants']]
    features = FeatureUnion([
        #('one_hot_encoder', OneHotEncoder()),
        #('one_hot_pair_encoder', OneHotPairEncoder()),
        #('pybiomed_encoder', PyBioMedEncoder()),
        ('aaindex_encoder', AAIndexEncoder())
    ])
    print('*' * 40)
    print('Extracting features...')
    print('*' * 40)
    start = timer()
    X = features.transform(X)
    end = timer()
    print('Finished in: {}'.format(end - start))
    num_rows, num_cols = X.shape
    assert num_rows == len(df)
    print('Got {} features'.format(num_cols))

    # TODO include this in pipeline

    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp.fit(X)
    X = imp.transform(X)
    assert not pd.DataFrame(X).isna().any().any()

    X = FFTEncoder().fit_transform(X)

    # Leftover debug breakpoint, commented out so the search runs unattended:
    # import ipdb
    # ipdb.set_trace()

    n_features_options = [int(num_cols * ratio) for ratio in N_FEATURES_RATIOS]
    print('n_features_options:', n_features_options)
    feature_reduction_grid = [
        {
            'reduce': [
                #PCA(),
                NMF()
            ],
            'reduce__n_components': n_features_options,
        },
        #{
        #    'reduce': [SelectKBest()],
        #    'reduce__score_func': [
        #        f_regression,
        #        mutual_info_regression
        #    ],
        #    'reduce__k': n_features_options,
        #},
    ]

    # Random forest hyperparameter ranges (used only by the commented-out grid below)
    # Number of trees in the forest
    n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]

    # TODO: search over more params
    regression_grid = [
        #{
        #    'regress': [
        #        #KNeighborsRegressor(),
        #        #linear_model.ARDRegression(),
        #        #linear_model.BayesianRidge(),
        #        #linear_model.ElasticNet(),
        #        #linear_model.LassoLars(),
        #        #linear_model.LinearRegression(),
        #        #linear_model.Ridge(),
        #        #linear_model.SGDRegressor(),
        #        #tree.DecisionTreeRegressor(),
        #        #ensemble.AdaBoostRegressor(),
        #        #ensemble.BaggingRegressor(),
        #        #ensemble.GradientBoostingRegressor(),
        #    ]
        #},
        #{
        #    'regress': [ensemble.RandomForestRegressor()],
        #    #'regress__n_estimators': n_estimators,
        #    #'regress__max_features': max_features,
        #    #'regress__max_depth': max_depth,
        #    #'regress__min_samples_split': min_samples_split,
        #    #'regress__min_samples_leaf': min_samples_leaf,
        #    #'regress__bootstrap': bootstrap
        #},
        {
            'regress': [PLSRegression()]
        }
        #{
        #    'regress': [svm.NuSVR()],
        #    'regress__C': [1, 10, 100, 1000],
        #    'regress__kernel': ['rbf', 'linear', 'poly'],
        #},
        #{
        #    'regress': [svm.LinearSVR()],
        #    'regress__C': [1, 10, 100, 1000]
        #}
        #{
        #    'regress': [neural_network.MLPRegressor()],
        #    'regress__hidden_layer_sizes': [(100,)]
        #},
    ]

    pipeline = Pipeline(
        [
            #('fft', FFTEncoder()),
            #('reduce', DummyEstimator()),
            ('regress', DummyEstimator())
        ],
        #memory=memory
    )

    grid_steps = [
        #feature_reduction_grid,
        regression_grid
    ]
    combined_grids = get_combined_grids(grid_steps)
    print('combined_grids:')
    pprint(combined_grids)

    kfold = KFold(n_splits=NUM_FOLDS or num_rows, shuffle=True, random_state=0)  # shuffle=True is required for random_state to take effect
    search = GridSearchCV(pipeline,
                          combined_grids,
                          error_score=np.nan,
                          verbose=5,
                          n_jobs=-1,
                          cv=kfold)

    print('*' * 40)
    print('Searching')
    print('*' * 40)
    start = timer()
    search.fit(X, Y)
    end = timer()
    print('Finished in: {}'.format(end - start))

    best_estimator = search.best_estimator_
    best_params = search.best_params_
    best_score = search.best_score_
    best_index = search.best_index_
    best_std = search.cv_results_['std_test_score'][best_index]
    print('best_estimator:', best_estimator)
    print('best_params:', best_params)
    print('best_score:', best_score)
    print('best_std:', best_std)

    return Pipeline([('features', features), ('estimator', best_estimator)])
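The pipeline above relies on a DummyEstimator placeholder whose definition is not shown in this excerpt; every grid entry overrides it through the 'regress' key before fitting. A minimal sketch of how such a placeholder is commonly written (an assumption, not the original definition):

from sklearn.base import BaseEstimator

class DummyEstimator(BaseEstimator):
    """Placeholder step; GridSearchCV replaces it via the 'regress' param."""
    def fit(self, X, y=None):
        return self

    def predict(self, X):
        raise NotImplementedError("replaced by GridSearchCV before use")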