def lasso(self, training, target, feature_index_list):
    clf = Lasso(self.alpha, fit_intercept=False)
    clf.fit(training, target)
    coef = np.zeros(self.n_features)
    for index, feature_index in enumerate(feature_index_list):
        coef[feature_index] = clf.coef_[index]
    return coef
def lasso_regression(features, solutions, verbose=0):
    columns = solutions.columns

    clf = Lasso(alpha=1e-4, max_iter=5000)

    print('Training Model... ')
    clf.fit(features, solutions)
    feature_coeff = clf.coef_

    features_importances = np.zeros((169, 3))
    for idx in range(3):
        features_importance = np.reshape(feature_coeff[idx, :], (169, 8))
        features_importance = np.max(features_importance, axis=1)
        features_importances[:, idx] = features_importance

    features_importance_max = np.max(features_importances, axis=1)
    features_importance_max = np.reshape(features_importance_max, (13, 13))

    plt.pcolor(features_importance_max)
    plt.title("Feature importance for HoG")
    plt.colorbar()
    plt.xticks(arange(0.5, 13.5), range(1, 14))
    plt.yticks(arange(0.5, 13.5), range(1, 14))
    plt.axis([0, 13, 0, 13])
    plt.show()

    print('Done Training')
    return (clf, columns)
def train(self, x, y, param_names, random_search=100, **kwargs):
    start = time.time()
    scaled_x = self._set_and_preprocess(x=x, param_names=param_names)

    # Check that each input is between 0 and 1
    self._check_scaling(scaled_x=scaled_x)

    if self._debug:
        print "Shape of training data: ", scaled_x.shape
        print "Param names: ", self._used_param_names
        print "First training sample\n", scaled_x[0]
        print "Encode: ", self._encode

    # Do a random search
    alpha = self._random_search(random_iter=100, x=scaled_x, y=y)

    # Now train model
    lasso = Lasso(alpha=alpha, fit_intercept=True, normalize=False,
                  precompute='auto', copy_X=True, max_iter=1000,
                  tol=0.0001, warm_start=False, positive=False)
    lasso.fit(scaled_x, y)
    self._model = lasso

    duration = time.time() - start
    self._training_finished = True
    return duration
def reg_skl_lasso(param, data):
    [X_tr, X_cv, y_class_tr, y_class_cv, y_reg_tr, y_reg_cv] = data
    lasso = Lasso(alpha=param["alpha"], normalize=True)
    lasso.fit(X_tr, y_reg_tr)
    pred = lasso.predict(X_cv)
    RMSEScore = getscoreRMSE(y_reg_cv, pred)
    return RMSEScore, pred
def lassoreg(a):
    print("Doing lasso regression")
    clf2 = Lasso(alpha=a)
    clf2.fit(base_X, base_Y)
    print("Score = %f" % clf2.score(base_X, base_Y))
    clf2_pred = clf2.predict(X_test)
    write_to_file("lasso.csv", clf2_pred)
def traverse_movies_lasso():
    LBMAP = getLBMap()
    DMAP = createEmpty()

    P_ERRORS, ERRORS = [], []
    training_data, training_response = [], []

    for i in range(len(data)):
        movie = data[i]
        m_rev = movie['revenue']
        myvector = vectorizeMovie(movie, LBMAP, DMAP)

        if i > 3695:
            model = Lasso(alpha=.05)
            model.fit(training_data, training_response)
            raw = math.fabs(model.predict(myvector) - m_rev)
            ERRORS.append(raw)
            #P_ERRORS.append(round(raw/m_rev, 4))

        training_data.append(myvector)
        training_response.append(m_rev)
        DMAP = update(movie, DMAP)

    #print 'all', avg_float_list(P_ERRORS)
    print 'all', avg_float_list(ERRORS)
def precision_recall_samples(X, y):
    pr_lasso = precision_recall(support.T[-1], lasso_coefs(X, y))

    stability = stability_selection(X, y, pi=None)
    estimated = []
    for st in np.unique(stability):
        estimated.append(stability > st - 1.e-12)
    pr_ss = precision_recall(support.T[-1], estimated)

    n_samples, n_features = X.shape
    alpha_max = np.max(np.dot(y, X)) / n_samples
    alpha = .1 * alpha_max

    clf = Lasso(alpha=alpha)
    abs_coef = np.abs(clf.fit(X, y).coef_)
    estimated = []
    for th in np.unique(abs_coef):
        estimated.append(abs_coef > th - 1.e-12)
    pr_pt = precision_recall(support.T[-1], estimated)

    clf = BootstrapLasso(alpha=alpha, n_bootstraps=n_bootstraps)
    abs_coef = np.abs(clf.fit(X, y).coef_)
    estimated = []
    for th in np.unique(abs_coef):
        estimated.append(abs_coef > th - 1.e-12)
    pr_bpt = precision_recall(support.T[-1], estimated)

    return pr_lasso, pr_ss, pr_pt, pr_bpt
def __init__(self, penalty='l1', dual=None, C=None, alpha=None):
    self.l1 = (penalty == "l1")
    if self.l1:
        Lasso.__init__(self, alpha=alpha)
    else:
        Ridge.__init__(self, alpha=alpha)
def RunLASSOScikit(q):
    totalTimer = Timer()

    # Load input dataset.
    Log.Info("Loading dataset", self.verbose)
    inputData = np.genfromtxt(self.dataset[0], delimiter=',')
    responsesData = np.genfromtxt(self.dataset[1], delimiter=',')

    # Get all the parameters.
    lambda1 = re.search("-l (\d+)", options)
    lambda1 = 0.0 if not lambda1 else int(lambda1.group(1))

    try:
        with totalTimer:
            # Perform LASSO.
            model = Lasso()
            model.fit(inputData, responsesData)
            out = model.coef_
    except Exception as e:
        q.put(-1)
        return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
class SparseSelector(BaseEstimator):
    """
    Sparse L1 based feature selection. Parameters are passed onto
    sklearn.linear_model.Lasso, which actually does the work.
    """

    def __init__(self, alpha=1.0, fit_intercept=True, normalize=False):
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.normalize = normalize
        self.lasso = None

    def fit(self, X, y):
        self.lasso = Lasso(alpha=self.alpha,
                           fit_intercept=self.fit_intercept,
                           normalize=self.normalize)
        self.lasso.fit(X, y)
        return self

    def transform(self, X):
        cols = np.nonzero(self.lasso.sparse_coef_)[1]
        if sp.sparse.issparse(X):
            return X.tocsc()[:, cols]
        else:
            return X[:, cols]

    def fit_transform(self, X, y):
        self.fit(X, y)
        return self.transform(X)
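A minimal usage sketch for the SparseSelector class above, on synthetic data. The array shapes, the alpha value, and the target construction are invented for illustration, and it assumes a scikit-learn version in which Lasso still accepts the normalize argument (the class passes it through):

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(100, 20)
# hypothetical target: depends on the first two columns only
y = 3.0 * X[:, 0] - 2.0 * X[:, 1] + 0.1 * rng.randn(100)

selector = SparseSelector(alpha=0.05)
X_reduced = selector.fit_transform(X, y)  # keeps only columns with nonzero Lasso coefficients
print(X_reduced.shape)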
def fit(self, sklearn_alpha=None, **lasso_args):
    """
    Fit the lasso using `Lasso` from `sklearn`.
    This sets the attribute `soln` and
    forms the constraints necessary for post-selection inference
    by calling `form_constraints()`.

    Parameters
    ----------
    sklearn_alpha : float
        Lagrange parameter, in the normalization set by `sklearn`.

    lasso_args : keyword args
        Passed to `sklearn.linear_model.Lasso`_

    Returns
    -------
    soln : np.float
        Solution to lasso with `sklearn_alpha=self.lagrange`.
    """
    # fit Lasso using scikit-learn
    clf = Lasso(alpha=self.lagrange, fit_intercept=False)
    clf.fit(self.X, self.y, **lasso_args)
    self._soln = beta = clf.coef_
    if not np.all(beta == 0):
        self.form_constraints()
    else:
        self.active = []
    return self._soln
def pred_pH(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    pH_lassoed_vars = lass_varselect(train, all_vars, 'pH', .00000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1200)
    univ_selector.fit(train[all_vars], train['pH'])
    pvals = univ_selector.get_support()
    chosen = []
    for x in range(0, len(all_vars)):
        if pH_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    lass_only = []
    for x in range(0, len(all_vars)):
        if pH_lassoed_vars[x]:
            lass_only.append(all_vars[x])
    # nearest randomforest
    neigh = RandomForestRegressor(n_estimators=100)
    neigh.fit(train.ix[:, chosen], train['pH'])
    for dset in data:
        dset['pH_for_prds'] = neigh.predict(dset.ix[:, chosen])
    # lasso
    lass = Lasso(alpha=.000000275, positive=True)
    lass.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_las_prds'] = lass.predict(dset[all_vars])
    # ridge
    pH_ridge = RidgeCV(np.array([.6]), normalize=True)
    pH_ridge.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_rdg_prds'] = pH_ridge.predict(dset[all_vars])
    # combination
    models = ['pH_rdg_prds', 'pH_las_prds', 'pH_for_prds', 'pH_for_prds']
    name = 'pH_prds' + str(object=loop)
    write_preds(models, name, train, val, test, 'pH')
def lassoRegression(X, y):

    print("\n### ~~~~~~~~~~~~~~~~~~~~ ###")
    print("Lasso Regression")

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myDegree = 40
    polynomialFeatures = PolynomialFeatures(degree=myDegree, include_bias=False)
    Xp = polynomialFeatures.fit_transform(X)

    myScaler = StandardScaler()
    scaled_Xp = myScaler.fit_transform(Xp)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    lassoRegression = Lasso(alpha=1e-7)
    lassoRegression.fit(scaled_Xp, y)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    dummyX = np.arange(0, 2, 0.01)
    dummyX = dummyX.reshape((dummyX.shape[0], 1))
    dummyXp = polynomialFeatures.fit_transform(dummyX)
    scaled_dummyXp = myScaler.transform(dummyXp)
    dummyY = lassoRegression.predict(scaled_dummyXp)

    outputFILE = 'plot-lassoRegression.png'
    fig, ax = plt.subplots()
    fig.set_size_inches(h=6.0, w=10.0)
    ax.axis([0, 2, 0, 15])
    ax.scatter(X, y, color="black", s=10.0)
    ax.plot(dummyX, dummyY, color='red', linewidth=1.5)
    plt.savefig(filename=outputFILE, bbox_inches='tight', pad_inches=0.2, dpi=600)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return None
def build_linmodel(self): x = 'nba_15season_all_150928.csv' self.train, self.test, self.id_df = lin_encode( filename = self.name, min_cutoff = self.min_cutoff, TRANSFORM_CUTOFF = self.t_cutoff) #Create validation set np.random.seed(2) self.test = self.test.reindex(np.random.permutation(self.test.index)) self.test = self.test.iloc[:self.test.shape[0]/2,:] self.test = self.test.reset_index().drop('index', axis = 1) """ # TESTING GROUNDS bestcol = [...put list of columns here for testing features...] bestcol = bestcol + ['points'] """ # Use these next two lines if you want to filter out these aggregates bestcol = np.logical_not(self.train.columns.str.contains( '_std|_max|'+\ '_min|_5std|'+\ '_5max|_5min|'+\ 'Unnamed')) bestcol = self.train.columns[bestcol] # Subset of features you want to train on self.train = self.train[bestcol] self.test = self.test[bestcol] print self.train.shape, self.test.shape ### print 'train shape and test shape', self.train.shape, self.test.shape X = self.train.as_matrix(self.train.columns[:-1]).astype(float) y = self.train.as_matrix(['points'])[:, 0].astype(float) X_test = self.test.as_matrix(self.test.columns[:-1]).astype(float) self.y_test = self.test.as_matrix(['points'])[:, 0].astype(float) # Choose which type of linear regression to test if self.lintype == 'Linear': self.lr = LinearRegression(**self.params) elif self.lintype == 'Ridge': self.lr = Ridge(**self.params) elif self.lintype == 'Lasso': self.lr = Lasso(**self.params) else: return "Error: Choose lin. reg. type: 'Linear', 'Ridge', 'Lasso'" self.lr.fit(X, y) self.y_pred = self.lr.predict(X_test) error = mean_squared_error(self.y_pred, self.y_test)**0.5 print 'RMSE:', error # Getting attributes from LinearRegression() coef = self.lr.coef_ self.coef_imp = pd.DataFrame({'feature': self.train.columns[:-1], 'coefficient': coef}) self.coef_imp = self.coef_imp.sort('coefficient', ascending = False) self.coef_imp = self.coef_imp.reset_index().drop('index', axis = 1) self.intercept = self.lr.intercept_
def classify(self):
    """Perform classification"""
    clf = Lasso(max_iter=10000000)
    #parameters = {'alpha': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5.0, 10.0]}
    #clf = GridSearchCV(lasso, parameters, scoring='roc_auc')
    clf.fit(self._ClassifyDriver__traindata, self._ClassifyDriver__trainlabels)
    self._ClassifyDriver__y = clf.predict(self._ClassifyDriver__testdata)
def varselect_w_lass(all_vars_list, selected_vars, alpha_val):
    lass = Lasso(alpha=alpha_val, positive=True, max_iter=100000, tol=.0001)
    lass.fit(np.array(fire_train_TRAIN_smp[all_vars_list]),
             np.array(fire_train_TRAIN_smp.target))
    for x in range(1, len(all_vars_list)):
        if lass.coef_[x] > .00000001:
            selected_vars.append(all_vars_list[x])
def trainModel(x, y, degree=1):
    """Self designed explicit method to train the model using linear regression."""
    #poly = PolynomialFeatures(degree)
    #z = poly.fit_transform(x)
    #return np.dot(np.linalg.pinv(z), y)
    clf = Lasso(alpha=.5)
    clf.fit(x, y)
    return clf
def lasso(data, targets):
    """
    Returns a Lasso linear model for predictions with alpha 0.1.
    Takes the data and the associated targets as arguments.
    """
    model = Lasso(alpha=0.1)
    model.fit(data, targets)
    return model
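A short usage sketch for the lasso helper above; the toy arrays are hypothetical and only NumPy plus the function itself are assumed:

import numpy as np

rng = np.random.RandomState(1)
data = rng.randn(50, 4)
targets = 2.0 * data[:, 0] + 0.1 * rng.randn(50)  # hypothetical toy response

model = lasso(data, targets)    # returns a fitted Lasso(alpha=0.1)
print(model.coef_)              # most coefficients are shrunk toward zero
print(model.predict(data[:5]))  # predictions for the first five rows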
def weight_analysis(verbose=0, stack_option='s'): logging.info('starting ensemble weight analysis') stack = STACK if stack_option == 's' else MODELS pool = multiprocessing.Pool(processes=4) drivers = settings.DRIVER_IDS#[:1000] CUTOFF = -1 results = pool.map( compute_weights, map(lambda x: (x, verbose, stack_option), drivers) ) predictions = {} for i, get_data, model, _ in stack: predictions[i] = np.array(list(itertools.chain(*[r[1][i] for r in results]))) testY = list(itertools.chain(*[r[2] for r in results])) model_names = [ ('%s.%s.%s' % (get_data.func_name, model.__name__, i), i) for i, get_data, model, repeat in stack ] model_names.sort(key=lambda x: x[0]) keys = [x[1] for x in model_names] model_names = [x[0] for x in model_names] lasso = Lasso(alpha=0.0, positive=True) trainX = [] for row_id in xrange(len(testY)): train_row = [predictions[i][row_id] for i in keys] trainX.append(train_row) a, b = trainX[:CUTOFF], trainX[CUTOFF:] c, d = testY[:CUTOFF], testY[CUTOFF:] lasso.fit(a, c) pred = lasso.predict(b) pred_train = lasso.predict(a) #logging.info('auc: %s' % util.compute_auc(d, pred)) logging.info('coefficients:') weights = {} for i, name in enumerate(model_names): logging.info('%s: %.3f' % (model_names[i], lasso.coef_[i])) weights[keys[i]] = lasso.coef_[i] logging.info('individual scores:') for i, key in enumerate(keys): logging.info('%s: %.3f' % ( model_names[i], util.compute_auc(testY, predictions[key]) )) logging.info('weights dictionary: %s' % weights) # and again in the end, so you don't have to scroll logging.info('------------') #logging.info('auc: %s' % util.compute_auc(d, pred)) logging.info('auc train: %s' % util.compute_auc(c, pred_train))
def Lasso_Regression(kf, data, label, k):
    val = 0
    for train, test in kf:
        X_train, X_test, y_train, y_test = data[train, :], data[test, :], label[train], label[test]
        log = Lasso(alpha=0.1)
        logit = log.fit(X_train, y_train)
        y_pred = logit.predict(X_test)
        val += metrics.mean_squared_error(y_test, y_pred)
    return val / 3
def comparaison_ridge_lasso(X, Y):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,
                                                        random_state=random.seed())
    clf_lasso = Lasso(selection='random', random_state=random.seed())
    clf_ridge = Ridge()
    clf_lasso.fit(X_train, Y_train)
    clf_ridge.fit(X_train, Y_train)
    score_lasso = clf_lasso.score(X_test, Y_test)
    score_ridge = clf_ridge.score(X_test, Y_test)
    print("Precision of Lasso={:3.2f}% \nPrecision of Ridge={:3.2f}%\n".format(score_lasso * 100, score_ridge * 100))
def trainModel_phase2(x, y, degree=1):
    """Self designed explicit method to train the model using linear regression."""
    #poly = PolynomialFeatures(degree)
    #z = poly.fit_transform(x)
    #return np.dot(np.linalg.pinv(z), y)
    #clf = BernoulliRBM()
    #clf = LinearRegression()
    clf = Lasso(alpha=.5)
    clf.fit(x.reshape(-1, 1), y)
    return clf
class Linear(): def __init__(self, type='Ridge', alpha=3, C=1.0, nu=0.2, limit=None, \ epsilon=0.1): self.limit = limit if type == 'Ridge': self.model = Ridge(alpha=alpha) elif type == 'SVR': self.model = SVR(kernel='linear', C=C, epsilon=epsilon) elif type == 'NuSVR': self.model = NuSVR(C=C, nu=nu, kernel='linear') elif type == 'Lasso': self.model = Lasso(alpha=alpha) @staticmethod def get_cal(m): # get calitative features # watch out as indices depend on feature vector! return np.hstack((m[:,:23], m[:,24:37], m[:,38:52])) + 1 @staticmethod def get_cant(m): # get cantitative features # watch out as indices depend on feature vector! return np.hstack((m[:,23:24], m[:,37:38], m[:,52:])) def fit(self, train_X, train_Y): # no fitting done here, just saving data if self.limit: if len(train_X) > self.limit: train_X = train_X[-self.limit:] train_Y = train_Y[-self.limit:] self.train_X = np.array(train_X) self.train_Y = np.array(train_Y) def predict(self, test_X): # fitting done here # not efficient on the long term test_X = np.array(test_X) enc = OneHotEncoder() scal = MinMaxScaler() data = np.vstack((self.train_X, test_X)) enc.fit(self.get_cal(data)) scal.fit(self.get_cant(data)) new_train_X1 = enc.transform(self.get_cal(self.train_X)) new_train_X2 = scal.transform(self.get_cant(self.train_X)) new_train_X = scipy.sparse.hstack((new_train_X1, new_train_X2)) new_test_X1 = enc.transform(self.get_cal(test_X)) new_test_X2 = scal.transform(self.get_cant(test_X)) new_test_X = scipy.sparse.hstack((new_test_X1, new_test_X2)) self.model.fit(new_train_X, self.train_Y) R = self.model.predict(new_test_X) return R
def test_lasso_regression():
    datafile_viper = '../data_viper/viper.pkl'
    viper = loadfile(datafile_viper)

    from sklearn.linear_model import Lasso
    model = Lasso(alpha=1e-3)
    model.fit(viper.train_feat, viper.train_y)

    y_pred = model.predict(viper.test_feat)
    print 'testing error {}'.format(abs_error(y_pred, viper.test_y))
def __init__(self, type='Ridge', alpha=3, C=1.0, nu=0.2, limit=None,
             epsilon=0.1):
    self.limit = limit
    if type == 'Ridge':
        self.model = Ridge(alpha=alpha)
    elif type == 'SVR':
        self.model = SVR(kernel='linear', C=C, epsilon=epsilon)
    elif type == 'NuSVR':
        self.model = NuSVR(C=C, nu=nu, kernel='linear')
    elif type == 'Lasso':
        self.model = Lasso(alpha=alpha)
def fit_predict_model(l1_penalty):
    RSS = np.zeros((len(l1_penalty)))
    num_nonzero_coeff = np.zeros((len(l1_penalty)))
    idx = 0
    for l1_penalty_choice in l1_penalty:
        model = Lasso(alpha=l1_penalty_choice, normalize=True)
        model.fit(training[all_features], training['price'])
        predicted_price = model.predict(validation[all_features])
        RSS[idx] = np.sum((predicted_price - validation['price'])**2)
        num_nonzero_coeff[idx] = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
        idx += 1
    return (RSS, num_nonzero_coeff, model)
def main(folds = 5): print "folds: ", folds #read in data, parse into training and target sets print "\n ------------------Load file --------------- \n" train = np.loadtxt(sys.argv[1]).T min_max_scaler = preprocessing.MinMaxScaler() train = min_max_scaler.fit_transform(train) #test data set xtest = train[100:112, :] train = train[0:100, :] print "Size of read data: ", train.shape #train = imputation_missingValue(train) print "After Standardization:" print train target = np.loadtxt(sys.argv[2]).T ytest = target[100:112, :] target = target[0:100,:] print "Size of read data: ", target.shape al = 0.3 rf = Lasso(alpha=al) #Simple K-Fold cross validation. cv = cross_validation.KFold(len(train), folds) #iterate through the training and test cross validation segments and #run the classifier on each one, aggregating the results into a list results = [] i = 0 min_MSE = sys.maxint best_train = -1 best_test = -1 for traincv, testcv in cv: start = timeit.default_timer() i += 1 print i, "epoch" rf.fit(train[traincv], target[traincv]) prediction = rf.predict(train[testcv]) MSE = mean_squared_error(target[testcv], prediction) print "MSE: ", MSE, " for ",i if min_MSE > MSE: best_train = traincv best_test = testcv min_MSE = MSE results.append(MSE) stop = timeit.default_timer() print "Program running time: ", stop - start #print out the mean of the cross-validated results print "Results: " + str( np.array(results).mean() ), "for folds: ", folds print "Results for independent data: ", mean_squared_error(rf.fit(train[best_train], target[best_train]).predict(xtest), ytest) print "R squared:" print "alpha:", al
def lasso_regression(alpha):
    # Fit the model
    lassoreg = Lasso(alpha=alpha, normalize=True, max_iter=1e5)
    lassoreg.fit(A_x, A_y)
    y_pred = lassoreg.predict(A_x)

    # Return the result in pre-defined format
    rss = sum((y_pred - A_y)**2)
    ret = [rss]
    ret.extend([lassoreg.intercept_])
    ret.extend(lassoreg.coef_)
    return ret
def lasso_regression(data, predictors, alpha):
    # Fit the model
    lassoreg = Lasso(alpha=alpha, normalize=True, max_iter=1e5)
    lassoreg.fit(data[predictors], data['TransformedLife'])
    y_pred = lassoreg.predict(data[predictors])

    # Return the result in pre-defined format
    rss = sum((y_pred - data['TransformedLife'])**2)
    ret = [rss]
    ret.extend([lassoreg.intercept_])
    ret.extend(lassoreg.coef_)
    return ret
def basispursuit(y, F, penalty=0.1):
    """
    Solves basic (vanilla) basis pursuit using scikit-learn.
    """
    clf = Lasso(alpha=penalty, fit_intercept=False)
    clf.fit(F, y)
    xhat = clf.coef_

    # reconstruct
    yhat = F.dot(xhat)

    return xhat, yhat
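A small sketch exercising basispursuit above on synthetic data; the dictionary F, the sparse coefficient vector, and the penalty value are invented for illustration:

import numpy as np

rng = np.random.RandomState(0)
F = rng.randn(80, 200)                   # overcomplete dictionary: 80 measurements, 200 atoms
x_true = np.zeros(200)
x_true[[3, 50, 120]] = [1.5, -2.0, 0.7]  # a 3-sparse coefficient vector
y = F.dot(x_true)

xhat, yhat = basispursuit(y, F, penalty=0.01)
print(np.count_nonzero(xhat))            # recovered solution should stay sparse
print(np.linalg.norm(y - yhat))          # reconstruction error of the recovered signal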
def fit(self, K, s): r"""Fit the model using the coordinate descent method from scikit-learn. Args ---- K: ndarray The :math:`m \times n` kernel matrix, :math:`{\bf K}`. A numpy array of shape (m, n). s: ndarray or CSDM object. A csdm object or an equivalent numpy array holding the signal, :math:`{\bf s}`, as a :math:`m \times m_\text{count}` matrix. """ s_, self.scale = prepare_signal(s) prod = np.asarray(self.f_shape).prod() if K.shape[1] != prod: raise ValueError( "The product of the shape, `f_shape`, must be equal to the length of " f"the axis 1 of kernel, K, {K.shape[1]} != {prod}.") alpha = s_.size * self.hyperparameters["alpha"] Ks, ss = _get_augmented_data(K=K, s=s_, alpha=alpha, regularizer=self.regularizer, f_shape=self.f_shape) # The factor 0.5 for alpha in the Lasso/LassoLars problem is to compensate # 1/(2 * n_sample) factor in OLS term if self.method == "multi-task": estimator = MultiTaskLasso( alpha=self.hyperparameters["lambda"] / 2.0, fit_intercept=False, copy_X=True, max_iter=self.max_iterations, tol=self.tolerance, warm_start=False, random_state=None, selection="random", # positive=self.positive, ) if self.method == "gradient_decent": estimator = Lasso( alpha=self.hyperparameters["lambda"] / 2.0, fit_intercept=False, copy_X=True, max_iter=self.max_iterations, tol=self.tolerance, warm_start=False, random_state=None, selection="random", positive=self.positive, ) if self.method == "lars": estimator = LassoLars( alpha=self.hyperparameters["lambda"] / 2.0, fit_intercept=False, verbose=True, # normalize=False, precompute=True, max_iter=self.max_iterations, eps=2.220446049250313e-16, copy_X=True, fit_path=False, positive=True, jitter=None, random_state=None, ) estimator.fit(Ks, ss) f = estimator.coef_.copy() if s_.shape[1] > 1 and len(self.f_shape) == 2: f.shape = (s_.shape[1], ) + self.f_shape f[:, :, 0] /= 2.0 f[:, 0, :] /= 2.0 elif s_.shape[1] == 1 and len(self.f_shape) == 2: f.shape = self.f_shape f[:, 0] /= 2.0 f[0, :] /= 2.0 f *= self.scale self.estimator = estimator self.f = f self.n_iter = estimator.n_iter_ self._sol_to_csdm(s)
def main(): # Parse command line arguments parser = argparse.ArgumentParser( description='Map word embeddings in two languages into a shared space') parser.add_argument('src_input', help='the input source embeddings') parser.add_argument('trg_input', help='the input target embeddings') parser.add_argument('sense_input', help='the input sense mapping matrix') parser.add_argument('src_output', help='the output source embeddings') parser.add_argument('trg_output', help='the output target embeddings') parser.add_argument('tsns_output', default='tsns.pkl', help='the output target senses pickle file') parser.add_argument( '--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)') parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)') parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)') parser.add_argument('--seed', type=int, default=0, help='the random seed (defaults to 0)') recommended_group = parser.add_argument_group( 'recommended settings', 'Recommended settings for different scenarios') recommended_type = recommended_group.add_mutually_exclusive_group() recommended_type.add_argument( '--unsupervised', action='store_true', help= 'recommended if you have no seed dictionary and do not want to rely on identical words' ) recommended_type.add_argument('--future', action='store_true', help='experiment with stuff') recommended_type.add_argument('--toy', action='store_true', help='experiment with stuff on toy dataset') recommended_type.add_argument('--acl2018', action='store_true', help='reproduce our ACL 2018 system') init_group = parser.add_argument_group( 'advanced initialization arguments', 'Advanced initialization arguments') init_type = init_group.add_mutually_exclusive_group() init_type.add_argument('--init_unsupervised', action='store_true', help='use unsupervised initialization') init_group.add_argument( '--unsupervised_vocab', type=int, default=0, help= 'restrict the vocabulary to the top k entries for unsupervised initialization' ) mapping_group = parser.add_argument_group( 'advanced mapping arguments', 'Advanced embedding mapping arguments') mapping_group.add_argument( '--normalize', choices=['unit', 'center', 'unitdim', 'centeremb', 'none'], nargs='*', default=[], help='the normalization actions to perform in order') mapping_group.add_argument('--whiten', action='store_true', help='whiten the embeddings') mapping_group.add_argument('--src_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the source language embeddings') mapping_group.add_argument('--trg_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the target language embeddings') mapping_group.add_argument('--src_dewhiten', choices=['src', 'trg'], help='de-whiten the source language embeddings') mapping_group.add_argument('--trg_dewhiten', choices=['src', 'trg'], help='de-whiten the target language embeddings') mapping_group.add_argument('--dim_reduction', type=int, default=0, help='apply dimensionality reduction') mapping_type = mapping_group.add_mutually_exclusive_group() mapping_type.add_argument('-c', '--orthogonal', action='store_true', help='use orthogonal constrained mapping') self_learning_group = parser.add_argument_group( 'advanced self-learning arguments', 'Advanced arguments for self-learning') self_learning_group.add_argument( '--vocabulary_cutoff', type=int, default=0, help='restrict the vocabulary to the top k entries') 
self_learning_group.add_argument( '--threshold', default=0.000001, type=float, help='the convergence threshold (defaults to 0.000001)') self_learning_group.add_argument( '--stochastic_initial', default=0.1, type=float, help= 'initial keep probability stochastic dictionary induction (defaults to 0.1)' ) self_learning_group.add_argument( '--stochastic_multiplier', default=2.0, type=float, help='stochastic dictionary induction multiplier (defaults to 2.0)') self_learning_group.add_argument( '--stochastic_interval', default=50, type=int, help='stochastic dictionary induction interval (defaults to 50)') self_learning_group.add_argument( '--log', default='map.log', help='write to a log file in tsv format at each iteration') self_learning_group.add_argument( '-v', '--verbose', action='store_true', help='write log information to stderr at each iteration') future_group = parser.add_argument_group('experimental arguments', 'Experimental arguments') future_group.add_argument('--skip_top', type=int, default=0, help='Top k words to skip, presumably function') future_group.add_argument( '--start_src', action='store_true', help='Algorithm starts by tuning sense embeddings based on source') future_group.add_argument('--trim_senses', action='store_true', help='Trim sense table to working vocab') future_group.add_argument( '--lamb', type=float, default=0.5, help='Weight hyperparameter for sense alignment objectives') future_group.add_argument('--reglamb', type=float, default=1., help='Lasso regularization hyperparameter') future_group.add_argument( '--ccreglamb', type=float, default=0.1, help='Sense embedding regularization hyperparameter') future_group.add_argument('--inv_delta', type=float, default=0.0001, help='Delta_I added for inverting sense matrix') future_group.add_argument('--lasso_iters', type=int, default=10, help='Number of iterations for LASSO/NMF') future_group.add_argument('--iterations', type=int, default=-1, help='Number of overall model iterations') future_group.add_argument('--trg_batch', type=int, default=5000, help='Batch size for target steps') future_group.add_argument( '--trg_knn', action='store_true', help='Perform target sense mapping by k-nearest neighbors') future_group.add_argument( '--trg_sns_csls', type=int, default=10, help='K-nearest neighbors for CSLS target sense search') future_group.add_argument( '--senses_per_trg', type=int, default=1, help='K-max target sense mapping (default = 1 = off)') future_group.add_argument( '--gd', action='store_true', help='Apply gradient descent for assignment and synset embeddings') future_group.add_argument('--gd_lr', type=float, default=1e-2, help='Learning rate for SGD (default=0.01)') future_group.add_argument('--gd_wd', action='store_true', help='Weight decay in SGD') future_group.add_argument( '--gd_wd_hl', type=int, default=100, help='Weight decay half-life in SGD, default=100') future_group.add_argument( '--gd_clip', type=float, default=5., help='Per-coordinate gradient clipping (default=5)') future_group.add_argument( '--gd_map_steps', type=int, default=1, help='Consecutive steps for each target-sense mapping update phase') future_group.add_argument( '--gd_emb_steps', type=int, default=1, help='Consecutive steps for each sense embedding update phase') future_group.add_argument( '--base_prox_lambda', type=float, default=0.99, help='Lambda for proximal gradient in lasso step') future_group.add_argument( '--prox_decay', action='store_true', help='Multiply proximal lambda by itself each iteration') future_group.add_argument( 
'--sense_limit', type=float, default=1.1, help= 'Maximum amount of target sense mappings, in terms of source mappings (default=1.1x)' ) future_group.add_argument( '--gold_pairs', help='Gold data for evaluation, if exists (not for tuning)') future_group.add_argument( '--gold_threshold', type=float, default=0.0, help='Threshold for gold mapping (0 is fine if sparse)') future_group.add_argument('--debug', action='store_true') args = parser.parse_args() # pre-setting groups if args.toy: parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', vocabulary_cutoff=50, trim_senses=True, inv_delta=1., reglamb=0.2, lasso_iters=100, gd_wd=True, log='map-toy.log') if args.unsupervised or args.future: parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', vocabulary_cutoff=2000, trim_senses=True, gd_wd=True) if args.unsupervised or args.acl2018: parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', vocabulary_cutoff=20000) args = parser.parse_args() # Check command line arguments if (args.src_dewhiten is not None or args.trg_dewhiten is not None) and not args.whiten: print('ERROR: De-whitening requires whitening first', file=sys.stderr) sys.exit(-1) # Choose the right dtype for the desired precision if args.precision == 'fp16': dtype = 'float16' # many operations not supported by cupy elif args.precision == 'fp32': # default dtype = 'float32' elif args.precision == 'fp64': dtype = 'float64' # Read input embeddings print('reading embeddings...') srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape') trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape') src_words, x = embeddings.read(srcfile, dtype=dtype) trg_words, z = embeddings.read(trgfile, dtype=dtype) print('embeddings read') # Read input source sense mapping print('reading sense mapping') src_senses = pickle.load(open(args.sense_input, 'rb')) if src_senses.shape[0] != x.shape[0]: src_senses = csr_matrix(src_senses.transpose() ) # using non-cuda scipy because of 'inv' impl #src_senses = get_sparse_module(src_senses) print( f'source sense mapping of shape {src_senses.shape} loaded with {src_senses.getnnz()} nonzeros' ) # NumPy/CuPy management if args.cuda: if not supports_cupy(): print('ERROR: Install CuPy for CUDA support', file=sys.stderr) sys.exit(-1) xp = get_cupy() x = xp.asarray(x) z = xp.asarray(z) print('CUDA loaded') else: xp = np xp.random.seed(args.seed) # removed word to index map (only relevant in supervised learning or with validation) # STEP 0: Normalization embeddings.normalize(x, args.normalize) embeddings.normalize(z, args.normalize) print('normalization complete') # removed building the seed dictionary # removed validation step # Create log file if args.log: log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape') print(f'logging into {args.log}') # Allocate memory # Initialize the projection matrices W(s) = W(t) = I. 
xw = xp.empty_like(x) zw = xp.empty_like(z) xw[:] = x zw[:] = z src_size = x.shape[0] if args.vocabulary_cutoff <= 0 else min( x.shape[0] - args.skip_top, args.vocabulary_cutoff) trg_size = z.shape[0] if args.vocabulary_cutoff <= 0 else min( z.shape[0] - args.skip_top, args.vocabulary_cutoff) emb_dim = x.shape[1] cutoff_end = min(src_size + args.skip_top, x.shape[0]) if args.trim_senses: # reshape sense assignment src_senses = src_senses[args.skip_top:cutoff_end] # new columns for words with no senses in original input ### TODO might also need this if not trimming (probably kinda far away) newcols = [csc_matrix(([1],([i],[0])),shape=(src_size,1)) for i in range(src_size)\ if src_senses.getrow(i).getnnz() == 0] #with open(f'data/synsets/dummy_synsets_v3b_{src_size}','wb') as dummy_cols_file: # dummy_col_idcs = [i for i in range(src_size) if src_senses.getrow(i).getnnz() == 0] # pickle.dump(np.array(dummy_col_idcs), dummy_cols_file) # trim senses no longer used, add new ones colsums = src_senses.sum(axis=0).tolist()[0] kept_senses = [i for i, j in enumerate(colsums) if j > 0] #with open(f'data/synsets/kept_synsets_v3b_{src_size}','wb') as kept_save_file: # pickle.dump(np.array(kept_senses), kept_save_file) src_senses = hstack([src_senses[:, kept_senses]] + newcols) print( f'trimmed sense dictionary dimensions: {src_senses.shape} with {src_senses.getnnz()} nonzeros' ) sense_size = src_senses.shape[1] if args.gold_pairs is not None: with open(args.gold_pairs, 'rb') as gold_pairs_f: gold_pairs = pickle.load(gold_pairs_f) gold_pairs = [(i-args.skip_top,j) for i,j in gold_pairs \ if i >= args.skip_top and i < src_senses.shape[0] and j < src_senses.shape[1]] gold_trgs = sorted(set([x[0] for x in gold_pairs])) gold_senses = sorted(set([x[1] for x in gold_pairs])) gold_domain_size = len(gold_trgs) * len(gold_senses) print( f'evaluating on {len(gold_pairs)} pairs with {len(gold_trgs)} unique words and {len(gold_senses)} unique senses' ) # Initialize the concept embeddings from the source embeddings ### TODO maybe try gradient descent instead? ### TODO (pre-)create non-singular alignment matrix cc = xp.empty((sense_size, emb_dim), dtype=dtype) # \tilde{E} t01 = time.time() print('starting psinv calc') src_sns_psinv = psinv(src_senses, dtype, args.inv_delta) xecc = x[args.skip_top:cutoff_end].T.dot( get_sparse_module(src_senses).toarray()).T # sense_size * emb_dim cc[:] = src_sns_psinv.dot(xecc) print(f'initialized concept embeddings in {time.time()-t01:.2f} seconds', file=sys.stderr) if args.verbose: # report precision of psedo-inverse operation, checked by inverting pseudo_id = src_senses.transpose().dot(src_senses).dot( src_sns_psinv.get()) real_id = sparse_id(sense_size) rel_diff = (pseudo_id - real_id).sum() / (sense_size * sense_size) print(f'per-coordinate pseudo-inverse precision is {rel_diff:.5f}') ### TODO initialize trg_senses using seed dictionary instead? 
trg_sns_size = trg_size if args.trim_senses else z.shape[0] trg_senses = csr_matrix( (trg_sns_size, sense_size)) # using non-cuda scipy because of 'inv' impl zecc = xp.empty_like(xecc) # sense_size * emb_dim #tg_grad = xp.empty((trg_sns_size, sense_size)) if args.gd: # everything can be done on gpu src_senses = get_sparse_module(src_senses, dtype=dtype) trg_senses = get_sparse_module(trg_senses, dtype=dtype) if args.sense_limit > 0.0: trg_sense_limit = int(args.sense_limit * src_senses.getnnz()) if args.verbose: print( f'limiting target side to {trg_sense_limit} sense mappings' ) else: trg_sense_limit = -1 ### TODO return memory assignment for similarities? # Training loop if args.gd: prox_lambda = args.base_prox_lambda else: lasso_model = Lasso(alpha=args.reglamb, fit_intercept=False, max_iter=args.lasso_iters,\ positive=True, warm_start=True) # TODO more parametrization if args.log is not None: if args.gd: print(f'gradient descent lr: {args.gd_lr}', file=log) print(f'base proximal lambda: {args.base_prox_lambda}', file=log) else: print(f'lasso regularization: {args.reglamb}', file=log) print(f'lasso iterations: {args.lasso_iters}', file=log) print(f'inversion epsilon: {args.inv_delta}', file=log) if args.gold_pairs is not None: print(f'gold mappings: {len(gold_pairs)}', file=log) print( f'Iteration\tObjective\tSource\tTarget\tL_1\tDuration\tNonzeros\tCorrect_mappings', file=log) log.flush() best_objective = objective = 1000000000. correct_mappings = -1 regularization_lambda = args.base_prox_lambda if args.gd else args.reglamb it = 1 last_improvement = 0 t = time.time() map_gd_lr = args.gd_lr emb_gd_lr = args.gd_lr end = False print('starting training') if args.start_src: print('starting with converging synset embeddings') it_range = range( args.iterations ) ### TODO possibly add arg, but there's early stopping if not args.verbose: it_range = tqdm(it_range) prev_obj = float('inf') for pre_it in it_range: if args.gd_wd: emb_gd_lr = args.gd_lr * pow(0.5, floor( pre_it / args.gd_wd_hl)) # Synset embedding cc_grad = src_senses.T.dot( xw[args.skip_top:cutoff_end] - src_senses.dot(cc)) - args.ccreglamb * cc cc_grad.clip(-args.gd_clip, args.gd_clip, out=cc_grad) cc += emb_gd_lr * cc_grad # Source projection u, s, vt = xp.linalg.svd(cc.T.dot(xecc)) wx = vt.T.dot(u.T).astype(dtype) x.dot(wx, out=xw) pre_objective = ((xp.linalg.norm( xw[args.skip_top:cutoff_end] - get_sparse_module(src_senses).dot(cc), 'fro'))**2) / 2 pre_objective = float(pre_objective) if args.verbose and pre_it > 0 and pre_it % 10 == 0: print( f'source synset embedding objective iteration {pre_it}: {pre_objective:.3f}' ) if pre_objective > prev_obj: print( f'stopping at pre-iteration {pre_it}, source-sense objective {prev_obj:.3f}' ) # revert cc -= emb_gd_lr * cc_grad break prev_obj = pre_objective while True: if it % 50 == 0: print( f'starting iteration {it}, last objective was {objective}, correct mappings at {correct_mappings}' ) # Increase the keep probability if we have not improved in args.stochastic_interval iterations if it - last_improvement > args.stochastic_interval: last_improvement = it if args.iterations > 0 and it > args.iterations: end = True ### update target assignments (6) - lasso-esque regression time6 = time.time() # optimize: 0.5 * (xp.linalg.norm(zw[i] - trg_senses[i].dot(cc))^2) + (regularization_lambda * xp.linalg.norm(trg_senses[i],1)) if args.trg_knn: # for csls-based neighborhoods knn_sense = xp.full(sense_size, -100) for i in range(0, sense_size, args.trg_batch): batch_end = min(i + args.trg_batch, 
sense_size) sim_sense_trg = cc[i:batch_end].dot( zw[args.skip_top:cutoff_end].T) knn_sense[i:batch_end] = topk_mean(sim_sense_trg, k=args.trg_sns_csls, inplace=True) # calculate new target mappings trg_senses = lil_matrix(trg_senses.shape) for i in range(0, trg_size, args.trg_batch): sns_batch_end = min(i + args.trg_batch, trg_size) z_i = i + args.skip_top z_batch_end = min(sns_batch_end + args.skip_top, zw.shape[0]) sims = zw[z_i:z_batch_end].dot(cc.T) sims -= knn_sense / 2 # equivalent to the real CSLS scores for NN best_idcs = sims.argmax(1).tolist() trg_senses[(list(range(i, sns_batch_end)), best_idcs)] = sims.max(1).tolist() # second-to-lth-best for l in range(args.senses_per_trg - 1): sims[(list(range(sims.shape[0])), best_idcs)] = 0. best_idcs = sims.argmax(1).tolist() trg_senses[(list(range(i, sns_batch_end)), best_idcs)] = sims.max(1).tolist() trg_senses = get_sparse_module(trg_senses.tocsr()) elif args.gd: ### TODO add args.skip_top calculations if args.gd_wd: true_it = (it - 1) * args.gd_map_steps map_gd_lr = args.gd_lr * pow( 0.5, floor((1 + true_it) / args.gd_wd_hl)) if args.verbose: print(f'mapping learning rate: {map_gd_lr}') for k in range(args.gd_map_steps): # st <- st + eta * (ew - st.dot(es)).dot(es.T) # allow up to sense_limit updates, clip gradient batch_grads = [] for i in range(0, trg_size, args.trg_batch): batch_end = min(i + args.trg_batch, trg_size) tg_grad_b = (zw[i:batch_end] - trg_senses[i:batch_end].dot(cc)).dot(cc.T) # proximal gradient tg_grad_b += prox_lambda tg_grad_b.clip(None, 0.0, out=tg_grad_b) batch_grads.append(batch_sparse(tg_grad_b)) tg_grad = get_sparse_module(vstack(batch_grads)) del tg_grad_b if args.prox_decay: prox_lambda *= args.base_prox_lambda ### TODO consider weight decay here as well (args.gd_wd) trg_senses -= map_gd_lr * tg_grad # allow up to sense_limit nonzeros if trg_sense_limit > 0: trg_senses = trim_sparse(trg_senses, trg_sense_limit, clip=None) ### TODO consider finishing up with lasso (maybe only in final iteration) else: ### TODO add args.skip_top calculations # parallel LASSO (no cuda impl) cccpu = cc.get().T # emb_dim * sense_size lasso_model.fit(cccpu, zw[:trg_size].get().T) ### TODO maybe trim, keep only above some threshold (0.05) OR top f(#it) trg_senses = lasso_model.sparse_coef_ if args.verbose: print( f'target sense mapping step: {(time.time()-time6):.2f} seconds, {trg_senses.getnnz()} nonzeros', file=sys.stderr) objective = ((xp.linalg.norm(xw[args.skip_top:cutoff_end] - get_sparse_module(src_senses).dot(cc),'fro') ** 2)\ + (xp.linalg.norm(zw[args.skip_top:cutoff_end] - get_sparse_module(trg_senses).dot(cc),'fro')) ** 2) / 2 \ + regularization_lambda * trg_senses.sum() # TODO consider thresholding reg part objective = float(objective) print(f'objective: {objective:.3f}') # Write target sense mapping with open(f'tmp_outs/{args.tsns_output[:-4]}-it{it:03d}.pkl', mode='wb') as tsnsfile: pickle.dump(trg_senses.get(), tsnsfile) ### update synset embeddings (10) time10 = time.time() if args.gd and args.gd_emb_steps > 0: ### TODO probably handle sizes and/or threshold sparse matrix if args.gd_wd: true_it = (it - 1) * args.gd_emb_steps emb_gd_lr = args.gd_lr * pow( 0.5, floor((1 + true_it) / args.gd_wd_hl)) if args.verbose: print(f'embedding learning rate: {emb_gd_lr}') ### replace block for no-source-tuning mode all_senses = trg_senses if args.start_src else get_sparse_module( vstack((src_senses.get(), trg_senses.get()), format='csr'), dtype=dtype) aw = zw[args. 
skip_top:cutoff_end] if args.start_src else xp.concatenate( (xw[args.skip_top:cutoff_end], zw[args.skip_top:cutoff_end])) for i in range(args.gd_emb_steps): cc_grad = all_senses.T.dot( aw - all_senses.dot(cc)) - args.ccreglamb * cc cc_grad.clip(-args.gd_clip, args.gd_clip, out=cc_grad) cc += emb_gd_lr * cc_grad else: ### TODO add args.skip_top calculations all_senses = get_sparse_module( vstack((src_senses, trg_senses), format='csr')) xzecc = xp.concatenate((xw[:src_size], zw[:trg_size])).T\ .dot(all_senses.toarray()).T # sense_size * emb_dim all_sns_psinv = psinv( all_senses.get(), dtype, args.inv_delta ) ### TODO only update target side? We still have src_sns_psinv [it doesn't matter, dimensions are the same] cc[:] = all_sns_psinv.dot(xzecc) if args.verbose: print(f'synset embedding update: {time.time()-time10:.2f}', file=sys.stderr) objective = ((xp.linalg.norm(xw[args.skip_top:cutoff_end] - get_sparse_module(src_senses).dot(cc),'fro')) ** 2\ + (xp.linalg.norm(zw[args.skip_top:cutoff_end] - get_sparse_module(trg_senses).dot(cc),'fro')) ** 2) / 2 \ + regularization_lambda * trg_senses.sum() # TODO consider thresholding reg part objective = float(objective) print(f'objective: {objective:.3f}') ### update projections (3,5) # write to zw and xw if args.orthogonal or not end: ### remove block for no-source-tuning mode # source side - mappings don't change so xecc is constant #if not args.start_src: # need to do this anyway whenever cc updates time3 = time.time() u, s, vt = xp.linalg.svd(cc.T.dot(xecc)) wx = vt.T.dot(u.T).astype(dtype) x.dot(wx, out=xw) if args.verbose: print(f'source projection update: {time.time()-time3:.2f}', file=sys.stderr) # target side - compute sense mapping first time3 = time.time() zecc.fill(0.) for i in range(0, trg_size, args.trg_batch): end_idx = min(i + args.trg_batch, trg_size) zecc += z[i:end_idx].T.dot( get_sparse_module(trg_senses[i:end_idx]).toarray()).T u, s, vt = xp.linalg.svd(cc.T.dot(zecc)) wz = vt.T.dot(u.T).astype(dtype) z.dot(wz, out=zw) if args.verbose: print(f'target projection update: {time.time()-time3:.2f}', file=sys.stderr) ### TODO add parts from 'advanced mapping' part - transformations, whitening, etc. 
# Objective function evaluation time_obj = time.time() trg_senses_l1 = float(trg_senses.sum()) src_obj = (float( xp.linalg.norm( xw[args.skip_top:cutoff_end] - get_sparse_module(src_senses).dot(cc), 'fro'))**2) / 2 trg_obj = (float( xp.linalg.norm( zw[args.skip_top:cutoff_end] - get_sparse_module(trg_senses).dot(cc), 'fro'))**2) / 2 objective = src_obj + trg_obj + regularization_lambda * trg_senses_l1 # TODO consider thresholding reg part if args.verbose: print(f'objective calculation: {time.time()-time_obj:.2f}', file=sys.stderr) if objective - best_objective <= -args.threshold: last_improvement = it best_objective = objective # WordNet transduction evaluation (can't tune on this) if args.gold_pairs is not None: np_trg_senses = trg_senses.get() trg_corr = [ p for p in gold_pairs if np_trg_senses[p] > args.gold_threshold ] correct_mappings = len(trg_corr) domain_trgs = np_trg_senses[gold_trgs][:, gold_senses] else: correct_mappings = -1 # Logging duration = time.time() - t if args.verbose: print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr) print('objective: {0:.3f}'.format(objective), file=sys.stderr) print('target senses l_1 norm: {0:.3f}'.format(trg_senses_l1), file=sys.stderr) if len(gold_pairs) > 0 and domain_trgs.getnnz() > 0: print( f'{correct_mappings} correct target mappings: {(correct_mappings/len(gold_pairs)):.3f} recall, {(correct_mappings/domain_trgs.getnnz()):.3f} precision', file=sys.stderr) print(file=sys.stderr) sys.stderr.flush() if args.log is not None: print( f'{it}\t{objective:.3f}\t{src_obj:.3f}\t{trg_obj:.3f}\t{trg_senses_l1:.3f}\t{duration:.3f}\t{trg_senses.getnnz()}\t{correct_mappings}', file=log) log.flush() if end: break t = time.time() it += 1 # Write mapped embeddings with open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape') as srcfile: embeddings.write(src_words, xw, srcfile) with open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape') as trgfile: embeddings.write(trg_words, zw, trgfile) # Write target sense mapping with open(args.tsns_output, mode='wb') as tsnsfile: pickle.dump(trg_senses.get(), tsnsfile)
["level", "temperature", "usage", "Brightness", "RAM"]) df_label_Num = pandas.DataFrame(polyData_Num, columns=columnNames) for column in columnNames: df_label[column] = pandas.Series(df_label_Num[column]) # Get dataframes y_label = df_label["output"] X_label = df_label.drop(["output"], axis=1) # Split data training and testing ... X_train_label, X_test_label, y_train_label, y_test_label = train_test_split( X_label, y_label, test_size=0.25, random_state=42) # Create the model regressor = Lasso() # find optimal alpha with grid search alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000] param_grid = {'alpha': alpha} scoring = ['neg_mean_absolute_error', 'neg_root_mean_squared_error'] grid = GridSearchCV(estimator=regressor, param_grid=param_grid, scoring=scoring, refit=scoring[0], return_train_score=True, cv=3) grid_result = grid.fit(X_train_label, y_train_label) print( f"Best Score: {abs(grid_result.best_score_)} - Best Params: {grid_result.best_params_} for label {label} ({df_label.shape})"
y_train = train.SalePrice.values
train = pd.DataFrame(all_data[:ntrain])
test = pd.DataFrame(all_data[ntrain:])

from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

#1
lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))
#2
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
#3
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
#4
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10,
                                   loss='huber', random_state=5)
#5
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

diabetes = datasets.load_diabetes()
X = diabetes.data[:150]
y = diabetes.target[:150]

lasso = Lasso(random_state=0)
alphas = np.logspace(-4, -0.5, 30)

tuned_parameters = [{'alpha': alphas}]
n_folds = 3

clf = GridSearchCV(lasso, tuned_parameters, cv=n_folds, refit=False)
clf.fit(X, y)
scores = clf.cv_results_['mean_test_score']
scores_std = clf.cv_results_['std_test_score']

plt.figure().set_size_inches(8, 6)
plt.semilogx(alphas, scores)

# plot error lines showing +/- std. errors of the scores
std_error = scores_std / np.sqrt(n_folds)
from sklearn.linear_model import MultiTaskLasso, Lasso

rng = np.random.RandomState(42)

# Generate some 2D coefficients with sine waves with random frequency and phase
n_samples, n_features, n_tasks = 100, 30, 40
n_relevant_features = 5
coef = np.zeros((n_tasks, n_features))
times = np.linspace(0, 2 * np.pi, n_tasks)
for k in range(n_relevant_features):
    coef[:, k] = np.sin((1. + rng.randn(1)) * times + 3 * rng.randn(1))

X = rng.randn(n_samples, n_features)
Y = np.dot(X, coef.T) + rng.randn(n_samples, n_tasks)

coef_lasso_ = np.array([Lasso(alpha=0.5).fit(X, y).coef_ for y in Y.T])
coef_multi_task_lasso_ = MultiTaskLasso(alpha=1.).fit(X, Y).coef_

###############################################################################
# Plot support and time series
fig = plt.figure(figsize=(8, 5))
plt.subplot(1, 2, 1)
plt.spy(coef_lasso_)
plt.xlabel('Feature')
plt.ylabel('Time (or Task)')
plt.text(10, 5, 'Lasso')
plt.subplot(1, 2, 2)
plt.spy(coef_multi_task_lasso_)
plt.xlabel('Feature')
plt.ylabel('Time (or Task)')
plt.text(10, 5, 'MultiTaskLasso')
def set_objective(self, X, y, lmbd):
    self.X, self.y, self.lmbd = X, y, lmbd

    n_samples = self.X.shape[0]
    self.clf = Lasso(alpha=self.lmbd / n_samples, fit_intercept=False, tol=0)
    warnings.filterwarnings('ignore', category=ConvergenceWarning)
def cv_predict_fixture(generate_data_cv_predict, cross_fit, params):
    n_folds = 4

    # collect data
    (x, y, classifier) = generate_data_cv_predict

    if classifier:
        method = 'predict_proba'
    else:
        method = 'predict'

    if cross_fit:
        smpls = [(train, test) for train, test in
                 KFold(n_splits=n_folds, shuffle=True).split(x)]
    else:
        n_obs = len(y)
        smpls = train_test_split(np.arange(n_obs), test_size=0.23)
        smpls = [[np.sort(x) for x in smpls]]  # only sorted indices are supported

    if params is None:
        est_params = None
    elif params == 'global':
        if method == 'predict_proba':
            est_params = {'C': 0.5}
        else:
            est_params = {'alpha': 0.5}
    else:
        assert params == 'per_fold'
        if method == 'predict_proba':
            if cross_fit:
                est_params = [{'C': np.random.uniform()} for i in range(n_folds)]
            else:
                est_params = {'C': 1.}
        else:
            if cross_fit:
                est_params = [{'alpha': np.random.uniform()} for i in range(n_folds)]
            else:
                est_params = {'alpha': 1.}

    if method == 'predict_proba':
        preds = _dml_cv_predict(LogisticRegression(), x, y, smpls,
                                est_params=est_params, method=method)
        preds_ut = _dml_cv_predict_ut_version(LogisticRegression(), x, y, smpls,
                                              est_params=est_params, method=method)[:, 1]
    else:
        preds = _dml_cv_predict(Lasso(), x, y, smpls,
                                est_params=est_params, method=method)
        preds_ut = _dml_cv_predict_ut_version(Lasso(), x, y, smpls,
                                              est_params=est_params, method=method)

    res_dict = {'preds': preds, 'preds_ut': preds_ut}

    return res_dict
# TODO: Add import statements
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso

# Assign the data to predictor and outcome variables
# TODO: Load the data
train_data = pd.read_csv('data_lasso.csv', header=None)
X = train_data.iloc[:, :-1]
y = train_data.iloc[:, -1]

# TODO: Create the linear regression model with lasso regularization.
lasso_reg = Lasso()

# TODO: Fit the model.
lasso_reg.fit(X, y)

# TODO: Retrieve and print out the coefficients from the regression model.
reg_coef = lasso_reg.coef_
print(reg_coef)
max_depths = list(range(2, 10 + 1)) + [None]
d_max_depths = {'max_depth': max_depths}
d_max_depths_base = {'base_estimator__max_depth': max_depths}
Ks = {'n_neighbors': [1, 2, 3, 5, 10, 15, 25, 50, 100, 200]}

OUTCOME_MODEL_GRID = [
    ('LinearRegression', LinearRegression(), {}),
    ('LinearRegression_interact',
     make_pipeline(PolynomialFeatures(degree=2, interaction_only=True),
                   LinearRegression()), {}),
    ('LinearRegression_degree2',
     make_pipeline(PolynomialFeatures(degree=2), LinearRegression()), {}),
    # ('LinearRegression_degree3',
    #  make_pipeline(PolynomialFeatures(degree=3), LinearRegression()), {}),

    ('Ridge', Ridge(), alphas),
    ('Lasso', Lasso(), alphas),
    ('ElasticNet', ElasticNet(), alphas),
    ('KernelRidge', KernelRidge(), alphas),

    ('SVM_rbf', SVR(kernel='rbf'), d_Cs),
    ('SVM_sigmoid', SVR(kernel='sigmoid'), d_Cs),
    ('LinearSVM', LinearSVR(), d_Cs),
    # (SVR(kernel='linear'), d_Cs),  # doesn't seem to work (runs forever)

    # TODO: add tuning of SVM gamma, rather than using the default "scale" setting
    # SVMs are sensitive to input scale
    ('Standardized_SVM_rbf',
     Pipeline([('standard', StandardScaler()), (SVM, SVR(kernel='rbf'))]),
     d_Cs_pipeline),
    ('Standardized_SVM_sigmoid',
     Pipeline([('standard', StandardScaler()), (SVM, SVR(kernel='sigmoid'))]),
     d_Cs_pipeline),
def regularize_by_l1(X_train, X_test, y_train, y_test, all_features, N_k, task, N_repeat, seed_no=0):
    ## 0. Input arguments:
    # X_train: array that contains training feature data
    # X_test: array that contains testing feature data
    # y_train: array that contains training response data
    # y_test: array that contains testing response data
    # all_features: names of all features (column names of X_train)
    # N_k: number of folds to split into
    # task: type of supervised learning task: 'regression' or 'classification'
    # N_repeat: number of independent cross-validation runs, each run will generate one performance score
    # seed_no: seed number used when searching for the regularization strength (repeat i of the performance CV uses seed i + 1)
    ## 1. Perform regularized classification/regression based on the specified task
    # regression
    if task == 'regression':
        # split data into K folds
        kf = KFold(n_splits=N_k, random_state=seed_no, shuffle=True)
        # find the optimal alpha (regularization factor) using K-fold cross-validation on training data
        cv_regressor = LassoCV(cv=kf, random_state=seed_no)
        cv_regressor.fit(X_train, y_train)
        best_alpha = cv_regressor.alpha_
        # fit lasso regression using the optimal alpha
        final_learner = Lasso(alpha=best_alpha)
        final_learner.fit(X_train, y_train)
        # obtain the features selected by the fitted lasso regression model (features with nonzero coefficients)
        select_features = all_features[(final_learner.coef_ != 0).flatten()]
        N_select = len(select_features)
        # perform K-fold cross-validation to obtain the training performance of the fitted lasso regression model
        train_metric = []
        for i in range(0, N_repeat):
            cv_kf = KFold(n_splits=N_k, random_state=i + 1, shuffle=True)
            r2 = cross_val_score(final_learner, X_train, y_train, cv=cv_kf, scoring='r2')
            mse = cross_val_score(final_learner, X_train, y_train, cv=cv_kf, scoring='neg_mean_squared_error')
            train_metric.append({'r2': np.mean(r2), 'mse': np.mean(mse)})
        train_metric_df = pd.DataFrame(train_metric)
        # apply the fitted lasso regression model to the testing set and obtain the testing performance
        y_pred = final_learner.predict(X_test)
        test_r2 = r2_score(y_test, y_pred)
        test_mse = mean_squared_error(y_test, y_pred)
        test_metric = {'r2': test_r2, 'mse': test_mse}
    # classification
    if task == 'classification':
        # stratified split for classification tasks
        kf = StratifiedKFold(n_splits=N_k, random_state=seed_no, shuffle=True)
        # find the optimal C (regularization factor) using K-fold cross-validation on training data
        cv_classifier = LogisticRegressionCV(penalty='l1', solver='liblinear', cv=kf, random_state=seed_no)
        cv_classifier.fit(X_train, y_train)
        best_c = float(cv_classifier.C_[0])
        # fit logistic regression using the optimal C
        final_learner = LogisticRegression(penalty='l1', solver='liblinear', C=best_c, random_state=seed_no)
        final_learner.fit(X_train, y_train)
        # obtain the features selected by the fitted logistic regression model (features with nonzero coefficients)
        select_features = all_features[(final_learner.coef_ != 0).flatten()]
        N_select = len(select_features)
        # perform K-fold cross-validation to obtain the training performance of the fitted logistic regression model
        train_metric = []
        for i in range(0, N_repeat):
            cv_kf = StratifiedKFold(n_splits=N_k, random_state=i + 1, shuffle=True)
            auc = cross_val_score(final_learner, X_train, y_train, cv=cv_kf, scoring='roc_auc')
            bac = cross_val_score(final_learner, X_train, y_train, cv=cv_kf, scoring='balanced_accuracy')
            f1 = cross_val_score(final_learner, X_train, y_train, cv=cv_kf, scoring='f1')
            train_metric.append({'auc': np.mean(auc), 'bac': np.mean(bac), 'f1': np.mean(f1)})
        train_metric_df = pd.DataFrame(train_metric)
        # compare with testing response data, compute metrics
        y_pred_prob = final_learner.predict_proba(X_test)[:, 1]
        y_pred = final_learner.predict(X_test)
        test_auc = roc_auc_score(y_test, y_pred_prob)
        test_bac = balanced_accuracy_score(y_test, y_pred)
        test_f1 = f1_score(y_test, y_pred)
        test_metric = {'auc': test_auc, 'bac': test_bac, 'f1': test_f1}
    return final_learner, select_features, train_metric_df, test_metric
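A hedged usage sketch for regularize_by_l1: the data, feature names and argument values below are made up for illustration and are not part of the original project.

import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# Hypothetical inputs: 20 features, 5 of which are informative.
X, y = make_regression(n_samples=200, n_features=20, n_informative=5, noise=1.0, random_state=0)
feature_names = np.array(['feature_%d' % i for i in range(X.shape[1])])
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)

model, selected, train_df, test_metric = regularize_by_l1(
    X_tr, X_te, y_tr, y_te, all_features=feature_names,
    N_k=5, task='regression', N_repeat=3, seed_no=0)
print(selected)     # features kept by the L1 penalty (nonzero coefficients)
print(test_metric)  # e.g. {'r2': ..., 'mse': ...}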
degree = 10
# iter_array = [1000, 5000, 10000, 50000, 100000, 500000]
# train_err = []
# test_err = []
print("Number of training points:", train_x.shape[0])
print("Number of testing points:", test_x.shape[0])
print("")
poly = PolynomialFeatures(degree=degree, include_bias=False)
modified_train_x = poly.fit_transform(train_x)
# transform (not fit_transform) the test set so it reuses the mapping learned on the training set
modified_test_x = poly.transform(test_x)
# for max_iter in iter_array:
print("Lasso with default alpha")
print("\n")
print("\n")
reg = Lasso()
reg.fit(modified_train_x, train_y)
print("Lasso Train RMSE is: ", math.sqrt(mean_squared_error(train_y, reg.predict(modified_train_x))))
print("Lasso Test RMSE is: ", math.sqrt(mean_squared_error(test_y, reg.predict(modified_test_x))))
# train_err.append(math.sqrt(mean_squared_error(train_y, reg.predict(modified_train_x))))
# test_err.append(math.sqrt(mean_squared_error(test_y, reg.predict(modified_test_x))))
# print(reg.coef_)
# plt.xlabel('Iterations')
# plt.ylabel('RMSE')
# plt.plot(iter_array, train_err, 'bo-', label='Training')
# plt.plot(iter_array, test_err, 'ro-', label='Test')
# plt.legend()
print("\n")
print("Polynomial Regression with degree 10")
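The block above fits Lasso with its default alpha on the degree-10 polynomial features. As a sketch only (reusing modified_train_x / modified_test_x from above; the alpha grid is an assumption), alpha could instead be chosen by cross-validation with LassoCV:

import math
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error

# Illustrative alpha grid; LassoCV picks the value with the best cross-validated fit.
lasso_cv = LassoCV(alphas=np.logspace(-4, 1, 30), cv=5, max_iter=10000)
lasso_cv.fit(modified_train_x, train_y)
print("Selected alpha:", lasso_cv.alpha_)
print("Tuned Lasso Test RMSE:", math.sqrt(mean_squared_error(test_y, lasso_cv.predict(modified_test_x))))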
import pandas as pd
from sklearn.linear_model import Lasso
import pickle

# Fit one Lasso model per month; columns 2:6 hold the features and columns 6:10 the targets.
months = [('Jan_2019.csv', 31, 'model_jan.pkl'),
          ('Feb_2019.csv', 28, 'model_feb.pkl'),
          ('Mar_2019.csv', 31, 'model_mar.pkl'),
          ('Apr_2019.csv', 30, 'model_apr.pkl')]

for csv_file, n_days, model_file in months:
    dataset = pd.read_csv(csv_file)
    X = dataset.iloc[0:n_days, 2:6]
    y = dataset.iloc[0:n_days, 6:10]
    lassoreg = Lasso(alpha=0.1)
    lassoreg.fit(X, y)
    pickle.dump(lassoreg, open(model_file, 'wb'))
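A short, hypothetical sketch of reading one of the saved models back; only the file name comes from the code above.

import pickle

with open('model_jan.pkl', 'rb') as f:
    jan_model = pickle.load(f)
print(jan_model.coef_.shape)  # one row of coefficients per target column (columns 6:10)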
def __init__(self): self.model = Lasso()
def _validate_estimator_params(self, estimator, kwargs): """Validate estimator and parameters inputs Parameters ---------- estimator: str Estimator name kwargs: keyword arguments Grid search named parameters Returns ------- estimator: sklearn.Estimator sklearn estimator, implementing `fit` and `predict` params: dict Grid search params TODO: think about default ranges for grid search """ if not isinstance(estimator, str): raise TypeError('estimator argument must be str, but received %s' % type(estimator)) _estimator = estimator.lower() if _estimator == 'svr': _kernel = kwargs.get('kernel', 'rbf') _C = kwargs.get('C', np.logspace(-4, 4, 5)) _epsilon = kwargs.get('epsilon', np.logspace(-4, 4, 5)) _gamma = kwargs.get('gamma', 'auto') _degree = kwargs.get('degree', 3) return SVR(kernel=_kernel, degree=_degree), { 'C': _C, 'epsilon': _epsilon } if _estimator == 'ridge': _alpha = kwargs.get('alpha', np.logspace(-4, 4, 20)) return Ridge(), {'alpha': _alpha} if _estimator == 'lasso': _alpha = kwargs.get('alpha', np.logspace(-4, 4, 20)) return Lasso(), {'alpha': _alpha} if _estimator == 'lars': _n_nonzero_coefs = kwargs.get('n_nonzero_coefs', np.inf) return Lars(), {'n_nonzero_coefs': _n_nonzero_coefs} if _estimator == 'elasticnet': _alpha = kwargs.get('alpha', np.logspace(-4, 4, 20)) return ElasticNet(), {'alpha': _alpha} if _estimator == 'sgd' or _estimator == 'sgdregressor': _alpha = kwargs.get('alpha', np.logspace(-4, 4, 20)) return SGDRegressor(), {'alpha': _alpha} if _estimator == 'randomforest': _n_estimators = range(5, 30, 5) return RandomForestRegressor(), {'n_estimators': _n_estimators} if _estimator == 'adaboost': _n_estimators = range(10, 60, 5) _learning_rate = np.logspace(-2, 1, 4) return AdaBoostRegressor(), { 'n_estimators': _n_estimators, 'learning_rate': _learning_rate } if _estimator == 'gradientboosting': _n_estimators = range(10, 60, 5) _learning_rate = np.logspace(-2, 1, 4) return GradientBoostingRegressor(), { 'n_estimators': _n_estimators, 'learning_rate': _learning_rate } if _estimator == 'lstm': _layers = kwargs.get('layers', [1, self._order, 2 * self._order, 1]) _pct_dropout = kwargs.get('pct_dropout', 0.5) return LSTM(layers=_layers, pct_dropout=_pct_dropout), {}
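A hedged sketch of how the returned (estimator, params) pair is typically consumed; `validator` stands in for an instance of the class that defines _validate_estimator_params, and X, y are placeholder training arrays (all three names are assumptions, not from the original code). Note that the method takes the grid-search options as a plain dict argument, not as keyword arguments.

import numpy as np
from sklearn.model_selection import GridSearchCV

# Request a Lasso estimator together with an alpha grid, then let GridSearchCV tune it.
estimator, params = validator._validate_estimator_params('lasso', {'alpha': np.logspace(-3, 1, 10)})
search = GridSearchCV(estimator, params, cv=5, scoring='neg_mean_squared_error')
search.fit(X, y)
print(search.best_params_, search.best_estimator_)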
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from mlxtend.regressor import StackingCVRegressor

y = dataset['PM']
x = dataset.drop(columns=['PM', 'CBWD'])
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

lr = LinearRegression()
dtr = DecisionTreeRegressor()
svr_rbf = SVR(kernel='rbf', gamma='auto')
knr = KNeighborsRegressor()
ridge = Ridge()
lasso = Lasso()
regression_models = [lr, dtr, svr_rbf, knr, ridge, lasso]

sclf = StackingCVRegressor(regression_models, meta_regressor=ridge)
sclf.fit(x_train, y_train)
pred = sclf.predict(x_test)
print(sclf.score(x_train, y_train))

%matplotlib inline
plt.scatter([i * 10 for i in range(len(y_test))], y_test, c='red', lw=1)
plt.plot([i * 10 for i in range(len(y_test))], pred, c='black', lw=1)
plt.show()
# Read the modified dataset
y = dataset_oversampling['MWD']
X = dataset_oversampling.drop(['Mn', 'MWD'], axis=1)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=32)
min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
# Scale the test set with the parameters learned on the training set
X_test = min_max_scaler.transform(X_test)

# Select the best model parameters through GridSearchCV
parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 0.1, 1, 5, 10, 20, 100]}
las = Lasso()
gsearch = GridSearchCV(estimator=las, param_grid=parameters,
                       scoring='neg_mean_squared_error', n_jobs=4, cv=5)
gsearch.fit(X_train, y_train)
print("Best parameters selected: %s" % gsearch.best_params_)

# Print the R2 and RMSE values of the predictive model & generate the experimental vs. predicted values into excel files
preds = gsearch.predict(X_train)
rmse = np.sqrt(mean_squared_error(y_train, preds))
print("Training data RMSE: %f" % (rmse))
r2 = r2_score(y_train, preds)
print("Training data R2: %f" % (r2))
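The block above reports metrics on the training data only. A natural follow-up, reusing gsearch and the X_test / y_test split created above, is to score the selected model on the held-out test set:

import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# Evaluate the best estimator found by the grid search on unseen data.
test_preds = gsearch.predict(X_test)
print("Test data RMSE: %f" % np.sqrt(mean_squared_error(y_test, test_preds)))
print("Test data R2: %f" % r2_score(y_test, test_preds))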
if linear_cv_mse == [] or mse1 < min(linear_cv_mse): best_lm_mse = mse1 lr_model_best = lr_model linear_cv_mse.append(mse1) # Train the ridge model and save if it is best model rg_model = Ridge(alpha=20) rg_model.fit(X_train1, y_train1) mse2 = mean_squared_error(y_cv, rg_model.predict(X_cv)) if ridge_cv_mse == [] or mse2 < min(ridge_cv_mse): best_rg_mse = mse2 rg_model_best = rg_model ridge_cv_mse.append(mse2) # Train the Lasso model and save if it is best model lasso_model = Lasso(alpha=20) lasso_model.fit(X_train1, y_train1) mse3 = mean_squared_error(y_cv, lasso_model.predict(X_cv)) if lasso_cv_mse == [] or mse3 < min(lasso_cv_mse): best_lasso_mse = mse3 lasso_model_best = lasso_model lasso_cv_mse.append(mse3) ## Print the MSE for the linear best model from CV print("Best Linear model produced ", best_lm_mse, " MSE on CV") linear_predictions = lr_model_best.predict(X_test) linear_mse.append(mean_squared_error(y_test, linear_predictions)) # Print the MSE for the ridge best model from CV print("Best Ridge model produced ", best_rg_mse, " MSE on CV") ridge_predictions = rg_model_best.predict(X_test)
# Linear regression with L2 regularization
X, y = mglearn.datasets.load_extended_boston()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
lr = LinearRegression().fit(X_train, y_train)
print(lr.score(X_train, y_train), lr.score(X_test, y_test))
# L2 regularization: alpha is the regularization parameter; the larger it is, the more the
# coefficients are pushed toward 0 and the better the model generalizes
ridge = Ridge(alpha=0.1).fit(X_train, y_train)
print(ridge.score(X_train, y_train), ridge.score(X_test, y_test))
mglearn.plots.plot_ridge_n_samples()
plt.show()

# L1 regularization
lasso = Lasso(alpha=0.1).fit(X_train, y_train)
print(lasso.score(X_train, y_train), lasso.score(X_test, y_test))

# Linear models for classification
X, y = mglearn.datasets.make_forge()
fig, axes = plt.subplots(1, 2, figsize=(10, 3))
# C is the regularization parameter: the larger C is, the weaker the regularization
for model, ax in zip([LinearSVC(C=1), LogisticRegression(C=1)], axes):
    clf = model.fit(X, y)
    mglearn.plots.plot_2d_separator(clf, X, fill=False, eps=0.5, ax=ax, alpha=0.7)
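To make the effect of the L1 penalty concrete, a small illustrative sweep (reusing the extended_boston train/test split from above; the alpha values are arbitrary) shows that a larger alpha leaves fewer nonzero coefficients:

import numpy as np
from sklearn.linear_model import Lasso

for a in [0.01, 0.1, 1.0]:
    lasso_a = Lasso(alpha=a, max_iter=10000).fit(X_train, y_train)
    print(a, "nonzero coefficients:", int(np.sum(lasso_a.coef_ != 0)),
          "test R^2:", round(lasso_a.score(X_test, y_test), 3))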
# TODO: Add import statements import pandas as pd from sklearn.linear_model import Lasso from sklearn.preprocessing import StandardScaler # Assign the data to predictor and outcome variables # TODO: Load the data train_data = pd.read_csv('data.csv', header=None) X = train_data.iloc[:, :-1] y = train_data.iloc[:, -1] # TODO: Create the standardization scaling object. scaler = StandardScaler() # TODO: Fit the standardization parameters and scale the data. X_scaled = scaler.fit_transform(X) # TODO: Create the linear regression model with lasso regularization. lasso_reg = Lasso() # TODO: Fit the model. lasso_reg.fit(X_scaled, y) # TODO: Retrieve and print out the coefficients from the regression model. reg_coef = lasso_reg.coef_ print(reg_coef)
##########################################################
# Lasso Regression
##########################################################
n_alphas = 100
alphas = np.logspace(-5, 5, n_alphas)
coefs = list()
errors = list()
for a in alphas:
    lasso = Lasso(alpha=a)
    lasso.fit(X_train, y_train)
    coefs.append(lasso.coef_)
    y_pred = lasso.predict(X_test)
    errors.append(round(rmsle(y_test, y_pred), 4))

plt.plot(alphas, errors)
plt.plot(alphas, [baseline_error for _ in alphas])
plt.xscale('log')
plt.ylim([0.1, 0.3])
plt.show()
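As a small follow-up to the sweep above, the best alpha can be read directly off the recorded errors (alphas and errors as computed in the loop):

import numpy as np

best_idx = int(np.argmin(errors))
print("Best alpha:", alphas[best_idx], "with RMSLE:", errors[best_idx])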
ridge_reg = Ridge(alpha=1, solver="cholesky")
ridge_reg.fit(X, y)
ridge_reg.predict([[1.5]])

# Using Stochastic Gradient Descent
sgd_reg = SGDRegressor(penalty="l2")
sgd_reg.fit(X, y.ravel())
sgd_reg.predict([[1.5]])

# Least Absolute Shrinkage and Selection Operator Regression - Lasso Regression
# Similar to Ridge regression, but it uses the l1 norm instead of the l2 norm.
# An important characteristic of Lasso regression is that it tends to completely eliminate the weights of the least
# important features (i.e. set them to zero). In other words, Lasso regression automatically performs feature selection
# and outputs a sparse model.
from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha=0.1)
lasso_reg.fit(X, y)
lasso_reg.predict([[1.5]])

# ElasticNet - Elastic Net is a middle ground between Ridge Regression and Lasso Regression. The regularization term is a simple
# mix of both Ridge's and Lasso's regularization terms, and you can control the mix ratio r. When r=0, Elastic Net is equivalent
# to Ridge Regression, and when r=1 it is equivalent to Lasso Regression.
# J(theta) = MSE + r * (Lasso regularization term) + (1 - r) * (Ridge regularization term)
# It is almost always preferable to have at least a little bit of regularization, so generally you should avoid plain
# Linear Regression. Ridge is a good default, but if you suspect that only a few features are actually useful, you should
# prefer Lasso or Elastic Net since they tend to reduce the useless features' weights down to zero. In general, Elastic Net
# is preferred over Lasso since Lasso may behave erratically when the number of features is greater than the number
# of training instances or when several features are strongly correlated.
# l1_ratio corresponds to the mix ratio r
from sklearn.linear_model import ElasticNet
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.1)
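A toy illustration of the feature-selection behaviour described above: on synthetic data where only a few features matter, Lasso drives most coefficients exactly to zero while Ridge only shrinks them (the data and alpha values are chosen arbitrarily for the demonstration).

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge, Lasso

Xs, ys = make_regression(n_samples=100, n_features=20, n_informative=3, noise=1.0, random_state=0)
ridge_demo = Ridge(alpha=1.0).fit(Xs, ys)
lasso_demo = Lasso(alpha=1.0).fit(Xs, ys)
print("Ridge coefficients exactly zero:", int(np.sum(ridge_demo.coef_ == 0)))  # typically none
print("Lasso coefficients exactly zero:", int(np.sum(lasso_demo.coef_ == 0)))  # typically most of the 20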
train["SalePrice"] = np.log1p(train["SalePrice"]) numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index skewed_feats = train[numeric_feats].apply( lambda x: skew(x.dropna())) #compute skewness skewed_feats = skewed_feats[skewed_feats > 0.65] skewed_feats = skewed_feats.index all_data[skewed_feats] = boxcox1p(all_data[skewed_feats], 0.14) all_data = pd.get_dummies(all_data) all_data = all_data.fillna(all_data.mean()) X_train = all_data[:train.shape[0]] X_test = all_data[train.shape[0]:] y = train.SalePrice #### models selection lasso = Lasso(alpha=0.0002) model = lasso ### prediction model.fit(X_train, y) preds = np.expm1(model.predict(X_test)) solution = pd.DataFrame({"id": test.Id, "SalePrice": preds}) solution.to_csv("full_features_lasso.csv", index=False)
def set_algorithm(algorithm, *args, **kwargs):
    """ Setup the algorithm to use in subsequent prediction analyses.

    Args:
        algorithm: The prediction algorithm to use. Either a string or an
            (uninitialized) scikit-learn prediction object. If string, must be one of
            'svm','svr','linear','logistic','lasso','lassopcr','lassoCV','ridge',
            'ridgeCV','ridgeClassifier','randomforest', or 'randomforestClassifier'
        kwargs: Additional keyword arguments to pass onto the scikit-learn
            prediction object.

    Returns:
        predictor_settings: dictionary of settings for prediction
    """
    # NOTE: function currently located here instead of analysis.py to avoid circular imports
    predictor_settings = {}
    predictor_settings['algorithm'] = algorithm

    def load_class(import_string):
        class_data = import_string.split(".")
        module_path = '.'.join(class_data[:-1])
        class_str = class_data[-1]
        module = importlib.import_module(module_path)
        return getattr(module, class_str)

    algs_classify = {
        'svm': 'sklearn.svm.SVC',
        'logistic': 'sklearn.linear_model.LogisticRegression',
        'ridgeClassifier': 'sklearn.linear_model.RidgeClassifier',
        'ridgeClassifierCV': 'sklearn.linear_model.RidgeClassifierCV',
        'randomforestClassifier': 'sklearn.ensemble.RandomForestClassifier'
    }
    algs_predict = {
        'svr': 'sklearn.svm.SVR',
        'linear': 'sklearn.linear_model.LinearRegression',
        'lasso': 'sklearn.linear_model.Lasso',
        'lassoCV': 'sklearn.linear_model.LassoCV',
        'ridge': 'sklearn.linear_model.Ridge',
        'ridgeCV': 'sklearn.linear_model.RidgeCV',
        'randomforest': 'sklearn.ensemble.RandomForestRegressor'
    }
    if algorithm in algs_classify.keys():
        predictor_settings['prediction_type'] = 'classification'
        alg = load_class(algs_classify[algorithm])
        predictor_settings['predictor'] = alg(*args, **kwargs)
    elif algorithm in algs_predict:
        predictor_settings['prediction_type'] = 'prediction'
        alg = load_class(algs_predict[algorithm])
        predictor_settings['predictor'] = alg(*args, **kwargs)
    elif algorithm == 'lassopcr':
        predictor_settings['prediction_type'] = 'prediction'
        from sklearn.linear_model import Lasso
        from sklearn.decomposition import PCA
        predictor_settings['_lasso'] = Lasso()
        predictor_settings['_pca'] = PCA()
        predictor_settings['predictor'] = Pipeline(
            steps=[('pca', predictor_settings['_pca']),
                   ('lasso', predictor_settings['_lasso'])])
    elif algorithm == 'pcr':
        predictor_settings['prediction_type'] = 'prediction'
        from sklearn.linear_model import LinearRegression
        from sklearn.decomposition import PCA
        predictor_settings['_regress'] = LinearRegression()
        predictor_settings['_pca'] = PCA()
        predictor_settings['predictor'] = Pipeline(
            steps=[('pca', predictor_settings['_pca']),
                   ('regress', predictor_settings['_regress'])])
    else:
        raise ValueError("""Invalid prediction/classification algorithm name.
            Valid options are 'svm','svr', 'linear', 'logistic', 'lasso',
            'lassopcr','lassoCV','ridge','ridgeCV','ridgeClassifier',
            'randomforest', or 'randomforestClassifier'.""")
    return predictor_settings
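A brief usage sketch for set_algorithm: keyword arguments are forwarded to the underlying scikit-learn estimator, so a Lasso with a specific alpha can be requested like this (the alpha value is arbitrary).

settings = set_algorithm('lasso', alpha=0.1)
print(settings['prediction_type'])  # 'prediction'
print(settings['predictor'])        # Lasso(alpha=0.1)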
import numpy as np # Regressions from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDClassifier, SGDRegressor from sklearn.svm import SVR from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor # Classifiers from sklearn.linear_model import LogisticRegression from sklearn.svm import SVC from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier models_regression = [LinearRegression(), Ridge(random_state=42, max_iter=100), Lasso(random_state=42, max_iter=100), SVR(gamma='scale'), AdaBoostRegressor(random_state=42, n_estimators=10), GradientBoostingRegressor(random_state=42, max_depth=3, n_estimators=10), RandomForestRegressor(n_estimators=10, random_state=42, max_depth=3)] models_classification = [LogisticRegression(solver='lbfgs', max_iter=100, random_state=42), SVC(gamma='scale', max_iter=100), AdaBoostClassifier(random_state=42, n_estimators=10), GradientBoostingClassifier(random_state=42, max_depth=3, n_estimators=10), RandomForestClassifier(n_estimators=10, random_state=42, max_depth=3)] def test_sklearn_estimator(): ds = vaex.ml.datasets.load_iris() features = ['sepal_length', 'sepal_width', 'petal_length']
# Generate synthetic images, and projections l = 128 proj_operator = build_projection_operator(l, l / 7.) data = generate_synthetic_data() proj = proj_operator * data.ravel()[:, np.newaxis] proj += 0.15 * np.random.randn(*proj.shape) # Reconstruction with L2 (Ridge) penalization rgr_ridge = Ridge(alpha=0.2) rgr_ridge.fit(proj_operator, proj.ravel()) rec_l2 = rgr_ridge.coef_.reshape(l, l) # Reconstruction with L1 (Lasso) penalization # the best value of alpha was determined using cross validation # with LassoCV rgr_lasso = Lasso(alpha=0.001) rgr_lasso.fit(proj_operator, proj.ravel()) rec_l1 = rgr_lasso.coef_.reshape(l, l) plt.figure(figsize=(8, 3.3)) plt.subplot(131) plt.imshow(data, cmap=plt.cm.gray, interpolation='nearest') plt.axis('off') plt.title('original image') plt.subplot(132) plt.imshow(rec_l2, cmap=plt.cm.gray, interpolation='nearest') plt.title('L2 penalization') plt.axis('off') plt.subplot(133) plt.imshow(rec_l1, cmap=plt.cm.gray, interpolation='nearest') plt.title('L1 penalization')
def test_binary_treatments(self): np.random.seed(123) # Generate data with binary treatments log_odds = np.dot(TestOrthoForest.W[:, TestOrthoForest.support], TestOrthoForest.coefs_T) + \ TestOrthoForest.eta_sample(TestOrthoForest.n) T_sigmoid = 1 / (1 + np.exp(-log_odds)) T = np.array([np.random.binomial(1, p) for p in T_sigmoid]) TE = np.array([self._exp_te(x) for x in TestOrthoForest.X]) Y = np.dot(TestOrthoForest.W[:, TestOrthoForest.support], TestOrthoForest.coefs_Y) + \ T * TE + TestOrthoForest.epsilon_sample(TestOrthoForest.n) # Instantiate model with default params. Using n_jobs=1 since code coverage # does not work well with parallelism. est = DROrthoForest(n_trees=10, n_jobs=1, propensity_model=LogisticRegression(), model_Y=Lasso(), propensity_model_final=LogisticRegressionCV( penalty='l1', solver='saga'), model_Y_final=WeightedLassoCVWrapper()) # Test inputs for binary treatments # --> Check that one can pass in regular lists est.fit(list(Y), list(T), X=list(TestOrthoForest.X), W=list(TestOrthoForest.W)) # --> Check that it fails correctly if lists of different shape are passed in self.assertRaises(ValueError, est.fit, Y[:TestOrthoForest.n // 2], T[:TestOrthoForest.n // 2], TestOrthoForest.X, TestOrthoForest.W) # --> Check that it works when T, Y have shape (n, 1) est.fit(Y.reshape(-1, 1), T.reshape(-1, 1), X=TestOrthoForest.X, W=TestOrthoForest.W) # --> Check that it fails correctly when T has shape (n, 2) self.assertRaises(ValueError, est.fit, Y, np.ones((TestOrthoForest.n, 2)), TestOrthoForest.X, TestOrthoForest.W) # --> Check that it fails correctly when the treatments are not numeric self.assertRaises(ValueError, est.fit, Y, np.array(["a"] * TestOrthoForest.n), TestOrthoForest.X, TestOrthoForest.W) # Check that outputs have the correct shape out_te = est.const_marginal_effect(TestOrthoForest.x_test) self.assertSequenceEqual((TestOrthoForest.x_test.shape[0], 1, 1), out_te.shape) # Test binary treatments with controls est = DROrthoForest(n_trees=100, min_leaf_size=10, max_depth=30, subsample_ratio=0.30, bootstrap=False, n_jobs=1, propensity_model=LogisticRegression(C=1 / 0.024, penalty='l1', solver='saga'), model_Y=Lasso(alpha=0.024), propensity_model_final=LogisticRegressionCV( penalty='l1', solver='saga'), model_Y_final=WeightedLassoCVWrapper()) est.fit(Y, T, X=TestOrthoForest.X, W=TestOrthoForest.W, inference="blb") self._test_te(est, TestOrthoForest.expected_exp_te, tol=0.7, treatment_type='discrete') self._test_ci(est, TestOrthoForest.expected_exp_te, tol=1.5, treatment_type='discrete') # Test binary treatments without controls log_odds = TestOrthoForest.eta_sample(TestOrthoForest.n) T_sigmoid = 1 / (1 + np.exp(-log_odds)) T = np.array([np.random.binomial(1, p) for p in T_sigmoid]) Y = T * TE + TestOrthoForest.epsilon_sample(TestOrthoForest.n) est.fit(Y, T, X=TestOrthoForest.X, inference="blb") self._test_te(est, TestOrthoForest.expected_exp_te, tol=0.5, treatment_type='discrete') self._test_ci(est, TestOrthoForest.expected_exp_te, tol=1.5, treatment_type='discrete')
def trainModel(train, out_p=None, method="ET", is_regr=False, logger=None): """ Train a regression or classification model F such that Y=F(X) Input: train (dictionary): the training data that looks like {"X": df_X, "Y": df_Y, "C": df_C} ...train["X"] is the feature, output from the computeFeatures() function in computeFeatures.py ...train["Y"] is the response, ouput from the computeFeatures() function in computeFeatures.py ...train["C"] is the crowd information, output from the computeFeatures() function also out_p (str): the path for saving the trained model (optional) method (str): the method for training the model is_regr (bool): regression or classification (see computeFeatures.py) logger: the python logger created by the generateLogger() function Output: model: the trained machine learning model """ log("Training model with " + str(train["X"].shape[1]) + " features...", logger) # Build model multi_output = bool(len(train["Y"]) > 1 and train["Y"].shape[1] > 1) if is_regr: if method == "RF": model = RandomForestRegressor(n_estimators=200, max_features=90, min_samples_split=2, n_jobs=-1) elif method == "ET": model = ExtraTreesRegressor(n_estimators=200, max_features=180, min_samples_split=32, n_jobs=-1) elif method == "SVM": model = SVR(max_iter=1000, C=100, gamma=0.01) if multi_output: model = MultiOutputRegressor(model, n_jobs=-1) elif method == "RLR": model = HuberRegressor(max_iter=1000) if multi_output: model = MultiOutputRegressor(model, n_jobs=-1) elif method == "LR": model = LinearRegression() if multi_output: model = MultiOutputRegressor(model, n_jobs=-1) elif method == "EN": model = ElasticNet(alpha=0.1, l1_ratio=0.5, max_iter=1000) if multi_output: model = MultiOutputRegressor(model, n_jobs=-1) elif method == "LA": model = Lasso(alpha=0.01, max_iter=1000) if multi_output: model = MultiOutputRegressor(model, n_jobs=-1) elif method == "MLP": model = MLPRegressor(hidden_layer_sizes=(128, 64)) elif method == "KN": model = KNeighborsRegressor(n_neighbors=10, weights="uniform") elif method == "DT": model = DecisionTreeRegressor() else: m = method[:2] if m in ["RF", "ET"]: # parse tuning parameters p = method.split("-") log( p[0] + ", n_estimators=" + p[1] + ", max_features=" + p[2] + ", min_samples_split=" + p[3], logger) for i in range(1, len(p)): if p[i] == "None": p[i] = None elif p[i] == "auto": p[i] = "auto" else: p[i] = int(p[i]) if m == "RF": model = RandomForestRegressor(n_estimators=p[1], max_features=p[2], min_samples_split=p[3], random_state=0, n_jobs=-1) elif m == "ET": model = ExtraTreesRegressor(n_estimators=p[1], max_features=p[2], min_samples_split=p[3], random_state=0, n_jobs=-1) else: log("ERROR: method " + method + " is not supported", logger) return None else: if method == "RF": model = RandomForestClassifier(n_estimators=1000, max_features=30, min_samples_split=2, n_jobs=-1) elif method == "ET": model = ExtraTreesClassifier(n_estimators=1000, max_features=60, min_samples_split=32, n_jobs=-1) elif method == "SVM": model = SVC(max_iter=5000, kernel="rbf", probability=True) elif method == "MLP": model = MLPClassifier(hidden_layer_sizes=(128, 64)) elif method == "KN": model = KNeighborsClassifier(n_neighbors=10, weights="uniform") elif method == "LG": model = LogisticRegression(penalty="l1", C=1) elif method == "HCR": model = ExtraTreesClassifier(n_estimators=1000, max_features=90, min_samples_split=32, n_jobs=-1) model = HybridCrowdClassifier(base_estimator=model, logger=logger) elif method == "CR": model = HybridCrowdClassifier(logger=logger) elif method == "DT": 
model = DecisionTreeClassifier(min_samples_split=20, max_depth=8, min_samples_leaf=5) elif method == "Base1": model = DummyClassifier(strategy="stratified") elif method == "Base2": model = DummyClassifier(strategy="uniform") elif method == "Base3": model = DummyClassifier(strategy="constant", constant=1) else: m = method[:2] if m in ["RF", "ET"]: # parse tuning parameters p = method.split("-") log( p[0] + ", n_estimators=" + p[1] + ", max_features=" + p[2] + ", min_samples_split=" + p[3], logger) for i in range(1, len(p)): if p[i] == "None": p[i] = None elif p[i] == "auto": p[i] = "auto" else: p[i] = int(p[i]) if m == "RF": model = RandomForestClassifier(n_estimators=p[1], max_features=p[2], min_samples_split=p[3], random_state=0, n_jobs=-1) elif m == "ET": model = ExtraTreesClassifier(n_estimators=p[1], max_features=p[2], min_samples_split=p[3], random_state=0, n_jobs=-1) else: log("ERROR: method " + method + " is not supported", logger) return None X, Y = copy.deepcopy(train["X"]), copy.deepcopy(train["Y"]) # For one-class classification task, we only want to use the minority class (because we are sure that they are labeled) if not is_regr and method == "IF": y_minor = findLeastCommon(Y) select_y = (Y == y_minor) X, Y = X[select_y], Y[select_y] # Fit data to the model model.fit(X, np.squeeze(Y)) # Save and return model if out_p is not None: joblib.dump(model, out_p) log("Model saved at " + out_p, logger) return model
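A hedged usage sketch for trainModel: the DataFrames below are made-up stand-ins for the output of computeFeatures() described in the docstring, and "LA" selects the Lasso branch above.

import numpy as np
import pandas as pd

df_X = pd.DataFrame(np.random.rand(100, 5), columns=["f%d" % i for i in range(5)])
df_Y = pd.DataFrame({"y": np.random.rand(100)})
train = {"X": df_X, "Y": df_Y, "C": None}  # "C" (crowd information) is not used by the Lasso branch
lasso_model = trainModel(train, method="LA", is_regr=True)
print(lasso_model.coef_)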
X, Y = make_friedman1(n_samples=500, n_features=5)

###########################
# Plot the data.
fig = plt.figure(figsize=(5, 5))
ax = plt.subplot()
ax.plot(X[:, 0], Y, '.')

##########################
# We choose a linear regression model with a constraint on the coefficients,
# `Lasso <http://scikit-learn.org/stable/modules/
# generated/sklearn.linear_model.Lasso.html>`_.
reglin = Lasso()
reglin.fit(X, Y)

##############################
# Optimizing the model produces a line whose coefficients are:
print(reglin.coef_, reglin.intercept_)

###############################
# We take the first plot again and add the line corresponding to the
# linear regression on the first dimension only.
reglin = Lasso()
reglin.fit(X[:, :1], Y)
plt.ylabel('regression coefficients');

# # 6. Lasso

# In[64]:

# Lasso lives under sklearn.linear_model
from sklearn.linear_model import Lasso

# In[65]:

las = Lasso(alpha=0.05)  # alpha is the penalty coefficient; the larger it is, the stronger the penalty
las.fit(aba.iloc[:, :-1], aba.iloc[:, -1])

# In[67]:

las.coef_

# In[68]:

def regularize(xMat, yMat):
    inxMat = xMat.copy()  # copy the data
    inyMat = yMat.copy()