def full_weighted_cv(X, y, Ds, lambda_gtv=np.linspace(.1, 1, 10), lambda_lasso=None,
                     t=50, auto_cv=True, alpha=.9, k=5):
    errors = []
    X_train, X_test, y_train, y_test = temporal_split(X, y, t)
    if alpha < 1:
        # Exponentially down-weight older observations (applied as row scaling by sqrt of the weights).
        n = X_train.shape[0]
        weights = np.array([alpha**(n - i) for i in np.arange(1, n + 1)])
        X_train = X_train * np.sqrt(weights.reshape(-1, 1))
        y_train = y_train * np.sqrt(weights)
    n, p = X_train.shape
    # test errors
    for l1 in lambda_gtv:
        for m in Ds:
            D = Ds[m]
            if auto_cv:
                XD, bigY, invD = augmented_system_lasso(X_train, y_train, D, l1, 0, l1_only=True)
                fit = cvglmnet(x=XD, y=bigY, family='gaussian', ptype='mse', nfolds=5)
                b = cvglmnetCoef(fit, s='lambda_min')
                l3 = fit['lambda_min'][0]
                beta = invD @ b.reshape(b.shape[0])[1:]
                mset, r2t = compute_errors(y_train, X_train @ beta)
                mse, r2 = compute_errors(y_test, X_test @ beta)
                errors.append([m, l1, l3, mset, r2t, mse, r2])
            else:
                for l3 in lambda_lasso:
                    XD, bigY, invD = augmented_system_lasso(X_train, y_train, D, l1 / l3, 0, l1_only=True)
                    # XD, bigY, invD = epsilon_system_lasso(X_train, y_train, D, l1)
                    fit = glmnet(x=XD, y=bigY)
                    b = glmnetCoef(fit, s=scipy.float64([l3]), exact=False)
                    beta = invD @ b.reshape(b.shape[0])[1:]
                    mset, r2t = compute_errors(y_train, X_train @ beta)
                    mse, r2 = compute_errors(y_test, X_test @ beta)
                    errors.append([m, l1, l3, mset, r2t, mse, r2])
    df = pd.DataFrame(errors, columns=['method', 'lambda_tv', 'lambda_1',
                                       'train_mse', 'train_r2', 'test_mse', 'test_r2'])
    return df
def extract_rules(self, var_name_list=None, lambda_n=None, return_coefs=True):
    """
    Extract all rules for the selected model and return them as a list of strings
    (if lambda_n is None, self.lasso_best_lambda is used).

    INPUT :
        lambda_n : int or None, index of the selected lambda. If None,
            self.lasso_best_lambda is used.
        var_name_list : initial variable names.
        return_coefs : bool, if True, a DataFrame with the corresponding
            coefficients is returned.

    RETURN :
        List of strings with the rules (or a DataFrame if return_coefs is True).
    """
    # If the index of lambda is given, extract rules from the corresponding lasso model;
    # otherwise use the rules of best_lambda.
    sel_lambda = lambda_n
    if sel_lambda is None:
        sel_lambda = self.lasso_best_lambda
    # Compute the variables where the LASSO coefficient is non-zero.
    # List of couples of variables (e.g. ['(0,1)', '0', ..., '(12,14)']).
    # Start from 1 because the first coefficient corresponds to the intercept.
    sel_cols = np.argwhere(glmnetCoef(self.lasso_mod)[1:, sel_lambda] != 0)[:, 0]
    list_var_string = list(self.sel_var_names[sel_cols])
    # Compute rules
    rules = []
    for var_string in list_var_string:
        rule = self.var_to_string(var_string, var_name_list)
        rules.append(rule)
    if return_coefs:
        coefs = glmnetCoef(self.lasso_mod)[1:, sel_lambda][sel_cols]
        rules = pd.DataFrame(np.array([rules, coefs]).T, columns=['Rules', 'Lasso Coef.'])
        rules = rules.astype({'Lasso Coef.': 'float'})
    return rules
def weighted_gtv(X, y, D, l1, l3, alpha=.9):
    if alpha < 1:
        # Exponentially down-weight older observations.
        n = X.shape[0]
        weights = np.array([alpha**(n - t) for t in np.arange(1, n + 1)])
        X = X * np.sqrt(weights.reshape(-1, 1))
        y = y * np.sqrt(weights)
    XD, bigY, invD = augmented_system_lasso(X, y, D, l1 / l3, 0, l1_only=True)
    fit = glmnet(x=XD, y=bigY)
    b = glmnetCoef(fit, s=scipy.float64([l3]), exact=False)
    beta = invD @ b.reshape(b.shape[0])[1:]
    return beta
def lassotrans(beta, w, y, omega, lam, eta, offset):
    n = y.shape[0]
    p = beta.shape[0]  # dimension of beta; `p` was not defined in the original snippet and is inferred here
    expterm = np.exp(np.matmul(w, beta) - offset
                     - np.matmul(np.matmul(np.transpose(beta), omega), beta) / 2)
    omegabeta = np.matmul(omega, beta)
    omegabeta = omegabeta[np.newaxis, :]
    # Gradient dL/dbeta of the loss at the current beta.
    dLbeta = -(np.matmul(np.transpose(y), w)
               - np.matmul(np.transpose(expterm), (w - omegabeta))) / n
    # Recast the proximal gradient update as a lasso problem and solve it with glmnet.
    Y = p * np.sqrt(eta / 2) * (beta - dLbeta / eta)
    X = p * np.eye(p) * np.sqrt(eta / 2)
    fit = glmnet(x=scipy.float64(X), y=scipy.float64(Y),
                 lambdau=scipy.float64([lam]), intr=False)
    beta = np.array(glmnetCoef(fit, s=scipy.float64([0])))[1:, 0]  # clf.coef_
    return beta
def elastic_coxph(x, surv, cen, x_names, alp=False, lam=False):
    # Survival response for the Cox family: column 0 = time, column 1 = event indicator (1 - censoring).
    y = np.stack([surv, 1 - cen], axis=1)
    if lam is False:
        lam = [0.1, 0.5, 1]
    if alp is False:
        alp = [0.1, 0.5, 1]
    result = []
    for a in tqdm(alp):
        fit = glmnet.glmnet(x=x.copy(), y=y.copy(), family='cox', alpha=a, nlambda=100)
        for l in lam:
            beta = glmnetCoef.glmnetCoef(fit, s=scipy.float64([l]), exact=False)
            beta = beta.flatten()
            try:
                features = x_names[np.array([int(i) for i, e in enumerate(beta) if e != 0])]
            except Exception:
                features = []
            result.append((a, l, features, beta))
    return result
def cvglmnetCoef(obj, s=None):
    if s is None or len(s) == 0:
        s = obj['lambda_1se']

    if isinstance(s, scipy.ndarray):
        lambdau = s
    elif isinstance(s, str):
        sbase = ['lambda_1se', 'lambda_min']
        indxtf = [x.startswith(s.lower()) for x in sbase]
        # find the index of the matching option in sbase
        sind = [i for i in range(len(indxtf)) if indxtf[i]]
        s = sbase[sind[0]]
        lambdau = obj[s]
    else:
        raise ValueError('Invalid form of s')

    result = glmnetCoef(obj['glmnet_fit'], lambdau)

    return result
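# A minimal usage sketch for cvglmnetCoef (not part of the original code): fit a
# cross-validated gaussian model on synthetic data, then read the coefficients back at
# 'lambda_min' and at the default 'lambda_1se'. Imports follow the module-qualified
# style of the other snippets; adjust them to however glmnet_python is installed.
import numpy as np
import cvglmnet
import cvglmnetCoef

np.random.seed(1)
x_demo = np.random.normal(size=(100, 20))
y_demo = np.random.normal(size=(100, 1))

cvfit_demo = cvglmnet.cvglmnet(x=x_demo.copy(), y=y_demo.copy(), ptype='mse', nfolds=10)
print(cvglmnetCoef.cvglmnetCoef(cvfit_demo, s='lambda_min'))  # coefficients at the CV-optimal lambda
print(cvglmnetCoef.cvglmnetCoef(cvfit_demo))                  # s=None falls back to obj['lambda_1se']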
def fit(self, train_x, train_y, feature_names):
    self._feature_names = feature_names

    print("Start fitting LDA...")
    tic = time.time()
    self.lda = LatentDirichletAllocation(n_components=self.n_topics, learning_method='online',
                                         random_state=self.seed, n_jobs=8)
    # train_x: n_samples * n_features --> thetas: n_samples * n_topics
    thetas = self.lda.fit_transform(train_x)
    toc = time.time()
    print("Finish fitting LDA... time spent {} seconds.".format(toc - tic))

    # Find beta. Modified from George's demo.
    print("Start fitting CoxPH...")
    tic = time.time()
    fit = glmnet(
        x=thetas.copy(),
        y=train_y.copy(),
        family='cox',
        alpha=self._alpha,
        standardize=False,  # we performed our own standardization
        intr=False)
    self.beta = glmnetCoef(fit, s=np.array([self._lambda])).flatten()
    toc = time.time()
    print("Finish fitting CoxPH... time spent {} seconds.".format(toc - tic))

    observed_times = train_y[:, 0]
    event_indicators = train_y[:, 1]

    # For each observed time, how many times the event occurred
    event_counts = Counter()
    for t, r in zip(observed_times, event_indicators):
        event_counts[t] += int(r)

    # Sorted list of observed times
    self.sorted_unique_times = np.sort(list(event_counts.keys()))
    self.num_unique_times = len(self.sorted_unique_times)
    self.log_baseline_hazard = np.zeros(self.num_unique_times)
def lasso_coef(self):
    res = glmnetCoef(self.lasso_mod)
    if self.lasso_best_lambda is not None:
        res = res[:, self.lasso_best_lambda]
    return res
t = scipy.ones((50, 1), dtype=scipy.float64)
wts = scipy.row_stack((t, 2 * t))

# call glmnet
fit = glmnet.glmnet(x=x.copy(), y=y.copy(), family='gaussian',
                    weights=wts,
                    alpha=0.2, nlambda=20)
glmnetPrint.glmnetPrint(fit)
glmnetPlot.glmnetPlot(fit, xvar='lambda', label=True)
glmnetPlot.glmnetPlot(fit, xvar='dev', label=True)

any(fit['lambdau'] == 0.5)

coefApprx = glmnetCoef.glmnetCoef(fit, s=scipy.float64([0.5]), exact=False)
print(coefApprx)

fc = glmnetPredict.glmnetPredict(fit, x[0:5, :], ptype='response',
                                 s=scipy.float64([0.05]))
print(fc)

cvfit = cvglmnet.cvglmnet(x=x.copy(), y=y.copy(), ptype='mse', nfolds=20)
cvfit['lambda_min']
cvglmnetCoef.cvglmnetCoef(cvfit, s='lambda_min')
#%%
cvglmnetPredict.cvglmnetPredict(cvfit, newx=x[0:5, ], s='lambda_min')
#%%
foldid = scipy.random.choice(10, size=y.shape[0], replace=True)
# Reweighting columns by coeff. randomization
Xmod = np.zeros((M_tmp, N))
Xmod[:, w_on] = w * Xbs[:, w_on]
Xmod[:, w_off] = Xbs[:, w_off]
fit = glmnet(x=Xmod, y=Ybs, family='gaussian', alpha=1.0,
             maxit=10**8, intr=False, standardize=False, thresh=1e-10,
             lambdau=np.array([0.02, lambda1]))
glmnet_ret = glmnetCoef(fit)
betaV[w_on, nexp] = w * glmnet_ret[:, 1][1:NEXP + 1][w_on]
betaV[w_off, nexp] = glmnet_ret[:, 1][1:NEXP + 1][w_off]

endTime = time.time()
t2 = endTime - startTime
print([t1, t2])  # elapsed time

# Mean value of beta
plt.figure(1)
_, NEXP = betaV.shape
plt.scatter(fit_AMPR_beta, np.mean(betaV[:, 0:NEXP], 1), color='blue',
importlib.reload(cvglmnetPlot)
importlib.reload(cvglmnetPredict)

# parameters
baseDataDir = '../data/'

# load data
x = scipy.loadtxt(baseDataDir + 'PoissonExampleX.dat', dtype=scipy.float64, delimiter=',')
y = scipy.loadtxt(baseDataDir + 'PoissonExampleY.dat', dtype=scipy.float64, delimiter=',')

# call glmnet
fit = glmnet.glmnet(x=x.copy(), y=y.copy(), family='poisson')
glmnetPlot.glmnetPlot(fit)
glmnetCoef.glmnetCoef(fit, s=scipy.float64([1.0]))
f = glmnetPredict.glmnetPredict(fit, x[0:5, :], ptype='response',
                                s=scipy.float64([0.1, 0.01]))
print(f)
cvfit = cvglmnet.cvglmnet(x.copy(), y.copy(), family='poisson')
optlam = scipy.array([cvfit['lambda_min'], cvfit['lambda_1se']]).reshape(2, )
cvglmnetCoef.cvglmnetCoef(cvfit, s=optlam)
for i in range(int(len(test_sample) / chunksize)):  # looping avoids MemoryError
    predictions[(i * chunksize):((i + 1) * chunksize), :] = glmnetPredict(
        fit,
        all_counts[test_sample[(i * chunksize):((i + 1) * chunksize)], :],
        ptype='response')
predictions[((i + 1) * chunksize):, :] = glmnetPredict(
    fit,
    all_counts[test_sample[((i + 1) * chunksize):], :],
    ptype='response')

for i in range(num_s):
    test_auc = metrics.roc_auc_score(
        pt_data['notes'].loc[test_sample].surv_12mo, predictions[:, i])
    print(i, fit['lambdau'][i], fit['df'][i], test_auc)

best_lambda_i = 30
coefs = glmnetCoef(fit, s=scipy.float64([fit['lambdau'][best_lambda_i]]))[1:].flatten()
features = pd.DataFrame({
    'feature': count_vect.get_feature_names(),
    'coef': coefs
})
features.loc[:, 'coef_abs'] = abs(features.coef)
features_sorted = features.sort_values(by='coef_abs', ascending=False)
features_sorted = features_sorted.reset_index()

selected_terms = features.feature.loc[abs(coefs) > 0].tolist()
selected_terms_frame = pd.DataFrame({'feature': selected_terms})
selected_terms_frame.loc[:, 'exclude'] = 0
selected_terms_frame.to_csv(output_dir + 'models/' + model_config['model_name'] +
                            '/text_features_unedited.csv', index=False)
    print(glmnetPredict(fit1, scipy.empty([0]), scipy.empty([0]), 'coefficients'))
    fit2 = glmnet.glmnet(x=x.copy(), y=g2.copy(), family='binomial')
    print(glmnetPredict(fit2, x[2:5, :], scipy.empty([0]), 'response'))
    print(glmnetPredict(fit2, scipy.empty([0]), scipy.empty([0]), 'nonzero'))
    fit3 = glmnet.glmnet(x=x.copy(), y=g4.copy(), family='multinomial')
    print(glmnetPredict(fit3, x[0:3, :], scipy.array([0.01]), 'response'))
    print(glmnetPredict(fit3, x[0:3, :], scipy.array([0.01, 0.5]), 'response'))

elif section == 8:
    x = scipy.random.rand(100, 20)
    y = scipy.random.rand(100, 1)
    fit = glmnet.glmnet(x=x.copy(), y=y.copy())
    ncoef = glmnetCoef(fit, scipy.array([0.01, 0.001]))

elif section == 9:
    scipy.random.seed(1)
    x = scipy.random.normal(size=(100, 20))
    y = scipy.random.normal(size=(100, 1))
    g2 = scipy.random.choice(2, size=(100, 1)) * 1.0
    g4 = scipy.random.choice(4, size=(100, 1)) * 1.0
    plt.figure()
    fit1 = cvglmnet(x=x.copy(), y=y.copy())
    cvglmnetPlot(fit1)
    plt.figure()
    fit2 = cvglmnet(x=x.copy(), y=g2.copy(), family='binomial')
    cvglmnetPlot(fit2)
def fit(self, train_x, train_y, feature_names, duration_col='LOS', event_col='OUT'):
    """
    Given the train dataset, we first use glmnet to find the beta (for regression).
    Then we calculate the log baseline hazard (implemented by George, modified by Ren).

    :param train_df: DataFrame, with the duration and the event column
    :param duration_col: the column name for duration
    :param event_col: the column name for event
    """
    train_df = pd.DataFrame(data=train_x, columns=feature_names)
    train_df[duration_col] = train_y[:, 0]
    train_df[event_col] = train_y[:, 1]

    self._feature_names = feature_names
    self._duration_col = duration_col
    self._event_col = event_col

    train_df = self._standardize_df(train_df, flag='train')
    train_y = train_df[[duration_col, event_col]].values
    train_x = train_df.drop(columns=[duration_col, event_col]).values

    # Find beta. Modified from George's demo.
    fit = glmnet(
        x=train_x.copy(),
        y=train_y.copy(),
        family='cox',
        alpha=self._alpha,
        standardize=False,  # we performed our own standardization
        intr=False)
    self.beta = glmnetCoef(fit, s=np.array([self._lambda])).flatten()
    # self.beta = cph_kera(x=train_x.copy(), y=train_y.copy(),
    #                      alpha=self._alpha, lmbda=self._lambda, standardize=True)

    observed_times = train_y[:, 0]
    event_indicators = train_y[:, 1]

    # For each observed time, how many times the event occurred
    event_counts = Counter()
    for t, r in zip(observed_times, event_indicators):
        event_counts[t] += int(r)

    # Sorted list of observed times
    self.sorted_unique_times = np.sort(list(event_counts.keys()))
    self.num_unique_times = len(self.sorted_unique_times)
    self.log_baseline_hazard = np.zeros(self.num_unique_times)

    # Calculate the log baseline hazard. Implemented by George.
    # For each unique time t: log h0(t) = log(#events at t) - log(sum over the risk set of exp(beta . x_j)).
    for time_idx, t in enumerate(self.sorted_unique_times):
        logsumexp_args = []
        for subj_idx, observed_time in enumerate(observed_times):
            if observed_time >= t:
                logsumexp_args.append(np.inner(self.beta, train_x[subj_idx]))
        if event_counts[t] > 0:
            self.log_baseline_hazard[time_idx] = \
                np.log(event_counts[t]) - logsumexp(logsumexp_args)
        else:
            self.log_baseline_hazard[time_idx] = \
                -np.inf - logsumexp(logsumexp_args)
    print(glmnetPredict(fit1, x[0:5, :], np.array([0.01, 0.005])))
    print(glmnetPredict(fit1, np.empty([0]), np.empty([0]), 'coefficients'))
    fit2 = glmnet.glmnet(x=x.copy(), y=g2.copy(), family='binomial')
    print(glmnetPredict(fit2, x[2:5, :], np.empty([0]), 'response'))
    print(glmnetPredict(fit2, np.empty([0]), np.empty([0]), 'nonzero'))
    fit3 = glmnet.glmnet(x=x.copy(), y=g4.copy(), family='multinomial')
    print(glmnetPredict(fit3, x[0:3, :], np.array([0.01]), 'response'))
    print(glmnetPredict(fit3, x[0:3, :], np.array([0.01, 0.5]), 'response'))

elif section == 8:
    x = np.random.rand(100, 20)
    y = np.random.rand(100, 1)
    fit = glmnet.glmnet(x=x.copy(), y=y.copy())
    ncoef = glmnetCoef(fit, np.array([0.01, 0.001]))

elif section == 9:
    np.random.seed(1)
    x = np.random.normal(size=(100, 20))
    y = np.random.normal(size=(100, 1))
    g2 = np.random.choice(2, size=(100, 1)) * 1.0
    g4 = np.random.choice(4, size=(100, 1)) * 1.0
    plt.figure()
    fit1 = cvglmnet(x=x.copy(), y=y.copy())
    cvglmnetPlot(fit1)
    plt.figure()
    fit2 = cvglmnet(x=x.copy(), y=g2.copy(), family='binomial')
    cvglmnetPlot(fit2)
importlib.reload(cvglmnetPlot)
importlib.reload(cvglmnetPredict)

# parameters
baseDataDir = '../data/'

# load data
x = np.loadtxt(baseDataDir + 'PoissonExampleX.dat', dtype=np.float64, delimiter=',')
y = np.loadtxt(baseDataDir + 'PoissonExampleY.dat', dtype=np.float64, delimiter=',')

# call glmnet
fit = glmnet.glmnet(x=x.copy(), y=y.copy(), family='poisson')
glmnetPlot.glmnetPlot(fit)
glmnetCoef.glmnetCoef(fit, s=np.float64([1.0]))
f = glmnetPredict.glmnetPredict(fit, x[0:5, :], ptype='response',
                                s=np.float64([0.1, 0.01]))
print(f)
cvfit = cvglmnet.cvglmnet(x.copy(), y.copy(), family='poisson')
optlam = np.array([cvfit['lambda_min'], cvfit['lambda_1se']]).reshape(2, )
cvglmnetCoef.cvglmnetCoef(cvfit, s=optlam)
y, X = dmatrices('price ~' +
                 'yearOfRegistration+powerPS+kilometer+C(notRepairedDamage)+C(fuelType)'
                 '+C(gearbox)+C(vehicleType)+C(brand)+C(model)',
                 df_categorical, return_type='dataframe')

min_max_scaler_x1 = preprocessing.MinMaxScaler()
x1 = min_max_scaler_x1.fit_transform(X)
min_max_scaler_y1 = preprocessing.MinMaxScaler()
y1 = min_max_scaler_y1.fit_transform(y)

fit1 = glmnet(x=x1.copy(), y=y1.copy(), family='gaussian', weights=wts,
              alpha=1, nlambda=100)

from glmnetCoef import glmnetCoef
c = glmnetCoef(fit1)
c = c[1:, -1]  # remove intercept and get the coefficients at the end of the path

import matplotlib.pyplot as plt
plt.figure(figsize=(15, 15))
h = glmnetPlot(fit1, xvar='lambda', label=False)
# /r/a/p/usr/lib64/python3.5/site-packages/glmnet_python/glmnetPlot.py
ax1 = h['ax1']
xloc = plt.xlim()
xloc = xloc[0]
index = h['index']
xpos = min(index)
labels = X.columns.tolist()
for i in range(len(c)):
    ax1.text(1 / 2 * xpos + 1 / 2 * xloc, c[i], labels[i])
importlib.reload(glmnetPlot)
importlib.reload(glmnetPrint)
importlib.reload(glmnetCoef)
importlib.reload(glmnetPredict)
importlib.reload(cvglmnet)
importlib.reload(cvglmnetCoef)
importlib.reload(cvglmnetPlot)
importlib.reload(cvglmnetPredict)

# parameters
baseDataDir = '../data/'

# load data
x = scipy.loadtxt(baseDataDir + 'CoxExampleX.dat', dtype=scipy.float64, delimiter=',')
y = scipy.loadtxt(baseDataDir + 'CoxExampleY.dat', dtype=scipy.float64, delimiter=',')
print(y[0:5, :])

# call glmnet
fit = glmnet.glmnet(x=x.copy(), y=y.copy(), family='cox')
glmnetPlot.glmnetPlot(fit)
c = glmnetCoef.glmnetCoef(fit, s=scipy.float64([0.05]))
print(c)