def test_1d_multioutput_enet_and_multitask_enet_cv():
    """ElasticNetCV on a 1-D target must match MultiTaskElasticNetCV on the
    same target presented as a single-column 2-D array."""
    X, y, _, _ = build_dataset(n_features=10)
    y = y[:, np.newaxis]

    # Both estimators receive the same cross-validation settings.
    cv_settings = dict(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7])

    single = ElasticNetCV(**cv_settings)
    single.fit(X, y[:, 0])

    multi = MultiTaskElasticNetCV(**cv_settings)
    multi.fit(X, y)

    # Selected hyper-parameters agree ...
    for attr in ("l1_ratio_", "alpha_"):
        assert_almost_equal(getattr(single, attr), getattr(multi, attr))
    # ... and so do the fitted parameters of the (only) task.
    assert_almost_equal(single.coef_, multi.coef_[0])
    assert_almost_equal(single.intercept_, multi.intercept_[0])
class _MultiTaskElasticNetCVImpl: def __init__(self, **hyperparams): self._hyperparams = hyperparams self._wrapped_model = Op(**self._hyperparams) def fit(self, X, y=None): if y is not None: self._wrapped_model.fit(X, y) else: self._wrapped_model.fit(X) return self def predict(self, X): return self._wrapped_model.predict(X)
def train_glm_model(
    xtrain: Union[np.ndarray, pd.DataFrame],
    ytrain: Union[np.ndarray, pd.DataFrame],
    verbose: int = 0,
) -> BaseEstimator:
    """Train a basic Generalized Linear Model (GLM)

    Fits a ``MultiTaskElasticNetCV`` with 3-fold cross-validation on the
    given data and returns the fitted estimator.

    Parameters
    ----------
    xtrain : np.ndarray, pd.DataFrame
        (n_samples x d_features)
        input training data

    ytrain : np.ndarray, pd.DataFrame
        (n_samples x p_outputs)
        labeled training data

    verbose : int, default=0
        option to print out training messages

    Returns
    -------
    gl_model : BaseEstimator
        the trained model
    """
    # Initialize GLM.
    # ``normalize`` is intentionally not passed: ``False`` was its default and
    # the keyword was removed from scikit-learn, where passing it raises
    # ``TypeError``.
    gl_model = MultiTaskElasticNetCV(
        alphas=None,
        cv=3,
        random_state=123,
        n_jobs=-1,
        selection="random",
        verbose=verbose,
    )

    # train GLM (perf_counter is monotonic, so the elapsed time cannot be
    # skewed by wall-clock adjustments)
    t0 = time.perf_counter()
    gl_model.fit(xtrain, ytrain)
    t1 = time.perf_counter() - t0

    if verbose > 0:
        print(f"Training time: {t1:.3f} secs.")
    return gl_model
def elastic_net(X, Y):
    """Select features of ``X`` using a fitted ``MultiTaskElasticNetCV``.

    Prints ``X.shape``, fits the cross-validated multi-task elastic net on
    ``(X, Y)`` and wraps it in a pre-fitted ``SelectFromModel``.

    Returns
    -------
    new_features
        ``X`` transformed by the selector (only the selected columns).
    values
        Integer indices of the selected columns.
    """
    print(X.shape)
    # ``normalize=False`` was the default and the keyword no longer exists in
    # current scikit-learn, so it is not passed.
    clf = MultiTaskElasticNetCV(l1_ratio=0.5, eps=0.001, n_alphas=100,
                                alphas=None, fit_intercept=True,
                                max_iter=1000, tol=0.0001, cv=None,
                                copy_X=True, verbose=0, n_jobs=1,
                                random_state=None, selection='cyclic')
    fitted = clf.fit(X, Y)
    sfm = SelectFromModel(fitted, prefit=True)
    # Call the method on the instance rather than through the class.
    values = sfm.get_support(indices=True)
    new_features = sfm.transform(X)
    return new_features, values
def train_multi_elasticnet(train_features, train_labels, num_alphas,
                           skip_cross_validation, alpha, l1_ratio, num_jobs):
    """
    Performs the cross validation of multi elastic net model, and returns the
    trained model with best params.

    Assume features are scaled/normalized.
    Assumes train_labels has more than one column.

    When ``skip_cross_validation`` is true, the supplied ``alpha`` and
    ``l1_ratio`` are used directly; otherwise they are replaced by the values
    selected through 5-fold cross validation.

    Returns a tuple ``(model, {'alpha': ..., 'l1_ratio': ...})``.
    """
    best_alpha = alpha
    best_l1_ratio = l1_ratio
    max_iter = 10000
    tol = 0.0005

    if not skip_cross_validation:
        # use 5 fold cross validation
        # (``normalize=False`` was the default; the keyword has been removed
        # from scikit-learn, so it is no longer passed.)
        model = MultiTaskElasticNetCV(l1_ratio=[
            0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.925, 0.95, 0.975, 0.99, 0.999,
            0.9999
        ],
                                      max_iter=max_iter,
                                      cv=5,
                                      n_alphas=num_alphas,
                                      n_jobs=num_jobs,
                                      tol=tol)
        model.fit(train_features, train_labels)
        best_alpha = model.alpha_
        best_l1_ratio = model.l1_ratio_

    # Refit a plain (non-CV) model with the chosen hyper-parameters.
    model = MultiTaskElasticNet(alpha=best_alpha,
                                l1_ratio=best_l1_ratio,
                                max_iter=max_iter,
                                tol=tol)
    model.fit(train_features, train_labels)
    return (model, {'alpha': best_alpha, 'l1_ratio': best_l1_ratio})
def test_multitask_enet_and_lasso_cv():
    """Check selected alphas and attribute shapes of the multi-task CV
    estimators (elastic net and lasso)."""
    # Default settings: the cross-validated alpha must hit known values.
    X, y, _, _ = build_dataset(n_features=50, n_targets=3)
    enet_default = MultiTaskElasticNetCV(cv=3).fit(X, y)
    assert_almost_equal(enet_default.alpha_, 0.00556, 3)
    lasso_default = MultiTaskLassoCV(cv=3).fit(X, y)
    assert_almost_equal(lasso_default.alpha_, 0.00278, 3)

    # Elastic net with two l1_ratio candidates and ten alphas each.
    X, y, _, _ = build_dataset(n_targets=3)
    n_features = X.shape[1]
    enet = MultiTaskElasticNetCV(n_alphas=10, eps=1e-3, max_iter=100,
                                 l1_ratio=[0.3, 0.5], tol=1e-3, cv=3)
    enet.fit(X, y)
    assert enet.l1_ratio_ == 0.5
    assert enet.coef_.shape == (3, n_features)
    assert enet.intercept_.shape == (3, )
    assert enet.mse_path_.shape == (2, 10, 3)
    assert enet.alphas_.shape == (2, 10)

    # Lasso variant: a single alpha grid, hence no l1_ratio axis.
    X, y, _, _ = build_dataset(n_targets=3)
    n_features = X.shape[1]
    lasso = MultiTaskLassoCV(n_alphas=10, eps=1e-3, max_iter=100, tol=1e-3,
                             cv=3)
    lasso.fit(X, y)
    assert lasso.coef_.shape == (3, n_features)
    assert lasso.intercept_.shape == (3, )
    assert lasso.mse_path_.shape == (10, 3)
    assert len(lasso.alphas_) == 10
def select_mtelastic(self, X, y):
    """Pick the best alpha for multi-task elastic-net regression.

    Fits scikit-learn's ``MultiTaskElasticNetCV`` over a fixed, hand-written
    grid of candidate alphas, prints the selected alpha and returns it.
    """
    # NOTE(review): ``.099`` sits between ``.009`` and ``.01`` in the grid and
    # may be a typo; it is kept unchanged because it only adds a candidate.
    mtelastic_cv = MultiTaskElasticNetCV(alphas=[
        0.00001, .0001, .001, .002, .003, .004, .005, .006, .007, .008, .009,
        .099, .01, .011, .012, .013, .014, .015, .016, .017, .018, .019, .02,
        .025, .026, .027, .028, .029, .03, .031, .032, .033, .034, .035, .036,
        .037, .038, .039, .04, .041, .042, .043, .044, .045, .05, .06, .07,
        .071, .072, .073, .074, .075, .076, .077, .078, .079, .08, .1, .2,
        .225, .23, .24, .245, .246, .247, .248, .249, .25, .251, .252, .253,
        .254, .255, .26, .27, .275, .3, .35, .4, .45, .46, .47, .48, .481,
        .482, .483, .484, .485, .486, .487, .488, .489, .49, .491, .492, .493,
        .494, .495, .496, .497, .498, .499, .5, .51, .511, .512, .513, .514,
        .515, .516, .517, .518, .519, .52, .525, .53, .54, .55, .6, .75, .752,
        .7527, .7528, .7529, .753, .7531, .754, .7545, .755, .756, .76, .765,
        .77, .78, .79, .8, .9, 1.0, 1.2, 1.25, 1.5, 1.75, 2.0
    ])
    sel_alpha = mtelastic_cv.fit(X, y)
    print(sel_alpha.alpha_)
    # Hand the value back to the caller instead of discarding it.
    return sel_alpha.alpha_
# Concatenate the continuous and discrete feature blocks column-wise
x_vec = np.concatenate((x_vec_con, x_vec_dis), axis=1)

# Build the two-column regression target (registered, casual)
y_registered = bike_rel['registered'].values.astype(float)
y_casual = bike_rel['casual'].values.astype(float)
y = np.stack((y_registered, y_casual), axis=1)

# Build the models and evaluate them
from sklearn.linear_model import MultiTaskLassoCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import MultiTaskElasticNetCV

x1, x2, y1, y2 = train_test_split(x_vec, y, test_size=0.2, random_state=20)

############ Lasso
mtl = MultiTaskLassoCV(alphas=np.logspace(-3, -1, 3), cv=8, verbose=3)
mtl.fit(x1, y1)
mtl.score(x1, y1)
mtl.score(x2, y2)

############ ElasticNetCV
mte = MultiTaskElasticNetCV(l1_ratio=np.logspace(-3, -1, 3),
                            alphas=np.logspace(-3, -1, 3),
                            cv=8,
                            verbose=3)
mte.fit(x1, y1)
# Score the elastic-net model that was just fitted (the lasso model was
# scored again here before, which left the elastic net unevaluated).
mte.score(x1, y1)
mte.score(x2, y2)
# Hold out 25% of the data for testing.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=1)
folds = 5
# Candidate grids: three alphas spaced logarithmically from 1e1 to 1e5 and
# the two extreme l1 ratios (0 and 1).
alphas = np.logspace(1, 5, 3)
l1_ratios = np.linspace(0, 1, 2, endpoint=True)
# Cross-validated multi-task elastic net over the grids above.
models = MultiTaskElasticNetCV(l1_ratio=l1_ratios, alphas=alphas, verbose=1, cv=folds, n_jobs=-1)
models.fit(X_train, Y_train)
# The value of this call is discarded; the score is printed below.
models.score(X_test, Y_test)
print "Alpha: ", models.alpha_
print "L1 ratio: ", models.l1_ratio_
print "Score of Elastic-net on test data: ", models.score(X_test, Y_test)
# Refit a plain ElasticNet with the selected hyper-parameters on the
# training and test rows combined.
model_EN = ElasticNet(l1_ratio=models.l1_ratio_, alpha=models.alpha_)
model_EN.fit(np.concatenate((X_train, X_test)), np.concatenate((Y_train, Y_test)))
# Predictions of the CV model on the test rows, rounded to int16.
test = np.rint(models.predict(X_test)).astype('int16')
coeff = model_EN.coef_.T
# coeff = models.coef_.T
# high=1.0
def scorer(pipe, X, y):
    # Return the F1 score of ``pipe``'s predictions on ``X`` against ``y``.
    pred = pipe.predict(X)
    return metrics.f1_score(y, pred)

# Sum the univariate p-values of every feature over all columns of Y.
accum = np.zeros((X.shape[1],))
for y in np.transpose(Y):
    selector = SelectKBest(f_classif, selectedFeaureNum)
    selector = selector.fit(X, y)
    accum += selector.pvalues_
# Keep the features with the smallest accumulated p-values.
selectedIndices = accum.argsort()[:selectedFeaureNum]
def transform(X):
    # Restrict ``X`` to the selected feature columns.
    return X[:, selectedIndices]
X_filtered, X_test_filtered = transform(X), transform(X_test)
clf = MultiTaskElasticNetCV(normalize=True)
#clf = MultiTaskLasso(normalize=True)
clf.fit(X_filtered, Y)
predTrain = np.array(clf.predict(X_filtered))
# One decision threshold per output column, chosen on the training predictions.
splits = []
for col in range(predTrain.shape[1]):
    bestSplit, bestF1 = labanUtil.getSplitThreshold(predTrain[:, col], Y[:, col])
    splits.append(bestSplit)
pred = np.array(clf.predict(X_test_filtered))
# Binarise the continuous predictions with the per-column thresholds.
for col in range(pred.shape[1]):
    pred[:, col] = [1 if e>=splits[col] else 0 for e in pred[:, col]]
    predTrain[:, col] = [1 if e>=splits[col] else 0 for e in predTrain[:, col]]
# Record precision, recall and F1 (test) and F1 (train).
ps.append(metrics.precision_score(Y_test, pred))
rs.append(metrics.recall_score(Y_test, pred))
teF = metrics.f1_score(Y_test, pred)
teFs.append(teF)
trFs.append(metrics.f1_score(Y, predTrain))
print 'test#: ', test
from sklearn.linear_model import MultiTaskElasticNet, MultiTaskElasticNetCV #cross-validating to find best hyperparams cv_model = MultiTaskElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], verbose=1) cv_model.fit(X_train, y_train) #fitting model with hyperparameters from above model = MultiTaskElasticNet(alpha=cv_model.alpha_, l1_ratio=cv_model.l1_ratio_, random_state=0) model.fit(X_train, y_train) #predicting preds = model.predict(X_test) test_df[[ 'age', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2' ]] = preds test_df.drop(columns=["is_train"], inplace=True) test_df.head() #predictions housekeeping sub_df = cudf.melt(test_df[[ "Id", "age", "domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2" ]], id_vars=["Id"], value_name="Predicted") sub_df["Id"] = sub_df["Id"].astype("str") + "_" + sub_df["variable"].astype( "str") sub_df = sub_df.drop("variable", axis=1).sort_values("Id") assert sub_df.shape[0] == test_df.shape[0] * 5
def test_enet_path():
    """Exercise ElasticNetCV / MultiTaskElasticNetCV path selection."""
    # A large number of samples and of informative features is used so that
    # the selected l1_ratio leans toward ridge rather than lasso.
    X, y, X_test, y_test = build_dataset(n_samples=200, n_features=100,
                                         n_informative_features=100)
    # Few iterations to keep the test fast; the ElasticNet might not
    # converge, hence the ignored warnings below.
    max_iter = 150

    def _fit_enet_cv(**extra):
        # Build and fit an ElasticNetCV on the current (X, y) with the
        # shared settings plus any extra keyword arguments.
        est = ElasticNetCV(alphas=[0.01, 0.05, 0.1], eps=2e-3,
                           l1_ratio=[0.5, 0.7], cv=3,
                           max_iter=max_iter, **extra)
        ignore_warnings(est.fit)(X, y)
        return est

    # First with the default precompute setting, then with precompute=True.
    for extra in ({}, {"precompute": True}):
        clf = _fit_enet_cv(**extra)
        # Well-conditioned settings: the smallest penalty should be chosen.
        assert_almost_equal(clf.alpha_, min(clf.alphas_))
        # Non-sparse ground truth: the elastic-net closest to ridge wins.
        assert clf.l1_ratio_ == min(clf.l1_ratio)

    # Well-conditioned settings with low noise: good test-set performance
    # is expected (checked on the precompute=True model).
    assert clf.score(X_test, y_test) > 0.99

    # Multi-output/target case
    X, y, X_test, y_test = build_dataset(n_features=10, n_targets=3)
    multi = MultiTaskElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7],
                                  cv=3, max_iter=max_iter)
    ignore_warnings(multi.fit)(X, y)
    assert multi.score(X_test, y_test) > 0.99
    assert multi.coef_.shape == (3, 10)

    # Mono-output should have same cross-validated alpha_ and l1_ratio_
    # in both cases.
    X, y, _, _ = build_dataset(n_features=10)
    mono_settings = dict(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7])
    clf1 = ElasticNetCV(**mono_settings)
    clf1.fit(X, y)
    clf2 = MultiTaskElasticNetCV(**mono_settings)
    clf2.fit(X, y[:, np.newaxis])
    for attr in ("l1_ratio_", "alpha_"):
        assert_almost_equal(getattr(clf1, attr), getattr(clf2, attr))
# Collect the exchange reactants of a Model built from each entry of g2.
used_mets=[]
for mm in g2:
    reacs=[react_dict[z] for z in mm]
    m=Model(reacs)
    used_mets.append(m.ex_reactants)
# Flatten the per-model lists into a single list.
used_mets = list(chain.from_iterable(used_mets))
# Fraction of g2 entries in which each metabolite of dm occurs.
mf=[]
for mm in dm:
    mf.append(used_mets.count(mm)/len(g2))
true_used_env.append(mf)
from sklearn.linear_model import MultiTaskElasticNetCV as EN
enet = EN(cv=50, max_iter=100000)
# Keep only the columns whose difference score exceeds 0.005.
x = full_freq_m.T[m_diff_freq_m>.005].T
y = used_environment.T[m_diff_used_env>0.005].T
mod=enet.fit(x, y)
# Predict for the single profile f2 (reshaped to one row).
p = mod.predict(f2[m_diff_freq_m>.005].reshape(1,-1))
p=p.flatten()
# Shift by |min| and scale by the maximum.
p = p+abs(min(p))
p=p/max(p)
# Pearson correlation of mf with every row of used_environment (filtered columns).
c = [sts.pearsonr(mf,used_environment[ee][m_diff_used_env>0.005])[0] for ee in range(len(used_environment))]
predicted.append(sts.pearsonr(p, mf)[0])
average.append(mean(c))
predicted_environments.append(p)
# -*- coding: utf-8 -*- """ Created on Thu Apr 21 23:51:12 2016 @author: patanjali """ from sklearn.linear_model import MultiTaskElasticNetCV from utils2 import load_dataset import pandas train, validate, test = load_dataset() no_classes = train[:,0].max()+1 train_y = pandas.get_dummies(train[:,0]) print no_classes, train.shape train = train[:201] validate = validate[:201] test = test[:201] for l1_ratio in [.1, .5, .7, .9, .95, .99, 1]: model = MultiTaskElasticNetCV(l1_ratio=l1_ratio, normalize=True, verbose=True, n_jobs=3) model.fit(train[:,1:], train_y) predicted_classes = (model.predict(validate[:,1:])).argmax(1) correct = sum(predicted_classes==validate[:,0]) print l1_ratio, correct, correct*1.0/validate.shape[0]
# the parameters below are new in sklearn 0.18 feature_names=['petal length', 'petal width'], class_names=['setosa', 'versicolor', 'virginica'], filled=True, rounded=True) graph = pydotplus.graph_from_dot_data(dot_data) display(Image(graph.create_png())) export_graphviz(tree, out_file='tree.dot', feature_names=['petal length', 'petal width']) Image(filename='./images/03_18.png', width=600) from sklearn.datasets import load_iris from sklearn import tree iris = load_iris() clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3) clf = clf.fit(iris.data, iris.target) with open("iris.dot", 'w') as f: f = tree.export_graphviz(clf, out_file=f) import pydotplus dot_data = tree.export_graphviz(clf, out_file=None) graph = pydotplus.graph_from_dot_data(dot_data) graph.write_pdf("iris.pdf")
class PhonesthemesModel(object):
    """
    Model that extracts phonesthemes by regressing word vectors on ngram
    features with a MultiTaskElasticNetCV.

    Attributes
    ----------
    self.config: Dict
        A dictionary of the arguments passed into the object.

    self.ngrams: List[int]
        A list of integers that refer to the ngram sizes to use.

    self.mode: List[str]
        List of str indicating the positions in the word to use as
        candidate phonesthemes. Possible elements are "start", "end",
        and "all".

    self.min_count: int
        Minimum number of ngram occurrences in order to be included
        as a feature.

    self.one_hot: bool
        Whether or not to use one-hot features instead of counts
        for the phonestheme ngram features.

    self.vectors
        Dictionary of word to vector, where word is either a string or
        a tuple of strings (phoneme representation).

    self.phonesthemes_reg
        The MultiTaskElasticNetCV model fit on the phonestheme feature
        vectors to predict the phonestheme targets.

    self.X_ngram
        The input feature vectors used to fit the Elastic Net.

    self.ngram_to_idx
        A mapping from ngram to feature index of X_ngram.

    self.phonemes_to_graphemes
        A mapping from phoneme tuple to word, filled in by ``train`` when a
        graphemes-to-phonemes file is given; ``None`` otherwise.

    self.is_trained
        A boolean describing whether this model has been trained or not.
    """
    def __init__(self, ngrams, mode, min_count, one_hot):
        # Snapshot the constructor arguments as the model configuration.
        self.config = locals()
        self.config.pop("self")
        self.config.pop("__class__", None)
        logger.info("Config: ")
        pprint.pprint(self.config)

        self.ngrams = ngrams
        self.mode = mode
        self.min_count = min_count
        self.one_hot = one_hot

        # Placeholder values, these get set when we call train
        self.vectors = None
        self.phonesthemes_reg = None
        self.X_ngram = None
        self.ngram_to_idx = None
        self.phonemes_to_graphemes = None
        self.is_trained = False

    def get_phonesthemes(self):
        """Delegate to ``get_phonesthemes_from_model`` for this model."""
        return get_phonesthemes_from_model(self)

    def train(self, vectors_path, bound_morphemes_path=None,
              word_segmentations_path=None,
              graphemes_to_phonemes_path=None, n_jobs=1, l1_ratio=0.5):
        """Load vectors (and optional morpheme / g2p data) and fit the model.

        Sets ``vectors``, ``X_ngram``, ``ngram_to_idx``, ``phonesthemes_reg``
        and, when a g2p file is given, ``phonemes_to_graphemes``; finally
        marks the model as trained.
        """
        train_config = locals()
        train_config.pop("self")
        train_config.pop("__class__", None)
        self.config["train_config"] = train_config
        logger.info("Train config: ")
        pprint.pprint(train_config)

        # Load vectors, where the keys can be words represented as
        # sequences of characters (normal word vectors) or words represented
        # as sequences of phonemes (phonemicized vectors).
        logger.info("Reading vectors from {}".format(vectors_path))
        self.vectors = OrderedDict()
        with open(vectors_path) as vectors_file:
            for line in tqdm(vectors_file,
                             total=get_line_number(vectors_path)):
                split_line = line.rstrip("\n").split()
                word = split_line[0]
                # If we have phonemicized vectors, the keys to the dict are
                # tuples of comma-separated phonemes representing a word.
                if graphemes_to_phonemes_path is not None:
                    word = tuple(word.split(","))
                embedding = np.array([float(val) for val in split_line[1:]])
                self.vectors[word] = embedding

        # Randomly shuffle the OrderedDict
        random_seed = 0
        logger.info(
            "Shuffling vectors with random seed {}".format(random_seed))
        random.seed(random_seed)
        vector_items = list(self.vectors.items())
        # random.shuffle is in-place
        random.shuffle(vector_items)
        self.vectors = OrderedDict(vector_items)

        vocabulary = list(self.vectors.keys())
        targets = np.asarray(list(self.vectors.values()))

        # Load phonemes to graphemes if we were given g2p data
        if graphemes_to_phonemes_path:
            logger.info("Reading graphemes to phonemes data "
                        "from {}".format(graphemes_to_phonemes_path))
            self.phonemes_to_graphemes = {}
            # Load the graphemes to phonemes data
            with open(
                    graphemes_to_phonemes_path) as graphemes_to_phonemes_file:
                for line in tqdm(
                        graphemes_to_phonemes_file,
                        total=get_line_number(graphemes_to_phonemes_path)):
                    split_line = line.rstrip("\n").split("\t")
                    word = split_line[0]
                    phonemes = tuple(split_line[1].split(" "))
                    self.phonemes_to_graphemes[phonemes] = word

        if bound_morphemes_path is not None:
            # Load morpheme data if we were given bound morphemes
            word_segmentations, bound_morphemes = self._load_morpheme_data(
                word_segmentations_path, bound_morphemes_path)

            # Update targets with predictions of the morpheme model. This is
            # equivalent to using the model residuals as the new targets.
            targets = self._get_morpheme_residuals(vocabulary,
                                                   targets,
                                                   bound_morphemes,
                                                   graphemes_to_phonemes_path,
                                                   word_segmentations,
                                                   n_jobs=n_jobs)

        # Get the ngram features for the vocabulary.
        self.X_ngram, self.ngram_to_idx = build_ngram_features(
            vocabulary=vocabulary,
            one_hot=self.one_hot,
            ngram_range=self.ngrams,
            mode=self.mode,
            freq_thres=self.min_count)

        logger.info("Shape of ElasticNet input (number of words, "
                    "number of candidate phonesthemes): {}".format(
                        self.X_ngram.shape))
        logger.info("Shape of ElasticNet targets (number of words, "
                    "vector dimension): {}".format(targets.shape))

        # Fit a MultiTaskElasticNetCV model to extract phonesthemes.
        logger.info("Fitting MultiTaskElasticNetCV")
        self.phonesthemes_reg = MultiTaskElasticNetCV(l1_ratio=l1_ratio,
                                                      n_jobs=n_jobs,
                                                      random_state=0,
                                                      cv=5)
        self.phonesthemes_reg.fit(self.X_ngram, targets)
        logger.info("Done fitting MultiTaskElasticNetCV")
        self.is_trained = True

    def _load_morpheme_data(self, word_segmentations_path,
                            bound_morphemes_path):
        """Read the optional word segmentations and the bound morpheme list.

        Returns a tuple ``(word_segmentations, bound_morphemes)`` where the
        first is a dict (empty when no path is given) and the second a list.
        """
        # Load word segmentations
        word_segmentations = {}
        if word_segmentations_path:
            logger.info("Loading word segmentations from {}".format(
                word_segmentations_path))
            with open(word_segmentations_path) as word_segmentations_file:
                for line in tqdm(
                        word_segmentations_file,
                        total=get_line_number(word_segmentations_path)):
                    # Each line is "<word>\t<space-separated morphemes>".
                    split_line = line.rstrip("\n").split("\t")
                    assert len(split_line) == 2
                    word = split_line[0]
                    morphemes = split_line[1].split(" ")
                    word_segmentations[word] = morphemes
            logger.info("Loaded {} word segmentations".format(
                len(word_segmentations)))

        # Load the list of bound morphemes
        logger.info(
            "Loading bound morphemes from {}".format(bound_morphemes_path))
        bound_morphemes = []
        with open(bound_morphemes_path) as bound_morphemes_file:
            for line in tqdm(bound_morphemes_file,
                             total=get_line_number(bound_morphemes_path)):
                bound_morphemes.append(line.rstrip("\n"))
        logger.info("Loaded {} bound morphemes".format(len(bound_morphemes)))
        return (word_segmentations, bound_morphemes)

    def _get_morpheme_residuals(self, vocabulary, targets, bound_morphemes,
                                graphemes_to_phonemes_path,
                                word_segmentations=None, n_jobs=1):
        """Return ``targets`` minus a linear regression on morpheme features."""
        # Get the vectors vocabulary, and convert to string if we are using
        # phonemicized vectors.
        if graphemes_to_phonemes_path is None:
            string_vectors_vocab = vocabulary
        else:
            # The vocab of the phonemicized vectors converted to graphemes.
            string_vectors_vocab = [
                self.phonemes_to_graphemes[phonemes]
                for phonemes in vocabulary
            ]

        # Build the morpheme feature vectors.
        morpheme_features = build_morpheme_features(string_vectors_vocab,
                                                    bound_morphemes,
                                                    word_segmentations)

        logger.info("Input shape for morpheme pretraining linear regression "
                    "(number of words, number of morphemes): {}".format(
                        morpheme_features.shape))
        logger.info("Target shape for morpheme pretraining linear regression "
                    "(number of words, vector dimension): {}".format(
                        targets.shape))

        morph_reg = LinearRegression(n_jobs=n_jobs)
        logger.info("Pretraining on morpheme features.")
        morph_reg = morph_reg.fit(morpheme_features, targets)

        logger.info("Calculating residuals of of linear regression done "
                    "on morpheme features and using that as the train "
                    "vectors for the ngram feature model.")
        # Get the residuals of the model for use in the second model.
        morph_reg_pred_y = morph_reg.predict(morpheme_features)
        morph_reg_residuals = np.subtract(targets, morph_reg_pred_y)
        return morph_reg_residuals

    def __eq__(self, other):
        # Two PhonesthemesModel objects are the same if their members are
        # the same.
        # Defer to the other operand for foreign types instead of failing
        # with AttributeError; __ne__ below already handles NotImplemented.
        if not isinstance(other, PhonesthemesModel):
            return NotImplemented
        # Compare their ngrams
        if self.ngrams != other.ngrams:
            return False
        # Compare their mode
        if self.mode != other.mode:
            return False
        # Compare their min count
        if self.min_count != other.min_count:
            return False
        # Compare whether they use one-hot or frequency features
        if self.one_hot != other.one_hot:
            return False
        # Compare that they have the same set of vectors in the same order
        if len(self.vectors) != len(other.vectors):
            return False
        for this_word, other_word in zip(self.vectors, other.vectors):
            if this_word != other_word:
                return False
            if not np.allclose(self.vectors[this_word],
                               other.vectors[this_word]):
                return False
        # Check that they were trained on the same features
        if not np.allclose(self.X_ngram, other.X_ngram):
            return False
        # Check that they have the same mapping of ngram to feature idx
        if self.ngram_to_idx != other.ngram_to_idx:
            return False
        return True

    if six.PY2:
        # Python 2 does not derive __ne__ from __eq__.
        def __ne__(self, other):
            equal = self.__eq__(other)
            return equal if equal is NotImplemented else not equal
def main(family, quantile_ass=.99):
    """Fit a multi-task elastic net from reaction frequencies to used
    environments for ``family`` and predict metabolite profiles.

    Returns ``(transporter, met_prof)``.

    NOTE(review): ``quantile_ass`` is not read anywhere in this function.
    NOTE(review): ``envd_reactions``, ``dm`` and ``s_clss_fm`` are read below
    but never assigned here; presumably they are module-level names, confirm.
    """
    data_folder = os.path.join(Path(os.getcwd()).parents[1], 'data')

    #load a pickle generated from "associate_env.py script"
    # The file is opened in a ``with`` block so the handle is always closed.
    # NOTE: pickle.load must only be used on trusted files.
    with open(data_folder + '/pickles/' + family + '.pkl', 'rb') as pkl_file:
        store = pickle.load(pkl_file)

    used_environment = store['used_env'].copy()
    full_freq_m = store['full_freq_m'].copy()
    reactome = store['reactome'].copy()
    model_sample = store['model_sample'].copy()
    transporter = store['transporter'].copy()

    #replace nan values by the average
    av_used_env = np.nanmean(used_environment,0)
    inds = np.where(np.isnan(used_environment))
    used_environment[inds] = np.take(av_used_env, inds[1])

    #for reaction frequency
    # Keep the reactions whose residual score exceeds one standard deviation.
    env_driv_reac_score = get_residual_scores(full_freq_m)
    reac_cutoff = np.std(env_driv_reac_score)
    env_driven_reactome = reactome[env_driv_reac_score>reac_cutoff]
    reaction_frequency = full_freq_m.T[env_driv_reac_score>reac_cutoff].T
    clss_freq_m = get_residuals(reaction_frequency)

    #for the environment
    # Same selection rule applied to the metabolites.
    env_driv_met_score = get_residual_scores(used_environment)
    met_cutoff = np.std(env_driv_met_score)
    driving_mets = transporter[env_driv_met_score>met_cutoff]
    used_env = used_environment.T[env_driv_met_score>met_cutoff].T
    clss_used_env = get_residuals(used_env)

    #regression terms
    x=reaction_frequency.copy()
    y = used_env.copy()

    # Pairwise ``cosine`` values between reaction and metabolite residuals.
    cosine_dict={}
    for i, reac in enumerate(clss_freq_m.T):
        cosine_dict[env_driven_reactome[i]] = np.array([cosine(reac.flatten(), metab.flatten()) for metab in clss_used_env.T])

    cosine_v = np.array([cosine_dict[i] for i in envd_reactions])

    #find metabolite concentrations for models
    from sklearn.linear_model import MultiTaskElasticNetCV as EN
    enet = EN(cv=3,verbose=1, n_jobs=7, max_iter=10000)
    print(x.shape, y.shape)
    mod=enet.fit(x, y)

    evolved_env= np.zeros((len(model_sample), len(dm)))
    for i,mod_prof in enumerate(model_sample):
        #print(family, i)
        v = mod_prof[env_driv_reac_score>0]
        p = mod.predict(v[s_clss_fm!=0].reshape(1,-1))
        p=p.flatten()
        # Shift by |min| and scale by the maximum.
        p = p+abs(min(p))
        p=p/max(p)
        evolved_env[i] =p.copy()

    #av_mod_diff = np.arctanh(av_mod_diff)
    met_prof = get_evolved_met_prof(evolved_env, dm, transporter)
    return transporter, met_prof
def main(family, quantile_ass=.99):
    """Build the reaction/metabolite association network for ``family`` and
    predict metabolite profiles with a multi-task elastic net.

    ``quantile_ass`` is the quantile of the positive cosine values used as
    the positive cut-off; ``1 - quantile_ass`` of the negative values is the
    negative cut-off.

    Side effects: writes ``<family>.graphml`` under ``files/networks`` and
    prints progress. Returns ``(transporter, met_prof)``.
    """
    data_folder = os.path.join(Path(os.getcwd()).parents[1], 'data')

    #load a pickle generated from "associate_env.py script"
    # The file is opened in a ``with`` block so the handle is always closed.
    # NOTE: pickle.load must only be used on trusted files.
    with open(data_folder + '/pickles/' + family + '.pkl', 'rb') as pkl_file:
        store = pickle.load(pkl_file)

    used_environment = store['used_env'].copy()
    full_freq_m = store['full_freq_m'].copy()
    reactome = store['reactome'].copy()
    model_sample = store['model_sample'].copy()
    transporter = store['transporter'].copy()

    #replace nan values by the average
    av_used_env = np.nanmean(used_environment, 0)
    inds = np.where(np.isnan(used_environment))
    used_environment[inds] = np.take(av_used_env, inds[1])

    #for reaction frequency
    av_freq_m = np.mean(full_freq_m, axis=0)
    diff_freq_m = full_freq_m - av_freq_m

    #filter out noise and find reactions that are driven by the environment
    env_d_score1 = np.round(np.max(diff_freq_m, axis=0), 4)
    env_d_score1 = env_d_score1 / max(np.abs(env_d_score1))
    env_d_score2 = np.round(np.min(diff_freq_m, axis=0), 4)
    env_d_score2 = env_d_score2 / max(np.abs(env_d_score2))

    # Per reaction, keep whichever normalised extreme has the larger
    # magnitude (ties keep the maximum-based score).
    env_d_score = np.where(
        np.abs(env_d_score2) > np.abs(env_d_score1),
        env_d_score2,
        env_d_score1)

    m_diff_freq_m = np.abs(env_d_score)

    # NOTE(review): the ``[mask].T`` filters are commented out below while the
    # leading ``.T`` remains, so these arrays are transposed relative to
    # ``full_freq_m`` — confirm this orientation is intended.
    env_driven_reactome = reactome  #[m_diff_freq_m>.005]
    diff_freq_m_envd = diff_freq_m.T  #[m_diff_freq_m>.005].T
    reaction_frequency = full_freq_m.T  #[m_diff_freq_m>.005].T

    clss_freq_m = np.zeros(diff_freq_m_envd.shape)
    for i, v in enumerate(diff_freq_m_envd):
        clss_freq_m[i] = v  #assign_to_rank(v, fpc,fnc)

    #for the environment
    av_used_env = np.mean(used_environment, axis=0)
    diff_used_env = used_environment - av_used_env

    #filter out noise and find metabolites that are driven by the environment
    m_diff_used_env = np.max(np.abs(diff_used_env), axis=0)

    driving_mets = transporter  #[m_diff_used_env>0.005]
    diff_used_env_envd = diff_used_env.T  #[m_diff_used_env>0.005].T
    used_env = used_environment.T  #[m_diff_used_env>0.005].T

    clss_used_env = np.zeros(diff_used_env_envd.shape)
    for i, v in enumerate(diff_used_env_envd):
        clss_used_env[i] = v  #assign_to_rank(v, epc, enc)

    # Columns that are identically zero carry no signal and are dropped.
    s_clss_fm = np.sum(np.abs(clss_freq_m), axis=0)
    s_clss_ue = np.sum(np.abs(clss_used_env), axis=0)

    #env_driven_reactome
    envd_reactions = env_driven_reactome[s_clss_fm != 0]

    #driving_metabolites
    dm = driving_mets.copy()
    dm = dm[s_clss_ue != 0]

    #profiles
    envd_prof = clss_freq_m.T[s_clss_fm != 0].T
    dm_prof = clss_used_env.T[s_clss_ue != 0].T

    #regression terms
    x = reaction_frequency.T[s_clss_fm != 0].T
    y = used_env.T[s_clss_ue != 0].T

    # ``cosine`` value of every reaction profile against every metabolite
    # profile.
    cosine_dict = {}
    for i, reac in enumerate(envd_prof.T):
        cosine_dict[envd_reactions[i]] = np.array(
            [cosine(reac.flatten(), metab.flatten()) for metab in dm_prof.T])

    # Association cut-offs taken from the pooled positive / negative values.
    cosine_pool = np.array(list(cosine_dict.values())).flatten()
    pc = np.quantile(cosine_pool[cosine_pool > 0], quantile_ass)
    nc = np.quantile(cosine_pool[cosine_pool < 0], 1 - quantile_ass)

    association_d = {}
    for i, reac in enumerate(envd_prof.T):
        v = cosine_dict[envd_reactions[i]]
        association_d[envd_reactions[i]] = assign_to_rank(v, pc, nc)

    g = build_association_network(association_d, envd_reactions, dm)
    nx.write_graphml(
        g,
        os.path.join(
            Path(os.getcwd()).parents[0], 'files', 'networks', family) +
        '.graphml')

    #find metabolite concentrations for models
    from sklearn.linear_model import MultiTaskElasticNetCV as EN
    enet = EN(cv=3, verbose=1, n_jobs=7, max_iter=10000)
    print(x.shape, y.shape)
    mod = enet.fit(x, y)

    evolved_env = np.zeros((len(model_sample), len(dm)))
    for i, mod_prof in enumerate(model_sample):
        print(family, i)
        v = mod_prof[m_diff_freq_m > .005]
        p = mod.predict(v[s_clss_fm != 0].reshape(1, -1))
        p = p.flatten()
        # Shift by |min| and scale by the maximum.
        p = p + abs(min(p))
        p = p / max(p)
        evolved_env[i] = p.copy()

    #av_mod_diff = np.arctanh(av_mod_diff)
    met_prof = get_evolved_met_prof(evolved_env, dm, transporter)
    return transporter, met_prof
def train_linear_model(X, y, random_state=1, test_size=0.2,
                       regularization_type='elasticnet', k_fold=5,
                       max_iter=1000000, tol=0.0001, l1_ratio=None):
    """
    Function to train linear model with regularization and cross-validation.

    Args:
        X (pandas.DataFrame): dataframe of descriptors.
        y (pandas.DataFrame): dataframe of cycle lifetimes.
        random_state (int): seed for train/test split.
        test_size (float): proportion of the dataset reserved for model
            evaluation.
        regularization_type (str): lasso or ridge or elastic-net (with cv).
        k_fold (int): k in k-fold cross-validation.
        max_iter (int): maximum number of iterations for model fitting.
        tol (float): tolerance for optimization.
        l1_ratio ([float]): list of lasso to ridge ratios for elasticnet.
            Defaults to ``[.1, .5, .7, .9, .95, 1]``.

    Returns:
        linear_model: model refit with the selected hyperparameters.
        mu: per-column mean of the training descriptors.
        s: per-column std dev of the training descriptors.
        relative_prediction_error: 1.96 * sqrt(MSE / n_test) of the
            predicted/actual ratio against 1, one value per output.
        Rsq: ``linear_model.score`` on the standardized test split.
        hyperparameters (dict): settings used, including the selected
            ``alpha`` and ``l1_ratio``.

    Raises:
        NotImplementedError: for any other combination of
            ``regularization_type`` and number of output columns
            (lasso and ridge are single-output only).
    """
    if l1_ratio is None:
        l1_ratio = [.1, .5, .7, .9, .95, 1]
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Standardize (training) data after train/test split
    mu = np.mean(X_train, axis=0)
    s = np.std(X_train, axis=0)
    X_scaled = (X_train - mu) / s
    hyperparameters = {'random_state': random_state,
                       'test_size': test_size,
                       'k_fold': k_fold,
                       'tol': tol,
                       'max_iter': max_iter
                       }

    # ``normalize=False`` is not passed to any estimator below: it was the
    # default, and the keyword has been removed from scikit-learn.
    if regularization_type == 'lasso' and y.shape[1] == 1:
        lassocv = LassoCV(fit_intercept=True, alphas=None, tol=tol,
                          cv=k_fold, max_iter=max_iter)
        lassocv.fit(X_scaled, y_train.values.ravel())
        # Set optimal alpha and refit model
        alpha_opt = lassocv.alpha_
        linear_model = Lasso(fit_intercept=True, alpha=alpha_opt,
                             max_iter=max_iter)
        linear_model.fit(X_scaled, y_train.values)
        hyperparameters['l1_ratio'] = 1

    elif regularization_type == 'ridge' and y.shape[1] == 1:
        ridgecv = RidgeCV(fit_intercept=True, alphas=None, cv=k_fold)
        ridgecv.fit(X_scaled, y_train.values.ravel())
        # Set optimal alpha and refit model
        alpha_opt = ridgecv.alpha_
        linear_model = Ridge(fit_intercept=True, alpha=alpha_opt)
        linear_model.fit(X_scaled, y_train)
        hyperparameters['l1_ratio'] = 0

    elif regularization_type == 'elasticnet' and y.shape[1] == 1:
        elasticnetcv = ElasticNetCV(fit_intercept=True,
                                    alphas=None, cv=k_fold,
                                    l1_ratio=l1_ratio, max_iter=max_iter)
        elasticnetcv.fit(X_scaled, y_train.values.ravel())
        # Set optimal alpha and l1_ratio. Refit model
        alpha_opt = elasticnetcv.alpha_
        l1_ratio_opt = elasticnetcv.l1_ratio_
        linear_model = ElasticNet(fit_intercept=True,
                                  l1_ratio=l1_ratio_opt, alpha=alpha_opt,
                                  max_iter=max_iter)
        linear_model.fit(X_scaled, y_train)
        hyperparameters['l1_ratio'] = l1_ratio_opt

    # If more than 1 outcome present, perform multitask regression
    elif regularization_type == 'elasticnet' and y.shape[1] > 1:
        multi_elasticnet_CV = MultiTaskElasticNetCV(fit_intercept=True,
                                                    cv=k_fold,
                                                    l1_ratio=l1_ratio,
                                                    max_iter=max_iter)
        multi_elasticnet_CV.fit(X_scaled, y_train)
        # Set optimal alpha and l1_ratio. Refit model
        alpha_opt = multi_elasticnet_CV.alpha_
        l1_ratio_opt = multi_elasticnet_CV.l1_ratio_
        linear_model = MultiTaskElasticNet(fit_intercept=True,
                                           max_iter=max_iter)
        linear_model.set_params(alpha=alpha_opt, l1_ratio=l1_ratio_opt)
        linear_model.fit(X_scaled, y_train)
        hyperparameters['l1_ratio'] = l1_ratio_opt
    else:
        raise NotImplementedError

    # Evaluate on the test split, standardized with the training statistics.
    y_pred = linear_model.predict((X_test-mu)/s)
    Rsq = linear_model.score((X_test - mu) / s, y_test)
    # Compute 95% confidence interval
    # Multioutput = 'raw_values' provides prediction error per output
    # (loop names chosen so they do not shadow the ``y`` argument)
    pred_actual_ratio = [pred/actual
                         for pred, actual in zip(y_pred, np.array(y_test))]
    relative_prediction_error = 1.96*np.sqrt(
        mean_squared_error(np.ones(y_pred.shape), pred_actual_ratio,
                           multioutput='raw_values')/y_pred.shape[0])
    hyperparameters['alpha'] = alpha_opt
    return linear_model, mu, s, relative_prediction_error, Rsq, hyperparameters
p(mean_squared_error(lasso_predict, Y_test)) # ## Ridge # # In[25]: ridge_model = Ridge(alpha=0.01) ridge_model = ridge_model.fit(X=X_train, y=Y_train) ridge_predict = ridge_model.predict(X_test) p(mean_absolute_error(ridge_predict, Y_test)) p(mean_squared_error(ridge_predict, Y_test)) # ## Elastic Net # In[27]: enet_params = { 'alpha': [1e-7], } enet_model = MultiTaskElasticNetCV(alphas=enet_params['alpha']) enet_model = enet_model.fit(X=X_train, y=Y_train) enet_predict = enet_model.predict(X_test) p(mean_absolute_error(enet_predict, Y_test)) p(mean_squared_error(enet_predict, Y_test))
lastX = np.zeros((X_raw.shape[0], hiddenSize)) for i in range(epochs/quanta): print 'Epoch: ', i*quanta an.trainSupervised(quanta, trndata, initialLearningrate=learningrate, decay=1,#0.999, myWeightdecay=weightDecay, momentum=momentum) netTrainFs.append(an.scoreOnDS(trndata)) X, X_test = an.transform(X_raw), an.transform(X_test_raw) if (lastX == X).all(): raise 'problem' lastX = copy.deepcopy(X) clf = MultiTaskElasticNetCV() clf.fit(X, Y) predTrain = np.array(clf.predict(X)) splits = [] for col in range(predTrain.shape[1]): bestSplit, bestF1 = labanUtil.getSplitThreshold(predTrain[:, col], Y[:, col]) splits.append(bestSplit) pred = np.array(clf.predict(X_test)) for col in range(pred.shape[1]): pred[:, col] = [1 if e>=splits[col] else 0 for e in pred[:, col]] predTrain[:, col] = [1 if e>=splits[col] else 0 for e in predTrain[:, col]] testFs.append(metrics.f1_score(Y_test, pred)) trainFs.append(metrics.f1_score(Y, predTrain)) #des+='\n EN test f1: '+ str(testF) #des+=' , EN train f1: '+ str(trainF) r = range(epochs/quanta)
# Test-set metrics for the predictions computed earlier in the script.
print "测试集MSE:", mean_squared_error(test_Y, test_Y_pred)
print "测试集RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred))
print "测试集R2:", r2_score(test_Y, test_Y_pred)

# Sums of squares and R^2 returned by ``xss`` for the earlier model on (X, Y).
tss, rss, ess, r2 = xss(Y, multiTaskElasticNet.predict(X))
print "TSS(Total Sum of Squares): ", tss
print "RSS(Residual Sum of Squares): ", rss
print "ESS(Explained Sum of Squares): ", ess
print "R^2: ", r2

print "\n**********测试MultiTaskElasticNetCV类**********"
# When initialising MultiTaskElasticNetCV we provide a set of candidate
# alpha values; the class selects a suitable alpha for us.
multiTaskElasticNetCV = MultiTaskElasticNetCV( alphas=[0.01, 0.1, 0.5, 1, 3, 5, 7, 10, 20, 100], cv=5)
# Fit the training set
multiTaskElasticNetCV.fit(train_X, train_Y)
# Print the best alpha value
print "最优的alpha值: ", multiTaskElasticNetCV.alpha_
# Print the model coefficients
print "系数:", multiTaskElasticNetCV.coef_
print "截距:", multiTaskElasticNetCV.intercept_
print '训练集R2: ', r2_score(train_Y, multiTaskElasticNetCV.predict(train_X))

# For linear regression models, the Mean Squared Error (MSE) or the
# Root Mean Squared Error (RMSE) on the test set is generally used to judge
# how good the model is.
test_Y_pred = multiTaskElasticNetCV.predict(test_X)
print "测试集得分:", multiTaskElasticNetCV.score(test_X, test_Y)
print "测试集MSE:", mean_squared_error(test_Y, test_Y_pred)
print "测试集RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred))
print "测试集R2:", r2_score(test_Y, test_Y_pred)