import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler
from tpot.export_utils import set_param_recursive

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'], random_state=42)

# Average CV score on the training set was: -0.9491441900056168
exported_pipeline = make_pipeline(
    SelectFromModel(
        estimator=ExtraTreesRegressor(max_features=0.8500000000000001, n_estimators=100),
        threshold=0.0),
    MaxAbsScaler(),
    LassoLarsCV(normalize=False))
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
vec = TfidfVectorizer()
clf.fit(vec.fit_transform(docs), y)
expl = explain_prediction(clf, docs[0], vec=vec)
assert 'supported' in expl.error


@pytest.mark.parametrize(['reg'], [
    [ElasticNet(random_state=42)],
    [ElasticNetCV(random_state=42)],
    [HuberRegressor()],
    [Lars()],
    [LarsCV(max_n_alphas=10)],
    [Lasso(random_state=42)],
    [LassoCV(n_alphas=10)],
    [LassoLars(alpha=0.1)],
    [LassoLarsCV(max_n_alphas=10)],
    [LassoLarsIC()],
    [LinearRegression()],
    [LinearRegression(fit_intercept=False)],
    [LinearSVR(random_state=42)],
    [OrthogonalMatchingPursuit(n_nonzero_coefs=10)],
    [OrthogonalMatchingPursuitCV()],
    [PassiveAggressiveRegressor(C=0.1)],
    [Ridge(random_state=42)],
    [RidgeCV()],
    [SGDRegressor(**SGD_KWARGS)],
    [TheilSenRegressor()],
    [SVR(kernel='linear')],
    [NuSVR(kernel='linear')],
])
def test_explain_linear_regression(boston_train, reg):
def set_learning_method(config, X_train, y_train):
    """
    Instantiates the scikit-learn class corresponding to the value set in the
    configuration file for running the learning method.

    TODO: use reflection to instantiate the classes

    @param config: configuration object
    @return: an estimator with fit() and predict() methods
    """
    estimator = None
    learning_cfg = config.get("learning", None)
    if learning_cfg:
        p = learning_cfg.get("parameters", None)
        o = learning_cfg.get("optimize", None)
        scorers = \
            set_scorer_functions(learning_cfg.get("scorer", ['mae', 'rmse']))

        method_name = learning_cfg.get("method", None)  # get the method name
        if method_name == "SVR":
            if o:
                tune_params = set_optimization_params(o)
                print(tune_params)
                estimator = optimize_model(SVR(), X_train, y_train,
                                           tune_params, scorers,
                                           o.get("cv", 5),
                                           o.get("verbose", True),
                                           o.get("n_jobs", 1))
            elif p:
                estimator = SVR(C=p.get("C", 10),
                                epsilon=p.get('epsilon', 0.01),
                                kernel=p.get('kernel', 'rbf'),
                                degree=p.get('degree', 3),
                                gamma=p.get('gamma', 0.0034),
                                tol=p.get('tol', 1e-3),
                                verbose=False)
            else:
                estimator = SVR()

        elif method_name == "RF":  # RandomForest
            if o:
                tune_params = set_optimization_params(o)
                estimator = optimize_model(RandomForestRegressor(), X_train,
                                           y_train, tune_params, scorers,
                                           o.get("cv", 5),
                                           o.get("verbose", True),
                                           o.get("n_jobs", 1))
            elif p:
                pass
            else:
                estimator = RandomForestRegressor(n_estimators=200, n_jobs=-1)

        elif method_name == "GB":  # Gradient Boosting
            if o:
                pass
            elif p:
                pass
            else:
                estimator = GradientBoostingRegressor()

        elif method_name == "GP":  # GaussianProcess
            if o:
                pass
            elif p:
                pass
            else:
                estimator = GaussianProcessRegressor()

        elif method_name == "MLP":  # MLP
            if o:
                pass
            elif p:
                pass
            else:
                estimator = MLPRegressor()

        elif method_name == "Lasso":
            if o:
                tune_params = set_optimization_params(o)
                estimator = optimize_model(Lasso(), X_train, y_train,
                                           tune_params, scorers,
                                           o.get("cv", 5),
                                           o.get("verbose", True),
                                           o.get("n_jobs", 1))
            elif p:
                estimator = Lasso(alpha=p.get('alpha', 1.0))
            else:
                estimator = Lasso()

        elif method_name == "SVC":
            if o:
                tune_params = set_optimization_params(o)
                estimator = optimize_model(SVC(), X_train, y_train,
                                           tune_params, scorers,
                                           o.get('cv', 5),
                                           o.get('verbose', True),
                                           o.get('n_jobs', 1))
            elif p:
                estimator = SVC(C=p.get('C', 1.0),
                                kernel=p.get('kernel', 'rbf'),
                                degree=p.get('degree', 3),
                                gamma=p.get('gamma', 0.0),
                                coef0=p.get('coef0', 0.0),
                                tol=p.get('tol', 1e-3),
                                verbose=p.get('verbose', False))
            else:
                estimator = SVC()

        elif method_name == "LassoCV":
            if p:
                estimator = LassoCV(eps=p.get('eps', 1e-3),
                                    n_alphas=p.get('n_alphas', 100),
                                    normalize=p.get('normalize', False),
                                    precompute=p.get('precompute', 'auto'),
                                    max_iter=p.get('max_iter', 1000),
                                    tol=p.get('tol', 1e-4),
                                    cv=p.get('cv', 10),
                                    verbose=False)
            else:
                estimator = LassoCV()

        elif method_name == "LassoLars":
            if o:
                tune_params = set_optimization_params(o)
                estimator = optimize_model(LassoLars(), X_train, y_train,
                                           tune_params, scorers,
                                           o.get("cv", 5),
                                           o.get("verbose", True),
                                           o.get("n_jobs", 1))
            elif p:
                estimator = LassoLars(alpha=p.get('alpha', 1.0),
                                      fit_intercept=p.get('fit_intercept', True),
                                      verbose=p.get('verbose', False),
                                      normalize=p.get('normalize', True),
                                      max_iter=p.get('max_iter', 500),
                                      fit_path=p.get('fit_path', True))
            else:
                estimator = LassoLars()

        elif method_name == "LassoLarsCV":
            if p:
                estimator = LassoLarsCV(max_iter=p.get('max_iter', 500),
                                        normalize=p.get('normalize', True),
                                        max_n_alphas=p.get('max_n_alphas', 1000),
                                        n_jobs=p.get('n_jobs', 1),
                                        cv=p.get('cv', 10),
                                        verbose=False)
            else:
                estimator = LassoLarsCV()

    return estimator, scorers
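# A minimal usage sketch for set_learning_method(), assuming a dict-like config
# shaped the way the function reads it ("learning" -> "method" / "parameters" /
# "optimize"). The keys and values below are illustrative, not taken from the
# original project's configuration files.
example_config = {
    "learning": {
        "method": "LassoLarsCV",
        "scorer": ["mae", "rmse"],
        "parameters": {"max_iter": 500, "cv": 10},
    }
}
# estimator, scorers = set_learning_method(example_config, X_train, y_train)
# estimator.fit(X_train, y_train)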
def get_regression_scores(X_train, X_test, Y_train, Y_test):
    pipelines = []
    pipelines.append(('ScaledLR', Pipeline([('Scaler', StandardScaler()), ('LR', LinearRegression())])))
    pipelines.append(('ScaledRIDGE', Pipeline([('Scaler', StandardScaler()), ('RIDGE', Ridge())])))
    pipelines.append(('ScaledLASSO', Pipeline([('Scaler', StandardScaler()), ('LASSO', Lasso())])))
    pipelines.append(('ScaledLASSOCV', Pipeline([('Scaler', StandardScaler()), ('LASSOCV', LassoCV())])))
    pipelines.append(('ScaledLASSOLarsCV', Pipeline([('Scaler', StandardScaler()), ('LASSOLarsCV', LassoLarsCV())])))
    pipelines.append(('ScaledEN', Pipeline([('Scaler', StandardScaler()), ('EN', ElasticNet())])))
    pipelines.append(('ScaledBAYESIAN', Pipeline([('Scaler', StandardScaler()), ('BAYESIAN', BayesianRidge())])))
    pipelines.append(('ScaledKNN', Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsRegressor())])))
    pipelines.append(('ScaledCART', Pipeline([('Scaler', StandardScaler()), ('CART', DecisionTreeRegressor())])))
    pipelines.append(('ScaledGBM', Pipeline([('Scaler', StandardScaler()), ('GBM', GradientBoostingRegressor())])))

    results = []
    names = []
    for name, model in pipelines:
        ts = time.time()
        # shuffle=True is required when a random_state is passed to KFold
        kfold = KFold(n_splits=10, shuffle=True, random_state=21)
        cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='neg_mean_squared_error')
        cv_results_abs = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='neg_mean_absolute_error')
        # cv_results_sq_log = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='neg_mean_squared_log_error')
        cv_results_median_abs = cross_val_score(
            model, X_train, Y_train, cv=kfold, scoring='neg_median_absolute_error')
        cv_r2 = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='r2')
        cv_explained_variance = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='explained_variance')
        ts_2 = time.time()

        results.append(cv_results)
        names.append(name)
        msg = "%f (%f)" % (cv_results.mean(), cv_results.std())
        msg_abs = "%f (%f)" % (cv_results_abs.mean(), cv_results_abs.std())
        # msg_sq_log = "%f (%f)" % (cv_results_sq_log.mean(), cv_results_sq_log.std())
        msg_median_abs = "%f (%f)" % (cv_results_median_abs.mean(), cv_results_median_abs.std())
        msg_r2 = "%f (%f)" % (cv_r2.mean(), cv_r2.std())
        msg_explained_variance = "%f (%f)" % (cv_explained_variance.mean(), cv_explained_variance.std())

        print(name)
        print(msg_explained_variance)
        print(msg_abs)
        print(msg)
        # print(msg_sq_log)
        print(msg_median_abs)
        print(msg_r2)
        print("%f" % (ts_2 - ts))
        print('\n')
predictors['DMARRIED0'] = preprocessing.scale(
    predictors['DMARRIED0'].astype('float64'))
predictors['DMARRIED1'] = preprocessing.scale(
    predictors['DMARRIED1'].astype('float64'))
predictors['DUNCOV0'] = preprocessing.scale(
    predictors['DUNCOV0'].astype('float64'))
predictors['DUNCOV1'] = preprocessing.scale(
    predictors['DUNCOV1'].astype('float64'))

# split data into train and test sets
pred_train, pred_test, resp_train, resp_test = train_test_split(
    predictors, target, test_size=.3, random_state=123)

# specify the lasso regression model
# precompute=True is helpful for large data sets
model = LassoLarsCV(cv=10, precompute=True).fit(pred_train, resp_train)

# print variable names and regression coefficients
dict(zip(predictors.columns, model.coef_))

# plot coefficient progression
m_log_alphas = -np.log10(model.alphas_)
ax = plt.gca()  # set up axes
# alpha on the x axis, change in regression coefficients on the y axis
plt.plot(m_log_alphas, model.coef_path_.T)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.ylabel('Regression Coefficients')
plt.xlabel('-log(alpha)')
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was: -109.53604510235976
exported_pipeline = make_pipeline(
    make_union(
        StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=True,
                                                        max_features=0.3,
                                                        min_samples_leaf=11,
                                                        min_samples_split=18,
                                                        n_estimators=100)),
        make_pipeline(
            make_union(
                make_union(
                    make_union(
                        StackingEstimator(estimator=make_pipeline(
                            StandardScaler(),
                            SelectPercentile(score_func=f_regression, percentile=20),
                            MaxAbsScaler(),
                            RidgeCV())),
                        FunctionTransformer(copy)),
                    FunctionTransformer(copy)),
                StandardScaler()),
            MaxAbsScaler(),
            StackingEstimator(estimator=RidgeCV()),
            PolynomialFeatures(degree=2, include_bias=False, interaction_only=False))),
    LassoLarsCV(normalize=False))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
ivars = []
ivars2 = []
depvars = []
columns = []

for pyear in player_years:
    ivars.append([pt_projs[pyear][system] for system in proj_systems])
    depvars.append(pt_actuals[pyear]['actual'])

for pyear in pt_projs_curr.keys():
    ivars2.append(
        [pt_projs_curr[pyear][system] for system in proj_systems])

x = numpy.array(ivars)
x2 = numpy.array(ivars2)
y = numpy.array(depvars)

model_pt = LassoLarsCV(cv=cv_num)
model_pt.fit(x, y)

print("Rough PT model, to choose sample")
for system, coef in zip(proj_systems, model_pt.coef_):
    print("%40s : %f" % (system, coef))
print("%40s : %f" % ('intercept', model_pt.intercept_))

sample_proj_pt_arr = model_pt.predict(x)
curr_proj_pt_arr = model_pt.predict(x2)
sample_proj_pt = dict(zip(player_years, sample_proj_pt_arr))
curr_proj_pt = dict(zip(pt_projs_curr.keys(), curr_proj_pt_arr))

models = {}
def _build_distance_estimator(X, y, w2v, PoS, NER, regressor, verbose=1): """Build a vector reprensation of a pair of signatures.""" if w2v == 'glove': PairVecTransformer = PairGloveTransformer elif w2v == 'spacy': PairVecTransformer = PairSpacyVecTransformer elif w2v == 'polyglot': PairVecTransformer = PairPolyglotVecTransformer else: print('error passing w2v argument value') if PoS == 'polyglot': get_nouns = polyglot_nouns get_verbs = polyglot_verbs get_words = polyglot_words get_particle = polyglot_particle get_interjection = polyglot_interjection get_symbol = polyglot_symbol get_numbers = polyglot_numbers get_proper_nouns = polyglot_proper_nouns get_pronouns = polyglot_pronouns get_auxiliary_verbs = polyglot_auxiliary_verbs get_adjectives = polyglot_adjectives get_adverbs = polyglot_adverbs get_punctuation = polyglot_punctuation get_determiner = polyglot_determiner get_coordinating_conjunction = polyglot_coordinating_conjunction get_adpositions = polyglot_adpositions get_others = polyglot_others get_subordinating_conjunctions = polyglot_subordinating_conjunctions elif PoS == 'spacy': get_nouns = spacy_noun get_verbs = spacy_verb get_words = spacy_tokens get_particle = spacy_part get_interjection = spacy_intj get_symbol = spacy_sym get_numbers = spacy_num get_proper_nouns = spacy_propn get_pronouns = spacy_pron get_auxiliary_verbs = spacy_aux get_adjectives = spacy_adj get_adverbs = spacy_adv get_punctuation = spacy_punct get_determiner = spacy_det get_coordinating_conjunction = spacy_conj get_adpositions = spacy_adp get_others = spacy_x get_subordinating_conjunctions = spacy_sconj else: print('error passing PoS argument value') transformer = FeatureUnion([ ("get_nouns", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer( dtype=None, func=get_nouns), groupby=None)), ('sop', SmallerOtherParing()), ('pgt', PairVecTransformer()), ('rgpc', RefGroupPairCosine()), ('gm', GetMatches()), ('sd', SolveDuplicate()), ('ac', AvgPOSCombiner()), ])), ("get_verbs", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer( dtype=None, func=get_verbs), groupby=None)), ('sop', SmallerOtherParing()), ('pgt', PairVecTransformer()), ('rgpc', RefGroupPairCosine()), ('gm', GetMatches()), ('sd', SolveDuplicate()), ('ac', AvgPOSCombiner()), ])), ("get_words", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer( dtype=None, func=get_words), groupby=None)), ('sop', SmallerOtherParing()), ('pgt', PairVecTransformer()), ('rgpc', RefGroupPairCosine()), ('gm', GetMatches()), ('sd', SolveDuplicate()), ('ac', AvgPOSCombiner()), ])), ("get_particle", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer( dtype=None, func=get_particle), groupby=None)), ('sop', SmallerOtherParing()), ('pgt', PairVecTransformer()), ('rgpc', RefGroupPairCosine()), ('gm', GetMatches()), ('sd', SolveDuplicate()), ('ac', AvgPOSCombiner()), ])), ("get_interjection", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer( dtype=None, func=get_interjection), groupby=None)), ('sop', SmallerOtherParing()), ('pgt', PairVecTransformer()), ('rgpc', RefGroupPairCosine()), ('gm', GetMatches()), ('sd', SolveDuplicate()), ('ac', AvgPOSCombiner()), ])), ("get_symbol", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer( dtype=None, func=get_symbol), groupby=None)), ('sop', SmallerOtherParing()), ('pgt', PairVecTransformer()), ('rgpc', 
RefGroupPairCosine()), ('gm', GetMatches()), ('sd', SolveDuplicate()), ('ac', AvgPOSCombiner()), ])), ("num_diff", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=Pipeline([ ("rsn", FuncTransformer(func=replace_spelled_numbers)), ("get_num", FuncTransformer(func=get_numbers)), ("to_num", FuncTransformer(func=to_numeric)), ]), groupby=None)), ('1st_nm_comb', NumCombiner()), ])), ("get_proper_nouns", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer( dtype=None, func=get_proper_nouns), groupby=None)), ('sop', SmallerOtherParing()), ('pgt', PairVecTransformer()), ('rgpc', RefGroupPairCosine()), ('gm', GetMatches()), ('sd', SolveDuplicate()), ('ac', AvgPOSCombiner()), ])), ("get_pronouns", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer( dtype=None, func=get_pronouns), groupby=None)), ('sop', SmallerOtherParing()), ('pgt', PairVecTransformer()), ('rgpc', RefGroupPairCosine()), ('gm', GetMatches()), ('sd', SolveDuplicate()), ('ac', AvgPOSCombiner()), ])), ("get_auxiliary_verbs", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer( dtype=None, func=get_auxiliary_verbs), groupby=None)), ('sop', SmallerOtherParing()), ('pgt', PairVecTransformer()), ('rgpc', RefGroupPairCosine()), ('gm', GetMatches()), ('sd', SolveDuplicate()), ('ac', AvgPOSCombiner()), ])), ("adjectives_glove", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer( dtype=None, func=get_adjectives), groupby=None)), ('sop', SmallerOtherParing()), ('pgt', PairVecTransformer()), ('rgpc', RefGroupPairCosine()), ('gm', GetMatches()), ('sd', SolveDuplicate()), ('ac', AvgPOSCombiner()), ])), ("adverbs_glove", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer( dtype=None, func=get_adverbs), groupby=None)), ('sop', SmallerOtherParing()), ('pgt', PairVecTransformer()), ('rgpc', RefGroupPairCosine()), ('gm', GetMatches()), ('sd', SolveDuplicate()), ('ac', AvgPOSCombiner()), ])), ("get_punctuation", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer( dtype=None, func=get_punctuation), groupby=None)), ('sop', SmallerOtherParing()), ('pgt', PairVecTransformer()), ('rgpc', RefGroupPairCosine()), ('gm', GetMatches()), ('sd', SolveDuplicate()), ('ac', AvgPOSCombiner()), ])), ("get_determiner", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer( dtype=None, func=get_determiner), groupby=None)), ('sop', SmallerOtherParing()), ('pgt', PairVecTransformer()), ('rgpc', RefGroupPairCosine()), ('gm', GetMatches()), ('sd', SolveDuplicate()), ('ac', AvgPOSCombiner()), ])), ("get_coordinating_conjunction", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer( dtype=None, func=get_coordinating_conjunction), groupby=None)), ('sop', SmallerOtherParing()), ('pgt', PairVecTransformer()), ('rgpc', RefGroupPairCosine()), ('gm', GetMatches()), ('sd', SolveDuplicate()), ('ac', AvgPOSCombiner()), ])), ("get_adpositions", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer( dtype=None, func=get_adpositions), groupby=None)), ('sop', SmallerOtherParing()), ('pgt', PairVecTransformer()), ('rgpc', RefGroupPairCosine()), ('gm', GetMatches()), ('sd', SolveDuplicate()), ('ac', AvgPOSCombiner()), ])), ("get_others", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer( dtype=None, 
func=get_others), groupby=None)), ('sop', SmallerOtherParing()), ('pgt', PairVecTransformer()), ('rgpc', RefGroupPairCosine()), ('gm', GetMatches()), ('sd', SolveDuplicate()), ('ac', AvgPOSCombiner()), ])), ("get_subordinating_conjunctions", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer( dtype=None, func=get_subordinating_conjunctions), groupby=None)), ('sop', SmallerOtherParing()), ('pgt', PairVecTransformer()), ('rgpc', RefGroupPairCosine()), ('gm', GetMatches()), ('sd', SolveDuplicate()), ('ac', AvgPOSCombiner()), ])), ("spacy_eol", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer( dtype=None, func=spacy_eol), groupby=None)), ('sop', SmallerOtherParing()), ('pgt', PairVecTransformer()), ('rgpc', RefGroupPairCosine()), ('gm', GetMatches()), ('sd', SolveDuplicate()), ('ac', AvgPOSCombiner()), ])), ("spacy_space", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer( dtype=None, func=spacy_space), groupby=None)), ('sop', SmallerOtherParing()), ('pgt', PairVecTransformer()), ('rgpc', RefGroupPairCosine()), ('gm', GetMatches()), ('sd', SolveDuplicate()), ('ac', AvgPOSCombiner()), ])), ("spacy_organizations", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer( dtype=None, func=spacy_organizations), groupby=None)), ('sop', SmallerOtherParing()), ('pgt', PairVecTransformer()), ('rgpc', RefGroupPairCosine()), ('gm', GetMatches()), ('sd', SolveDuplicate()), ('ac', AvgPOSCombiner()), ])), ("spacy_persons", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer( dtype=None, func=spacy_persons), groupby=None)), ('sop', SmallerOtherParing()), ('pgt', PairVecTransformer()), ('rgpc', RefGroupPairCosine()), ('gm', GetMatches()), ('sd', SolveDuplicate()), ('ac', AvgPOSCombiner()), ])), ("spacy_locations", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer( dtype=None, func=spacy_locations), groupby=None)), ('sop', SmallerOtherParing()), ('pgt', PairVecTransformer()), ('rgpc', RefGroupPairCosine()), ('gm', GetMatches()), ('sd', SolveDuplicate()), ('ac', AvgPOSCombiner()), ])), ("spacy_groups", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer( dtype=None, func=spacy_groups), groupby=None)), ('sop', SmallerOtherParing()), ('pgt', PairVecTransformer()), ('rgpc', RefGroupPairCosine()), ('gm', GetMatches()), ('sd', SolveDuplicate()), ('ac', AvgPOSCombiner()), ])), ("spacy_facilities", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer( dtype=None, func=spacy_facilities), groupby=None)), ('sop', SmallerOtherParing()), ('pgt', PairVecTransformer()), ('rgpc', RefGroupPairCosine()), ('gm', GetMatches()), ('sd', SolveDuplicate()), ('ac', AvgPOSCombiner()), ])), ("spacy_geo_locations", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer( dtype=None, func=spacy_geo_locations), groupby=None)), ('sop', SmallerOtherParing()), ('pgt', PairVecTransformer()), ('rgpc', RefGroupPairCosine()), ('gm', GetMatches()), ('sd', SolveDuplicate()), ('ac', AvgPOSCombiner()), ])), ("spacy_products", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer( dtype=None, func=spacy_products), groupby=None)), ('sop', SmallerOtherParing()), ('pgt', PairVecTransformer()), ('rgpc', RefGroupPairCosine()), ('gm', GetMatches()), ('sd', SolveDuplicate()), ('ac', 
AvgPOSCombiner()), ])), ("spacy_events", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer( dtype=None, func=spacy_events), groupby=None)), ('sop', SmallerOtherParing()), ('pgt', PairVecTransformer()), ('rgpc', RefGroupPairCosine()), ('gm', GetMatches()), ('sd', SolveDuplicate()), ('ac', AvgPOSCombiner()), ])), ("spacy_work_of_arts", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer( dtype=None, func=spacy_work_of_arts), groupby=None)), ('sop', SmallerOtherParing()), ('pgt', PairVecTransformer()), ('rgpc', RefGroupPairCosine()), ('gm', GetMatches()), ('sd', SolveDuplicate()), ('ac', AvgPOSCombiner()), ])), ("spacy_laws", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer( dtype=None, func=spacy_laws), groupby=None)), ('sop', SmallerOtherParing()), ('pgt', PairVecTransformer()), ('rgpc', RefGroupPairCosine()), ('gm', GetMatches()), ('sd', SolveDuplicate()), ('ac', AvgPOSCombiner()), ])), ("spacy_languages", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer( dtype=None, func=spacy_languages), groupby=None)), ('sop', SmallerOtherParing()), ('pgt', PairVecTransformer()), ('rgpc', RefGroupPairCosine()), ('gm', GetMatches()), ('sd', SolveDuplicate()), ('ac', AvgPOSCombiner()), ])), ("sent_tfidf", Pipeline([ ("pairs", PairTransformer(element_transformer=Pipeline( [("1st_verb", FuncTransformer( func=get_text)), ("shaper", Shaper(newshape=(-1, ))), ("tf-idf", TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 3), dtype=np.float32, decode_error="replace", stop_words="english"))]))), ("combiner", CosineSimilarity()) ])), ("sent_len_diff", Pipeline(steps=[ ('pairtransformer', PairTransformer(element_transformer=FuncTransformer(dtype=None, func=len), groupby=None)), ('abs_diff', AbsoluteDifference()), ])), ]) # Train a classifier on these vectors if regressor == 'lasso': classifier = LassoLarsCV(cv=5, max_iter=512, n_jobs=-1) elif regressor == 'RF': classifier = RandomForestRegressor(n_jobs=-1, max_depth=8, n_estimators=1024) else: print('Error passing the regressor type') # Return the whole pipeline estimator = Pipeline([("transformer", transformer), ("classifier", classifier)]).fit(X, y) return estimator
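# A hedged usage sketch for _build_distance_estimator(). The pair arrays, the
# hypothetical loader names, and the 'spacy'/'lasso' argument values below are
# placeholders that simply select branches defined above; nothing here comes
# from the original experiments.
# pairs_train = load_signature_pairs(...)        # hypothetical loader
# distances_train = load_pair_distances(...)     # hypothetical loader
# estimator = _build_distance_estimator(pairs_train, distances_train,
#                                       w2v='spacy', PoS='spacy', NER='spacy',
#                                       regressor='lasso', verbose=1)
# predicted_distances = estimator.predict(pairs_test)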
color='k', label='alpha: CV estimate') plt.legend() plt.xlabel('-log(alpha)') plt.ylabel('Mean square error') plt.title('Mean square error on each fold: coordinate descent ' '(train time: %.2fs)' % t_cv) plt.axis('tight') plt.ylim(2300, 4000) plt.show() # ############################################################################# # LassoLarsCV: least angle regression t1 = time.time() model = LassoLarsCV(cv=10) model.fit(x, y) t_lasso_lars_cv = time.time() - t1 alphas_log = -np.log10(model.cv_alphas_) plt.figure() plt.plot(alphas_log, model.mse_path_, ':') plt.plot(alphas_log, model.mse_path_.mean(axis=-1), 'k', label='Average across the folds', linewidth=2) plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k', label='alpha CV')
#!/usr/bin/env python
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoLarsCV

data = pd.read_csv("dataset.csv", header=0)
X = data.loc[:, ["Commune", "Etage", "Superficie", "Piece"]].values
Y = data.loc[:, "Prix"].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

regressor = LassoLarsCV(cv=15)
regressor.fit(X_train, Y_train)

score = regressor.score(X_test, Y_test)
print(score)
build_auto( BaggingRegressor(DecisionTreeRegressor(random_state=13, min_samples_leaf=5), random_state=13, n_estimators=3, max_features=0.5), "DecisionTreeEnsembleAuto") build_auto(DummyRegressor(strategy="median"), "DummyAuto") build_auto(ElasticNetCV(random_state=13), "ElasticNetAuto") build_auto(ExtraTreesRegressor(random_state=13, min_samples_leaf=5), "ExtraTreesAuto") build_auto(GradientBoostingRegressor(random_state=13, init=None), "GradientBoostingAuto") build_auto(HuberRegressor(), "HuberAuto") build_auto(LarsCV(), "LarsAuto") build_auto(LassoCV(random_state=13), "LassoAuto") build_auto(LassoLarsCV(), "LassoLarsAuto") build_auto(OptimalLGBMRegressor(objective="regression", n_estimators=17, num_iteration=11), "LGBMAuto", num_iteration=11) build_auto(LinearRegression(), "LinearRegressionAuto") build_auto( BaggingRegressor(LinearRegression(), random_state=13, max_features=0.75), "LinearRegressionEnsembleAuto") build_auto(OrthogonalMatchingPursuitCV(), "OMPAuto") build_auto(RandomForestRegressor(random_state=13, min_samples_leaf=3), "RandomForestAuto", flat=True) build_auto(RidgeCV(), "RidgeAuto")
## Scikit Learn ## ##################################################################### lasso_model = LassoCV() lasso_model.fit(x_train_values, y_train_values) lasso_model_predictions = lasso_model.predict(x_test_values) generate_submission_file(lasso_model_predictions, test_data["Id"], "../results/" + user + "_LassoCV.csv") lars_model = LarsCV() lars_model.fit(x_train_values, y_train_values) lars_model_predictions = lars_model.predict(x_test_values) generate_submission_file(lars_model_predictions, test_data["Id"], "../results/" + user + "_LarsCV.csv") lassolars_model = LassoLarsCV() lassolars_model.fit(x_train_values, y_train_values) lassolars_model_predictions = lassolars_model.predict(x_test_values) generate_submission_file(lassolars_model_predictions, test_data["Id"], "../results/" + user + "_LassoLarsCV.csv") en_model = ElasticNetCV() en_model.fit(x_train_values, y_train_values) en_model_predictions = en_model.predict(x_test_values) generate_submission_file(en_model_predictions, test_data["Id"], "../results/" + user + "_ElasticNetCV.csv") ##################################################################### ## XGBoost ## #####################################################################
level_1_models = [ XgbWrapper(seed=SEED, params=xgb_params1, cv_fold=4), XgbWrapper(seed=SEED, params=xgb_params2, cv_fold=4), #XgbWrapper(seed=SEED, params=xgb_params3), XgbWrapper(seed=SEED, params=xgb_params4, cv_fold=4) ] # level_1_models = level_1_models + [SklearnWrapper(clf=KNeighborsRegressor, params=knr_params1), # SklearnWrapper(clf=KNeighborsRegressor, params=knr_params2), # SklearnWrapper(clf=KNeighborsRegressor, params=knr_params3), # SklearnWrapper(clf=KNeighborsRegressor, params=knr_params4)] level_1_models = level_1_models + [ SklearnWrapper(make_pipeline(ZeroCount(), LassoLarsCV(normalize=True))), #LB 0.55797 SklearnWrapper( make_pipeline( StackingEstimator(estimator=LassoLarsCV(normalize=True)), StackingEstimator( estimator=GradientBoostingRegressor(learning_rate=0.001, loss="huber", max_depth=3, max_features=0.55, min_samples_leaf=18, min_samples_split=14, subsample=0.7)), LassoLarsCV())) ] params_list = [
pl.imshow(bg.get_data()[:, :, 10], interpolation="nearest", cmap='gray') pl.imshow(np.ma.masked_less(sbrain.get_data()[:, :, 10], 1e-6), interpolation="nearest", cmap='hot') plot_lines(contour[:, :, 10]) pl.axis('off') pl.subplots_adjust(left=0., right=1., bottom=0., top=1.) pl.savefig('encoding_scores.pdf') pl.savefig('encoding_scores.eps') pl.clf() ### Compute receptive fields from sklearn.linear_model import LassoLarsCV lasso = LassoLarsCV(max_iter=10, ) p = (4, 2) # Mask for chosen pixel pixmask = np.zeros((10, 10), dtype=bool) pixmask[p] = 1 for index in [1780, 1951, 2131, 1935]: rf = lasso.fit(y_train, X_train[:, index]).coef_.reshape(10, 10) pl.figure(figsize=(8, 8)) pl.imshow(rf, vmin=0, interpolation="nearest", cmap='hot') plot_lines(pixmask, linewidth=6) pl.axis('off') pl.subplots_adjust(left=0., right=1., bottom=0., top=1.) pl.savefig('encoding_%d.pdf' % index) pl.savefig('encoding_%d.eps' % index)
# pro = classify_model_001.predict_proba(X_testset_001[i])
# print(pro[0])
print(class_one, predict_one)
print(class_two, predict_two)

## Build the regression models
### Build the regression model for targets below 0.003
from sklearn.linear_model import BayesianRidge, RANSACRegressor, RidgeCV, Ridge, LassoLarsCV

X_trainset_0003 = []
y_trainset_0003 = []
for i in range(len(y_trainset)):
    if y_trainset[i] < 0.003:
        X_trainset_0003.append(X_trainset[i])
        y_trainset_0003.append(y_trainset[i])

reg_0003 = LassoLarsCV()
reg_0003.fit(X_trainset_0003, y_trainset_0003)

X_testset_0003 = []
y_testset_0003 = []
for i in range(len(y_testset)):
    if y_testset[i] < 0.003:
        X_testset_0003.append(X_testset[i])
        y_testset_0003.append(y_testset[i])

reg_0003_result = reg_0003.predict(X_testset_0003)

# accumulate absolute errors and report their mean on the low-value test subset
mse_0003 = 0.0
for i in range(len(y_testset_0003)):
    print(reg_0003_result[i], y_testset_0003[i])
    mse_0003 += abs(reg_0003_result[i] - y_testset_0003[i])
print(mse_0003 / len(y_testset_0003))
    X = check_array(X)
    X_transformed = np.copy(X)
    # add class probabilities as a synthetic feature
    if issubclass(self.estimator.__class__, ClassifierMixin) and hasattr(
            self.estimator, 'predict_proba'):
        X_transformed = np.hstack((self.estimator.predict_proba(X), X))

    # add class prediction as a synthetic feature
    X_transformed = np.hstack((np.reshape(self.estimator.predict(X), (-1, 1)),
                               X_transformed))
    return X_transformed


stacked_pipeline = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StackingEstimator(estimator=GradientBoostingRegressor(learning_rate=0.001,
                                                          loss="huber",
                                                          max_depth=3,
                                                          max_features=0.55,
                                                          min_samples_leaf=18,
                                                          min_samples_split=14,
                                                          subsample=0.7)),
    LassoLarsCV())
stacked_pipeline.fit(finaltrainset, y_train)
results = stacked_pipeline.predict(finaltestset)

'''R2 Score on the entire Train data when averaging'''
print('R2 score on train data:')
print(
# Split into training and test data to build a better prediction model
X_train, X_test, y_train, y_test = cv.train_test_split(predictors, y, test_size=0.2)
# print(X_train.shape, y_train.shape)
# print(X_test.shape, y_test.shape)

# Fitting the model
lr = LinearRegression()
lr.fit(X_train, y_train)
predictions = lr.predict(X_test)

lasso = Lasso(alpha=1)
res = lasso.fit(X_train, y_train)
# print("Coefficients lasso training fit of", res.coef_.tolist())
print('Lasso:', lasso)

# specify the lasso regression model
# 10-fold CV without a precomputed Gram matrix: each fold in turn serves as the
# validation set while the remaining 9 folds estimate the model
model = LassoLarsCV(cv=10, precompute=False).fit(X_train, y_train)

# print variable names and regression coefficients
# dict() builds the dictionary and zip() pairs column names with coefficients
print('Coefficients from lasso lars', dict(zip(X_train.columns, model.coef_)))

# Fit the regressor to the data
# las = lasso.fit(predictors, y)

# plot mean square error for each fold
print("Computing regularization path using the Lars lasso...")
m_log_alphascv = -np.log10(model.cv_alphas_)
# print("Log alphas:", m_log_alphascv, "MSE:", model.cv_mse_path_)
pyplot.figure()
pyplot.plot(m_log_alphascv, model.cv_mse_path_, ':')
pyplot.plot(m_log_alphascv, model.cv_mse_path_.mean(axis=-1), 'k',
            label='Average across the folds', linewidth=2)
plt.legend() plt.xlabel('-log(alpha)') plt.ylabel('Mean square error') plt.title('Mean square error on each fold: coordinate descent ' '(train time: %.2fs)' % t_lasso_cv) plt.axis('tight') plt.ylim(ymin, ymax) # ############################################################################# # LassoLarsCV: least angle regression # Compute paths print("Computing regularization path using the Lars lasso...") t1 = time.time() model = LassoLarsCV(cv=20).fit(X, y) t_lasso_lars_cv = time.time() - t1 # Display results m_log_alphas = -np.log10(model.cv_alphas_) plt.figure() plt.plot(m_log_alphas, model.mse_path_, ':') plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k', label='Average across the folds', linewidth=2) plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k', label='alpha CV') plt.legend() plt.xlabel('-log(alpha)') plt.ylabel('Mean square error')
rmse_l.append(rmse) cplx_l.append(cplx) fw = open(f'models/{dataname}_{algname}_{fold}_{it}.pkl', 'wb') pickle.dump(model, fw) fw.close() print(f'it: {it}, {rmse}, {cplx}') return dataset_l, algoritmo_l, fold_l, mae_l, rmse_l, cplx_l dataset_l, algoritmo_l, fold_l = [], [], [] mae_l, rmse_l, cplx_l = [], [], [] algname = 'IT-ELM (Lasso)' modelCV = LassoLarsCV(n_jobs=-1) model_fn = ITELM #for dataname in fnames: dataname = sys.argv[1] fold = int(sys.argv[2]) print(f'====================\nData set: {dataname}\n====================\n') dat_l, alg_l, f_l, ab_l, sq_l, cp_l = run_gridSearch(dataname, fold, model_fn, algname, modelCV) dataset_l += dat_l algoritmo_l += alg_l fold_l += f_l mae_l += ab_l rmse_l += sq_l
tvt_modifier_baseline_reps, tvt_modifier_return_mean ]) if rep in none_model_reps: predictions, model, results = validation_tools.make_predictions( train, validate, test, metrics, None, run_type=run_type) else: model_fname = f"./models/{d}__{s}__{rep}__model.pkl" if to_train == True: kfold = KFold(n_splits=10, random_state=42, shuffle=True) model_to_pass = LassoLarsCV(fit_intercept=True, normalize=True, n_jobs=-1, max_n_alphas=6000, cv=kfold) else: model_to_pass = joblib.load(model_fname) predictions, model, results = validation_tools.make_predictions( train, validate, test, metrics, model=model_to_pass, run_type=run_type, to_train=to_train) if to_train == True: joblib.dump(model, model_fname)
mseOLS = np.mean((bh['PRICE'] - lr.predict(x))**2)
R2OLS = lr.score(x, y)
print(mseOLS)  ## MSE of the OLS model ##
print(R2OLS)   ## R² of the OLS model ##

### 3) LARS ###
import time
import matplotlib.pyplot as plt
from sklearn.linear_model import LassoLarsCV
from sklearn import linear_model

## Computing regularization path using the Lars lasso... ##
t1 = time.time()
model = LassoLarsCV(cv=10).fit(x, y)
t_lasso_lars_cv = time.time() - t1

# Display results
m_log_alphas = -np.log10(model.cv_alphas_)

plt.figure()
plt.plot(m_log_alphas, model.mse_path_, ':')
plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.legend()
plt.xlabel(r"$\alpha$") plt.ylabel("Mean square error") plt.title( "Mean square error on each fold: coordinate descent (train time: %.2fs)" % t_lasso_cv) plt.axis("tight") plt.ylim(ymin, ymax) # ############################################################################# # LassoLarsCV: least angle regression # Compute paths print("Computing regularization path using the Lars lasso...") t1 = time.time() model = LassoLarsCV(cv=20, normalize=False).fit(X, y) t_lasso_lars_cv = time.time() - t1 # Display results plt.figure() plt.semilogx(model.cv_alphas_ + EPSILON, model.mse_path_, ":") plt.semilogx( model.cv_alphas_ + EPSILON, model.mse_path_.mean(axis=-1), "k", label="Average across the folds", linewidth=2, ) plt.axvline(model.alpha_, linestyle="--", color="k", label="alpha CV") plt.legend()
learning_rate=0.05, subsample=0.8), XGBRegressor(seed=0, n_estimators=500, max_depth=10, learning_rate=0.05, subsample=0.8, colsample_bytree=0.75), XGBRegressor(seed=0, n_estimators=500, max_depth=7, learning_rate=0.05, subsample=0.8, colsample_bytree=0.75), LassoCV(alphas=[1, 0.1, 0.001, 0.0005]), KNeighborsRegressor(n_neighbors=5), KNeighborsRegressor(n_neighbors=10), KNeighborsRegressor(n_neighbors=15), KNeighborsRegressor(n_neighbors=25), KNeighborsRegressor(n_neighbors=35), LassoLarsCV(), ElasticNet(), SVR() ] ensem = ensemble(n_folds=5, stacker=Ridge(), base_models=base_models) X_train, X_test, y_train = data_preprocess(train, test) y_pred, score = ensem.fit_predict(X_train, X_test, y_train) create_submission(np.expm1(y_pred), score)
return df_fea2, uni_feature df_fea2, uni_feature = featureSelectSVC(X, y) print(uni_feature) ## RandomizedLasso, feature stability selection from sklearn.linear_model import (RandomizedLasso, lasso_stability_path, LassoLarsCV) import warnings from sklearn.exceptions import ConvergenceWarning with warnings.catch_warnings(): warnings.simplefilter('ignore', UserWarning) warnings.simplefilter('ignore', ConvergenceWarning) lars_cv = LassoLarsCV(cv=6).fit(X, y) alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6) clf = RandomizedLasso(alpha=alphas, random_state=42).fit(X, y) names = df_merge3.columns.tolist()[:-1] print(sorted(zip(map(lambda x: round(x, 4), clf.scores_), names), reverse=True)) from sklearn.ensemble import ExtraTreesClassifier clf = ExtraTreesClassifier() clf = clf.fit(X, y) df_tree = pd.DataFrame(clf.feature_importances_) df_tree['fea_index'] = df_merge3.columns.tolist()[:-1] df_tree.columns = ["weight", "feature_index"] df_tree.sort_values("weight").tail(10)
def QuickML_Ensembling(X_train, y_train, X_test, y_test='', modeltype='Regression', Boosting_Flag=False, scoring='', verbose=0): """ Quickly builds and runs multiple models for a clean data set(only numerics). """ start_time = time.time() seed = 99 if len(X_train) <= 100000 or X_train.shape[1] < 50: NUMS = 100 FOLDS = 5 else: NUMS = 200 FOLDS = 10 ## create Voting models estimators = [] if modeltype == 'Regression': if scoring == '': scoring = 'neg_mean_squared_error' scv = ShuffleSplit(n_splits=FOLDS, random_state=seed) if Boosting_Flag is None: model5 = BaggingRegressor(DecisionTreeRegressor(random_state=seed), n_estimators=NUMS, random_state=seed) results1 = model5.fit(X_train, y_train).predict(X_test) if not isinstance(y_test, str): metrics1 = rmse(results1, y_test).mean() else: metrics1 = 0 estimators.append(('Bagging1', model5, metrics1)) else: model5 = LassoLarsCV(cv=scv) results1 = model5.fit(X_train, y_train).predict(X_test) if not isinstance(y_test, str): metrics1 = rmse(results1, y_test).mean() else: metrics1 = 0 estimators.append(('LassoLarsCV', model5, metrics1)) model6 = LassoCV(alphas=np.logspace(-10, -1, 50), cv=scv, random_state=seed) results2 = model6.fit(X_train, y_train).predict(X_test) if not isinstance(y_test, str): metrics2 = rmse(results2, y_test).mean() else: metrics2 = 0 estimators.append(('LassoCV', model6, metrics2)) model7 = RidgeCV(alphas=np.logspace(-10, -1, 50), cv=scv) results3 = model7.fit(X_train, y_train).predict(X_test) if not isinstance(y_test, str): metrics3 = rmse(results3, y_test).mean() else: metrics3 = 0 estimators.append(('RidgeCV', model7, metrics3)) ## Create an ensemble model #### if Boosting_Flag: model8 = BaggingRegressor(DecisionTreeRegressor(random_state=seed), n_estimators=NUMS, random_state=seed) results4 = model8.fit(X_train, y_train).predict(X_test) if not isinstance(y_test, str): metrics4 = rmse(results4, y_test).mean() else: metrics4 = 0 estimators.append(('Bagging2', model8, metrics4)) else: model8 = AdaBoostRegressor(base_estimator=DecisionTreeRegressor( min_samples_leaf=2, max_depth=1, random_state=seed), n_estimators=NUMS, random_state=seed) results4 = model8.fit(X_train, y_train).predict(X_test) if not isinstance(y_test, str): metrics4 = rmse(results4, y_test).mean() else: metrics4 = 0 estimators.append(('Boosting', model8, metrics4)) estimators_list = [(tuples[0], tuples[1]) for tuples in estimators] estimator_names = [tuples[0] for tuples in estimators] if verbose >= 2: print('QuickML_Ensembling Model results:') print( ' %s = %0.4f \n %s = %0.4f\n %s = %0.4f \n %s = %0.4f' % (estimator_names[0], metrics1, estimator_names[1], metrics2, estimator_names[2], metrics3, estimator_names[3], metrics4)) else: if scoring == '': scoring = 'accuracy' scv = StratifiedKFold(n_splits=FOLDS, random_state=seed) if Boosting_Flag is None: model5 = ExtraTreesClassifier(n_estimators=NUMS, min_samples_leaf=2, random_state=seed) results1 = model5.fit(X_train, y_train).predict(X_test) if not isinstance(y_test, str): metrics1 = accu(results1, y_test).mean() else: metrics1 = 0 estimators.append(('Bagging', model5, metrics1)) else: model5 = LogisticRegressionCV(Cs=np.linspace(0.01, 100, 20), cv=scv, scoring=scoring, random_state=seed) results1 = model5.fit(X_train, y_train).predict(X_test) if not isinstance(y_test, str): metrics1 = accu(results1, y_test).mean() else: metrics1 = 0 estimators.append(('Logistic Regression', model5, metrics1)) model6 = LinearDiscriminantAnalysis() results2 = model6.fit(X_train, y_train).predict(X_test) if not 
isinstance(y_test, str): metrics2 = accu(results2, y_test).mean() else: metrics2 = 0 estimators.append(('Linear Discriminant', model6, metrics2)) if modeltype == 'Binary_Classification': float_cols = X_train.columns[( X_train.dtypes == float).values].tolist() int_cols = X_train.columns[(X_train.dtypes == int).values].tolist() if (X_train[float_cols + int_cols] < 0).astype(int).sum().sum() > 0: model7 = DecisionTreeClassifier(max_depth=5) else: model7 = GaussianNB() else: float_cols = X_train.columns[( X_train.dtypes == float).values].tolist() int_cols = X_train.columns[(X_train.dtypes == int).values].tolist() if (X_train[float_cols + int_cols] < 0).astype(int).sum().sum() > 0: model7 = DecisionTreeClassifier(max_depth=5) else: model7 = MultinomialNB() results3 = model7.fit(X_train, y_train).predict(X_test) if not isinstance(y_test, str): metrics3 = accu(results3, y_test).mean() else: metrics3 = 0 estimators.append(('Naive Bayes', model7, metrics3)) if Boosting_Flag: #### If the Boosting_Flag is True, it means Boosting model is present. So choose a Bagging here. model8 = ExtraTreesClassifier(n_estimators=NUMS, min_samples_leaf=2, random_state=seed) results4 = model8.fit(X_train, y_train).predict(X_test) if not isinstance(y_test, str): metrics4 = accu(results4, y_test).mean() else: metrics4 = 0 estimators.append(('Bagging', model8, metrics4)) else: ## Create an ensemble model #### model8 = AdaBoostClassifier(base_estimator=DecisionTreeClassifier( random_state=seed, max_depth=1, min_samples_leaf=2), n_estimators=NUMS, random_state=seed) results4 = model8.fit(X_train, y_train).predict(X_test) if not isinstance(y_test, str): metrics4 = accu(results4, y_test).mean() else: metrics4 = 0 estimators.append(('Boosting', model8, metrics4)) estimators_list = [(tuples[0], tuples[1]) for tuples in estimators] estimator_names = [tuples[0] for tuples in estimators] if not isinstance(y_test, str): if verbose >= 2: print('QuickML_Ensembling Model results:') print( ' %s = %0.4f \n %s = %0.4f\n %s = %0.4f \n %s = %0.4f' % (estimator_names[0], metrics1, estimator_names[1], metrics2, estimator_names[2], metrics3, estimator_names[3], metrics4)) else: if verbose >= 1: print('QuickML_Ensembling completed.') stacks = np.c_[results1, results2, results3, results4] if verbose == 1: print(' Time taken for Ensembling: %0.1f seconds' % (time.time() - start_time)) return estimator_names, stacks #########################################################
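# A minimal sketch of calling QuickML_Ensembling() for regression, assuming
# X_train/X_test are numeric DataFrames and y_train/y_test are 1-d targets;
# the variable names and argument values here are illustrative only.
# names, stacked_preds = QuickML_Ensembling(X_train, y_train, X_test, y_test,
#                                           modeltype='Regression',
#                                           Boosting_Flag=None,
#                                           scoring='neg_mean_squared_error',
#                                           verbose=2)
# stacked_preds holds one column of test-set predictions per ensemble member,
# in the order given by the returned estimator names.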
def bagging_LassoLarsCV(X, Y, vrbl_names, n_estimators, p_smpl, n_jobs, max_n_estimators): from sklearn.model_selection import KFold, RepeatedKFold from sklearn.ensemble import BaggingRegressor from sklearn.linear_model import LassoLarsCV, LinearRegression cv = KFold(n_splits=5, shuffle=True) try: X = X.values except: pass try: Y = Y.values except: pass X = np.squeeze(X) Y = np.squeeze(Y) max_feats = int(X.shape[1]/3) eps = 2e-10 fitted_ensemble = BaggingRegressor( base_estimator=LassoLarsCV(cv=cv, eps=eps, max_iter=200, n_jobs=1), #base_estimator=LinearRegression(n_jobs=1), n_estimators=max_n_estimators, # Number of fittings max_samples=0.5, # Select 50% of training data per random sample max_features=max_feats, # Select N/3 variables randomly bootstrap=False, # bootstrap_features=False, oob_score=False, n_jobs=n_jobs, #8, random_state=70, verbose=1).fit(X, Y) all_sample_indices = np.arange(X.shape[0]) feature_indices = fitted_ensemble.estimators_features_ sample_indices = fitted_ensemble.estimators_samples_ outofs_indices = [] for i,smp in enumerate(sample_indices): out_sample = all_sample_indices[~np.isin(all_sample_indices, smp)] outofs_indices.append(out_sample) final_ensemble = [] for i, estimator in enumerate(fitted_ensemble.estimators_): f_indices = feature_indices[i] s_indices = sample_indices[i] o_indices = outofs_indices[i] a_indices = all_sample_indices true_indices = np.abs(estimator.coef_)>0 # Definition of success in fitting: at least one predictor # needs to be found if(true_indices.sum() > 0): estimator_predictors = vrbl_names[f_indices][true_indices] n_predictors = true_indices.sum() all_sample_score = calc_corr(Y[a_indices], estimator.predict(X[a_indices][:, f_indices])) # Append results and fitted models to the result list final_ensemble.append([estimator, estimator_predictors, f_indices, n_predictors, all_sample_score]) return final_ensemble
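# A hedged usage sketch for bagging_LassoLarsCV(); the predictor frame, target,
# and argument values below are placeholders rather than values from the
# original study. Note that in the visible body only max_n_estimators drives
# the ensemble size and the subsample fraction is fixed at 0.5, while
# n_estimators and p_smpl are accepted but not referenced.
# ensemble = bagging_LassoLarsCV(X=predictor_df, Y=target_series,
#                                vrbl_names=predictor_df.columns,
#                                n_estimators=100, p_smpl=0.5,
#                                n_jobs=4, max_n_estimators=100)
# for estimator, kept_names, feat_idx, n_predictors, score in ensemble:
#     print(n_predictors, score)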
'silent': 1 } # NOTE: Make sure that the class is labeled 'class' in the data file dtrain = xgb.DMatrix(train.drop('y', axis=1), y_train) dtest = xgb.DMatrix(test) num_boost_rounds = 1250 # train model model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds) y_pred = model.predict(dtest) '''Train the stacked models then predict the test data''' exported_pipeline = make_pipeline( StackingEstimator(estimator=LassoLarsCV(normalize=True)), StackingEstimator( estimator=GradientBoostingRegressor(learning_rate=0.00900000000000005, loss="huber", max_depth=6, max_features=0.69000000000000005, min_samples_leaf=16, min_samples_split=14, subsample=0.8000000000001)), StackingEstimator(DecisionTreeRegressor(max_depth=4, min_samples_leaf=6, min_samples_split=13)) ) exported_pipeline.fit(finaltrainset, y_train) results = exported_pipeline.predict(finaltestset) '''R2 Score on the entire Train data when averaging''' print('R2 score on train data:') print(r2_score(y_train, exported_pipeline.predict(finaltrainset) * 0.2855 + model.predict(dtrain) * 0.7145)) '''Average the preditionon test data of both models then save it on a csv file'''
np.sum(lasso_lars.coef_ != 0) lasso_lars = grid.best_estimator_ plt.scatter(range(X_poly.shape[1]), lasso_lars.coef_, c=np.sign(lasso_lars.coef_), cmap="bwr_r") ######## Yellowbrick from yellowbrick.regressor import AlphaSelection, ResidualsPlot, PredictionError from sklearn.linear_model import LassoLarsCV ### Find optimal alpha lassolars_yb = AlphaSelection(LassoLarsCV()) lassolars_yb.fit(X, y) lassolars_yb.poof() ### RVF plot lasso_yb = ResidualsPlot(lasso_lars, hist=True) lasso_yb.fit(X_train, y_train) lasso_yb.score(X_test, y_test) lasso_yb.poof() ### Prediction Error lasso_yb = PredictionError(lasso_lars, hist=True) lasso_yb.fit(X_train, y_train) lasso_yb.score(X_test, y_test)
predictors['x284_2012'] = preprocessing.scale(
    predictors['x284_2012'].astype('float64'))

# check that the predictors for the lasso regression are standardized
# (mean = 0, sd = 1) by printing their means and standard deviations
for stats in predictors:
    print(predictors[stats].describe())

# split data into train and test sets
pred_train, pred_test, tar_train, tar_test = train_test_split(predictors,
                                                              target,
                                                              test_size=.3,
                                                              random_state=123)

# specify the lasso regression model
model = LassoLarsCV(cv=10, precompute=False).fit(pred_train, tar_train)

# print variable names and regression coefficients
dict(zip(predictors.columns, model.coef_))

# dictionary of predictor variables retained in the model
dictionaryValues = dict(zip(predictors.columns, model.coef_))

# make a copy of predictors to work with the predictors retained by the lasso regression
predictorsNew = predictors.copy()

# list to contain the values retained by the lasso regression
listofVals = []
for name, v in dictionaryValues.items():
    if v != 0.0:
        print(v)
shuffle=False) x_train = ml.loc[train_index] y_train = ml_outs.loc[train_index] x_test = ml.loc[test_index] y_test = ml_outs.loc[test_index] # Scale scaler = StandardScaler().fit(x_train) x_train = scaler.transform(x_train) x_test = scaler.transform(x_test) # Implemnent Model linreg = Lars() # Better linreg = LarsCV() # one Better linreg = LassoLarsCV() # Same linreg = LinearRegression() linreg.fit(x_train, y_train) predictions = linreg.predict(x_test) # Plot predictions and y_test plt.figure() plt.plot(predictions, label='Predictions') plt.plot(pd.Series(predictions).rolling(5).mean(), label='rolling predictions') plt.plot(y_test.values, label='Shifted Currencies ( y_test values', color='grey') plt.plot(cu.loc[test_index, currency].values, label='UNSHIFTED') plt.legend() plt.show()