def _train(self):
    x = self._train_set.features
    y = self._train_set.outputs
    x, y = filter_outliers(x, y, n_estimators=200, contamination=0.003)

    pipe = pipeline.Pipeline([
        #('kselect', feature_selection.SelectKBest(feature_selection.f_regression, k=15)),
        ('expand', preprocessing.PolynomialFeatures()),
        ('estim', linear_model.LassoLars()),
    ])

    param_grid = [{
        'expand__include_bias': [False],
        'expand__degree': [3],
        'estim__normalize': [False],
        'estim__fit_intercept': [True],
        'estim__alpha': [0.313],
        #'estim__alpha': list(0.01 + 1 * i for i in range(0, 9))
        #'estim__l1_ratio': list(0.80 + 0.01 * i for i in range(0, 6))
    }]

    grid = model_selection.GridSearchCV(
        pipe, cv=3, n_jobs=1, param_grid=param_grid, verbose=1,
        scoring=metrics.make_scorer(metrics.mean_squared_error,
                                    greater_is_better=False))
    grid.fit(x, y)

    print(grid.best_estimator_)
    print(grid.cv_results_)

    estim = grid.best_estimator_.named_steps['estim']
    coeffs = estim.coef_
    polyexp = grid.best_estimator_.named_steps['expand']
    f_names = polyexp.get_feature_names(NAMES[1:])
    """
    for elem in sorted(zip(coeffs, f_names), reverse=True):
        print(elem)
    """

    self._model = grid.predict

def optimizeSVM(X_norm, y, kFolds=10):
    clf = pipeline.Pipeline([
        ('svc', svm.SVC(kernel='rbf')),
    ])
    # grid search over multiple hyperparameters
    parameters = {
        'svc__gamma': np.logspace(-3, 11, 8, base=2),
        'svc__C': np.logspace(-3, 15, 10, base=2),
    }
    gs = grid_search.GridSearchCV(clf, parameters, verbose=1, refit=False, cv=kFolds)
    gs.fit(X_norm, y)
    return gs.best_params_['svc__gamma'], gs.best_params_['svc__C'], gs.best_score_

def getTransformer(self, **params):
    # numvars = ["blood_pressure", "cholestoral", "max_heart_rate", "age"]
    # cateVars = ["cp", "thal"]
    ct = compose.ColumnTransformer(
        [
            # ("norm", preprocessing.StandardScaler(), self._getIndex(numvars)),
            # (
            #     "cate",
            #     preprocessing.OneHotEncoder(handle_unknown="ignore"),
            #     self.getIndex(cateVars),
            # ),
        ],
        remainder="passthrough",
    )
    transformer = pipeline.Pipeline([("norm", ct)])
    transformer.set_params(**params)
    return transformer

def optimizeAdaBoost(X_norm, y, clf, kFolds=10):
    clf = pipeline.Pipeline([
        ('ada', clf),
    ])
    # grid search over multiple hyperparameters
    parameters = {
        # 'ada__n_estimators': np.logspace(0, 3, 20),
        'ada__n_estimators': np.linspace(1, 100, 10, dtype=np.dtype(np.int16)),
        # 'svc__gamma': np.linspace(0, 50, 20),
    }
    gs = grid_search.GridSearchCV(clf, parameters, verbose=1, refit=False, cv=kFolds)
    gs.fit(X_norm, y)
    return gs.best_params_['ada__n_estimators'], gs.best_score_

def test__is_pytorch_flow(self):
    self.sklearn_dummy_model = pipeline.Pipeline(
        steps=[('imputer', Imputer()),
               ('estimator', tree.DecisionTreeClassifier())])

    self.pytorch_flow = self.extension.model_to_flow(self.pytorch_dummy_model)
    self.pytorch_flow_external_version = self.extension._is_pytorch_flow(
        self.pytorch_flow)

    self.sklearn_flow = SklearnExtension().model_to_flow(self.sklearn_dummy_model)
    self.sklearn_flow_external_version = self.extension._is_pytorch_flow(
        self.sklearn_flow)

    self.assertTrue(self.pytorch_flow_external_version)
    self.assertFalse(self.sklearn_flow_external_version)

def test_cv():
    doctor = strategyGame.strategyGameDoctor()
    Xdata, ydata = getData()
    X = doctor.readXdata(Xdata)
    y = doctor.readydata(ydata)
    # X, y = preprocess.balanceSample(X, y)
    transformer = doctor.getTransformer()
    model = doctor.getModel()
    pipe = pipeline.Pipeline([("transformer", transformer), ("model", model)])
    scores = model_selection.cross_val_score(pipe, X, y, cv=5, scoring="accuracy")
    scoreMean, scoreStd = scores.mean(), scores.std()
    print("\nCross Validation Report")
    print(f"Baseline Score: {scoreMean:.2f} +/- {scoreStd:.2f}")

def build_ensemble_classification_pipeline():
    models = [
        ("KNN", KNeighborsClassifier()),
        ("SVM", SVC()),
        ("RF", RandomForestClassifier()),
        ("bayes", GaussianNB()),
    ]
    final_classifier = LogisticRegression()
    classifier = StackingClassifier(estimators=models, final_estimator=final_classifier)
    return pipeline.Pipeline([
        ("zero_variance_filter", VarianceThreshold(threshold=0)),
        ("Lasso", SelectFromModel(Lasso())),
        ("classifier", classifier),
    ])

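# Illustrative usage sketch (added for clarity, not part of the original source):
# fits the stacking pipeline above on the scikit-learn breast-cancer dataset.
# It assumes build_ensemble_classification_pipeline and its sklearn imports are
# available in this module; the dataset choice and CV settings are arbitrary.
def _demo_ensemble_classification():
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import cross_val_score

    X_demo, y_demo = load_breast_cancer(return_X_y=True)
    clf = build_ensemble_classification_pipeline()
    scores = cross_val_score(clf, X_demo, y_demo, cv=3)
    print("demo accuracy: %.3f +/- %.3f" % (scores.mean(), scores.std()))
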
def __init__(self, datasource, classifier):
    logger.info('Training %s on %s', classifier.__class__.__name__,
                datasource.__class__.__name__)
    # datasource yields (label, text) pairs
    y, X = zip(*datasource)
    self.name = datasource.__class__.__name__ + ':' + classifier.__class__.__name__
    self.pipeline = pipeline.Pipeline([
        ('dictionary', feature_extraction.text.CountVectorizer(tokenizer=self.tokenizer)),
        ('tfidf', feature_extraction.text.TfidfTransformer()),
        ('classifier', classifier),
    ])
    self.pipeline.fit(X, y)

def construct_tar(data, proxies, drop_all=False):
    tar = pipeline.Pipeline([('scaler', preprocessing.StandardScaler()),
                             ('pred', TAR(fit_intercept=True, normalize=False))])
    estimators = {}
    for prox in proxies:
        estimators[f"TAR ({prox})"] = {
            'pipe': deepcopy(tar),
            'drop_cols': proxies if drop_all else [prox],
            'fit_params': {
                'pred__A': data['train'][prox],
                'pred__nu': data['test'][prox]
            }
        }
    return estimators

def func1():
    user = {}
    for line in fileinput.input("../../data/select/select_a"):
        mac = line.strip().split(" ")[0]
        user[mac] = True
    fileinput.close()

    docList, classList = [], []
    for line in fileinput.input(
            "../../data/feature/trace_http_statistic_filter_feature_sex"):
        part = line.strip().split(" ")
        mac, sex, feat = part[0], int(part[1]), part[2:]
        # print len(feat)
        if user.has_key(mac):
            _list = []
            for f in feat:
                _list.append(float(f))
            docList.append(_list)
            classList.append(sex)
    fileinput.close()

    docList, classList = np.array(docList), np.array(classList)
    min_max_scaler = preprocessing.MinMaxScaler()
    docList = min_max_scaler.fit_transform(docList)

    cnt, errorCount = 0, 0
    loo = LeaveOneOut(len(classList))
    trainingdoc, trainingclass = [], []
    for train, test in loo:
        cnt += 1
        print cnt
        trainingdoc, trainingclass, testingdoc, testingclass = (
            docList[train], classList[train], docList[test], classList[test])
        # clf = svm.SVC(kernel='rbf', class_weight='auto')
        clf = pipeline.Pipeline([
            # ('feature_selection', feature_selection.SelectKBest(k=10)),
            ('feature_selection', svm.LinearSVC(penalty="l1", dual=False)),
            # ('feature_selection', linear_model.LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight='auto', random_state=None)),
            ('classification', tree.DecisionTreeClassifier())
            # ('classification', ensemble.RandomForestClassifier())
            # ('classification', SGDClassifier(loss="hinge", penalty="l2"))
            # ('classification', svm.SVC(kernel='linear', class_weight='auto'))
            # ('classification', GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0))
        ])
        clf.fit(trainingdoc, trainingclass)
        for i in range(len(testingdoc)):
            if not testingclass[i] == clf.predict(testingdoc[i])[0]:
                errorCount += 1
    print 'the error rate is: ', float(errorCount) / len(classList)

def train_ann(X_train, targets):
    elm = pipeline.Pipeline([('rhl', RandomLayer(n_hidden=200,
                                                 activation_func='multiquadric',
                                                 alpha=0.69)),
                             ('lr', LinearRegression(fit_intercept=False))])
    #elmr = GenELMRegressor( hidden_layer = rl )
    #elmr = ELMRegressor(n_hidden=98,random_state=0, alpha=0.8)
    elm.fit(X_train, targets)
    tr_mse = mean_squared_error(targets, predict(elm, X_train))
    print("training mse: ", tr_mse)
    # save model
    joblib.dump(elm,
                'models/2018/23_06_18/elm/diel_diamond/elm_XX_diel_diamond.pkl')
    return (elm, tr_mse)

def build_nltk_pipeline(model_params: Dict[str, Any]) -> pipeline.Pipeline:
    """Build the HMM pipeline.

    Parameters
    ----------
    model_params : dict
        Model parameters.

    Returns
    -------
    pipeline.Pipeline
        Built pipeline that acts as the model.
    """
    return pipeline.Pipeline([
        ("feature_extractor", TokenExtractor()),
        ("model", NLTKModel(**model_params)),
    ])

def execute_pipeline(x_data_scaled, y_data, n_pca, n_select_best):
    # make sure the model knows
    # 1. Method of dimensional reduction, with nothing reduced:
    #    if n_components is not set, all components are used
    #    (http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html)
    model_pca = decomposition.PCA()
    # 2. Method of feature selection:
    #    the "all" option bypasses selection, for use in a parameter search
    #    (http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html)
    model_variable_best = feature_selection.SelectKBest(k="all")
    # 3. Machine learning algorithm to use
    machine_learner = linear_model.RANSACRegressor()

    # Package the process
    process = [('pca', model_pca),
               ('select_best', model_variable_best),
               ('ml', machine_learner)]
    # line it up to execute
    pipeline_process = pipeline.Pipeline(process)

    # search space, where all the process keys can be modified
    search_space = dict(pca__n_components=n_pca, select_best__k=n_select_best)

    # number of cross-validation folds to be done
    n_cv = 10
    # tell the model that cross-validation will be done, and pass it the pipeline for machine learning
    model = model_selection.GridSearchCV(pipeline_process,
                                         param_grid=search_space,
                                         cv=n_cv,
                                         n_jobs=-1)
    # run the model
    model.fit(x_data_scaled, y_data)

    # ask for results
    results = model_selection.cross_val_score(model, x_data_scaled, y_data, n_jobs=-1)
    print("Prediction score of the model: %.2f%s (%.5f standard deviation) Fitness"
          % (results.mean() * 100, "%", results.std()))

def train_wdclassifier_user(training_set: Tuple[np.ndarray, np.ndarray],
                            svmType: str,
                            C: float,
                            gamma: Optional[float]) -> pipeline.Pipeline:
    """ Trains an SVM classifier for a user

    Parameters
    ----------
    training_set: Tuple (x, y)
        The training set (features and labels). y should have labels -1 and 1
    svmType: string ('linear' or 'rbf')
        The SVM type
    C: float
        Regularization for the SVM optimization
    gamma: float
        Hyperparameter for the RBF kernel

    Returns
    -------
    pipeline.Pipeline:
        A scaler followed by the learned SVC classifier
    """
    assert svmType in ['linear', 'rbf']

    train_x = training_set[0]
    train_y = training_set[1]

    # Adjust for the skew between positive and negative classes
    n_genuine = len([x for x in train_y if x == 1])
    n_forg = len([x for x in train_y if x == -1])
    skew = n_forg / float(n_genuine)

    # Train the model
    if svmType == 'rbf':
        model = sklearn.svm.SVC(C=C, gamma=gamma, class_weight={1: skew})
    else:
        model = sklearn.svm.SVC(kernel='linear', C=C, class_weight={1: skew})

    model_with_scaler = pipeline.Pipeline([('scaler', preprocessing.StandardScaler(with_mean=False)),
                                           ('classifier', model)])
    model_with_scaler.fit(train_x, train_y)

    return model_with_scaler

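# Illustrative usage sketch (added, not part of the original source): calls
# train_wdclassifier_user on random genuine/forgery feature vectors. Assumes
# numpy is imported as np and the imports used by the function above are in
# place; the feature dimensionality and hyperparameters are arbitrary.
def _demo_train_wdclassifier_user():
    rng = np.random.RandomState(0)
    genuine = rng.normal(loc=1.0, size=(12, 16))
    forgeries = rng.normal(loc=-1.0, size=(48, 16))
    train_x = np.vstack([genuine, forgeries])
    train_y = np.concatenate([np.ones(12), -np.ones(48)])
    clf = train_wdclassifier_user((train_x, train_y), svmType='rbf', C=1.0, gamma=2 ** -4)
    print(clf.predict(train_x[:5]))
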
def prediction_move():
    mac_map, X, y = {}, [], []
    for line in fileinput.input("_jaccount/mac_user_info.txt"):
        # mac, val = line.strip().split("\t")[0], line.strip().split("\t")[3].split(" ")[1]
        # if val in ["男性","女性"]:
        #     mac_map[mac] = 0 if val == "男性" else 1
        mac, val = line.strip().split("\t")[0], line.strip().split("\t")[3].split(" ")[2]
        if val in ["教职员", "研究生", "本科生"]:
            mac_map[mac] = 0 if val == "教职员" else 1 if val == "研究生" else 2
    fileinput.close()

    for line in gzip.open("_feature/move_vector.txt.gz"):
        mac = line.strip().split(" ")[0]
        if mac_map.has_key(mac):
            X.append([int(i) for i in line.strip().split(" ")[1:]])
            y.append(mac_map[mac])

    from sklearn import pipeline
    from sklearn import linear_model
    from sklearn import svm
    from sklearn import tree
    from sklearn import ensemble
    from sklearn.cross_validation import KFold
    from sklearn.cross_validation import LeaveOneOut

    total, right = 0, 0
    X, y = np.array(X), np.array(y)
    loo = LeaveOneOut(len(X))
    print len(y)
    for train, test in loo:
        clf = pipeline.Pipeline([
            ('feature_selection', linear_model.LogisticRegression(penalty='l1')),
            # ('feature_selection', svm.LinearSVC(penalty="l1",dual=False)),
            # ('classification', svm.SVC())
            # ('classification', svm.LinearSVC())
            # ('classification', tree.DecisionTreeClassifier())
            ('classification', ensemble.RandomForestClassifier())
            # ('classification', ensemble.GradientBoostingClassifier())
        ])
        clf = clf.fit(X[train], y[train])
        r = clf.predict(X[test])[0]
        print r, y[test]
        if r == y[test]:
            right += 1
        total += 1
    print float(right) / total

def do(self, n_pts):
    """
    Extract the model using a linear classifier over an approximate feature
    map of an RBF kernel, with n pairs of points on the decision boundary of
    the ATTACKED MODEL.
    :param n_pts:
    :return:
    """
    # Collect n pairs of points on the decision boundary of the oracle
    # WTF ?! We expected the contrary.
    X, y = self.collect_pts(n_pts)
    print 'done collecting points'

    rbf_map = RBFSampler(n_components=n_pts, random_state=1)
    solver = HyperSolver(p=self.POS, n=self.NEG)
    rbf_solver = pipeline.Pipeline([("mapper", rbf_map), ("solver", solver)])

    gamma_range = np.logspace(-15, 6, 22, base=2)
    param_grid = dict(mapper__gamma=gamma_range)
    cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=1)
    grid = GridSearchCV(rbf_solver, param_grid=param_grid, cv=cv, n_jobs=8)
    grid.fit(X, y)

    scores = [x[1] for x in grid.grid_scores_]
    scores = np.array(scores).reshape(len(gamma_range))

    plt.figure(figsize=(8, 6))
    plt.plot(gamma_range, scores)
    plt.xlabel('gamma')
    plt.ylabel('score')
    plt.title('Validation accuracy (RTiX, %s)' % os.path.basename(self.name))
    plt.savefig(self.name + '-SLViF-grid-npts=%d.pdf' % n_pts)

    # final train
    g = grid.best_params_['mapper__gamma']
    print 'best parameters are g=%f' % g

    rbf_svc2 = grid.best_estimator_
    y_pred = rbf_svc2.predict(self.Xt)
    print 'SCORE: %f' % sm.accuracy_score(self.Yt, y_pred)
    return grid.best_score_, sm.accuracy_score(self.Yt, y_pred)

def construct_xtar(data, proxies, drop_all=False):
    xtar = pipeline.Pipeline([('scaler', preprocessing.StandardScaler()),
                              ('pred', XTAR(fit_intercept=True, normalize=False))])
    estimators = {}
    for prox_combo in it.permutations(proxies, 2):
        estimators[f"xTAR ({prox_combo[0]}, {prox_combo[1]})"] = {
            'pipe': deepcopy(xtar),
            'drop_cols': proxies if drop_all else [p for p in prox_combo],
            'fit_params': {
                'pred__W': data['train'][prox_combo[0]],
                'pred__Z': data['train'][prox_combo[1]],
                'pred__nu': data['test'][prox_combo[0]]
            }
        }
    return estimators

def build_ensemble_regression_pipeline(dimensionality_reduction_name):
    models = [
        ("KNN", KNeighborsRegressor()),
        ("SVR", SVR()),
        ("RF", RandomForestRegressor()),
    ]
    regressor = WeightedAverageEnsemble(estimators=models)
    if dimensionality_reduction_name == "lasso":
        dimensionality_reduction = SelectFromModel(Lasso(), threshold="median")
    elif dimensionality_reduction_name == "PCA":
        dimensionality_reduction = PCA(n_components=0.75, svd_solver="full")
    return pipeline.Pipeline([
        ("zero_variance_filter", VarianceThreshold(threshold=0)),
        (dimensionality_reduction_name, dimensionality_reduction),
        ("classifier", regressor),
    ])

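# Illustrative usage sketch (added, not part of the original source): builds
# the regression pipeline with the "PCA" reduction step and scores it on a
# synthetic regression problem. Assumes the sklearn imports above and the
# custom WeightedAverageEnsemble class are available in this module.
def _demo_ensemble_regression():
    from sklearn.datasets import make_regression
    from sklearn.model_selection import cross_val_score

    X_demo, y_demo = make_regression(n_samples=300, n_features=30, noise=10.0, random_state=0)
    reg = build_ensemble_regression_pipeline("PCA")
    scores = cross_val_score(reg, X_demo, y_demo, cv=3, scoring="r2")
    print("demo R^2: %.3f" % scores.mean())
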
def pre_processing(data_sets: tuple, threshold=0.5, impute_strategy="median"):
    # merge multiple data sets
    new_data = np.concatenate(data_sets, axis=1)

    # remove rows and columns that are all zeros
    new_data = new_data[~np.all(new_data == 0, axis=1), :]
    new_data = new_data[:, ~np.all(new_data == 0, axis=0)]

    # remove rows and columns whose fraction of NaNs exceeds the threshold
    # new_data = new_data[np.isnan(new_data).sum(axis=1) < threshold * new_data[:, :-1].shape[1], :]
    # new_data = new_data[:, np.isnan(new_data).sum(axis=0) < threshold * new_data[:, :-1].shape[0]]

    # impute missing values and standard-scale
    data_preprocess = pipeline.Pipeline([
        ('imputer', impute.SimpleImputer(strategy=impute_strategy)),
        ('std_scaler', preprocessing.StandardScaler()),
    ])
    data_processed = data_preprocess.fit_transform(new_data[:, :-1])
    data_processed = np.concatenate((data_processed, new_data[:, [-1]]), axis=1)

    return data_processed

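# Illustrative usage sketch (added, not part of the original source): runs
# pre_processing on a random feature block with injected NaNs plus a label
# column. Assumes numpy (np) and the imports used above are available; the
# shapes and missing-value rate are arbitrary.
def _demo_pre_processing():
    rng = np.random.RandomState(0)
    features = rng.normal(size=(50, 5))
    features[rng.rand(50, 5) < 0.1] = np.nan   # inject ~10% missing values
    labels = rng.randint(0, 2, size=(50, 1)).astype(float)
    processed = pre_processing((features, labels))
    print(processed.shape)   # imputed + scaled features with the label column re-attached
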
def button(self):
    print('x')
    impute_ = impute.SimpleImputer()
    scaler = preprocessing.StandardScaler()
    scaler2 = preprocessing.MinMaxScaler()
    pipe = pipeline.Pipeline([('impute', impute_),
                              ('scaler', scaler),
                              ('scaler2', scaler2)])
    X_data = self.dataframe.values
    new = pipe.fit_transform(X_data)
    print(new)
    print(type(new))
    self.emit_signal_for_table(new)

def __init__(self, model, model_name="unknown model"):
    '''
    Build traditional-method class
    :param model: scikit-learn model
    :param model_name: string
    '''
    self.model = pipeline.Pipeline([
        ('counts', feature_extraction.text.CountVectorizer(min_df=5,
                                                           stop_words="english",
                                                           analyzer="word",
                                                           ngram_range=(1, 2))),
        ('tfidf', feature_extraction.text.TfidfTransformer()),
        (model_name, model),
    ])
    # self.data = data
    self.model_name = model_name

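# Illustrative sketch (added, not part of the original source): the same
# CountVectorizer -> TfidfTransformer -> classifier chain fit directly on a
# tiny toy corpus. min_df and stop_words are dropped here because the corpus
# is tiny, and the classifier (MultinomialNB) is an arbitrary stand-in.
def _demo_text_pipeline():
    from sklearn import feature_extraction, naive_bayes, pipeline

    texts = ["good movie", "great film", "terrible movie", "awful film"]
    labels = [1, 1, 0, 0]
    clf = pipeline.Pipeline([
        ('counts', feature_extraction.text.CountVectorizer(ngram_range=(1, 2))),
        ('tfidf', feature_extraction.text.TfidfTransformer()),
        ('model', naive_bayes.MultinomialNB()),
    ])
    clf.fit(texts, labels)
    print(clf.predict(["great movie"]))
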
def convert_creme_to_sklearn(estimator):
    """Wraps a creme estimator to make it compatible with scikit-learn."""

    if isinstance(estimator, compose.Pipeline):
        return pipeline.Pipeline([(name, convert_creme_to_sklearn(step))
                                  for name, step in estimator.items()])

    wrappers = [
        (base.BinaryClassifier, SKLClassifierWrapper),
        (base.Clusterer, SKLClustererWrapper),
        (base.MultiClassClassifier, SKLClassifierWrapper),
        (base.Regressor, SKLRegressorWrapper),
        (base.Transformer, SKLTransformerWrapper)
    ]

    for base_type, wrapper in wrappers:
        if isinstance(estimator, base_type):
            return wrapper(copy.deepcopy(estimator))

    raise ValueError("Couldn't find an appropriate wrapper")

def train():
    data_directory = 'data_i2r'
    user = '******'
    (train, y_train) = read_edf_data.load_data(data_directory, user, 'DataTraining', True)
    (test, y_test) = read_edf_data.load_data(data_directory, user, 'DataTraining', False)

    pipe = pipeline.Pipeline([('csp', CSP()),
                              ('chan_var', ChanVar()),
                              ('svm', svm.SVC(kernel='linear'))])

    # train model
    pipe.fit(train, y_train)

    # make predictions on unseen test data
    y_pred = pipe.predict(test)
    print(metrics.classification_report(y_test, y_pred))

def train(self, train_instances, train_labels):
    print("n_train_instances: ", len(train_instances))
    print("classifier: ", self.clsf_name)

    t0 = time()
    model = skpipeline.Pipeline([('features', self.feature_pipelines),
                                 ('clsf', self.classifier)])
    print(model.get_params())

    print("Start training\n..")
    model.fit(train_instances, train_labels)
    t1 = time()
    print("Training took ", round(t1 - t0, 2), "sec.")
    return model

class categoryTransformer(sk.base.BaseEstimator, sk.base.TransformerMixin):

    def __init__(self, colname):
        self.colname = colname

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        self.val = X[self.colname]
        return self.val

#--------------------------------------------------------------------
#def build_model():

#load data
f = open('./data/df_data.p', 'r')
df = pickle.load(f)
f.close()

#change data-type of prices and eliminate unreasonable values
df['price'] = df['price'].str[1:].astype(int)
df = df[df['price'] <= 10000]

#split dataframe into predictors and outcomes
y = df['price']
X = df.drop('price', 1)

#build model
fullModel = pipeline.Pipeline([("col_select", categoryTransformer(['sqft', 'br', 'ba'])),
                               ("imp", Imputer(missing_values='NaN', strategy='median', axis=0)),
                               ("lin", LinearRegression())])

#cross validate
param_grid_pipeline = {'lin__fit_intercept': [True, False],
                       'lin__normalize': [True, False]}
grid = GridSearchCV(fullModel, param_grid_pipeline, cv=5, n_jobs=-1,
                    verbose=1, scoring='mean_squared_error')
grid.fit(X, y)

#save model
f = open('./data/model.p', 'w')
pickle.dump(grid, f)
f.close()

def _train(self):
    x = self._train_set.features
    y = self._train_set.outputs

    self._transform = pipeline.Pipeline([
        #('scale', preprocessing.StandardScaler()),
        ('proliferate', preprocessing.PolynomialFeatures(3)),
        ('pselect', feature_selection.SelectPercentile(feature_selection.f_regression, percentile=98)),
        #('kselect', feature_selection.SelectKBest(feature_selection.f_regression, k=750)),
    ])

    clf = linear_model.Ridge(
        alpha=500,
        fit_intercept=True,
    )
    clf.fit(self._transform.fit_transform(x, y), y)
    self._model = clf.predict

def build_pipeline(model_params: Dict[Any, Any]) -> pipeline.Pipeline:
    """Return a pipeline that can be used end-to-end with tokenized data.

    Parameters
    ----------
    model_params : dict
        Parameters that should be used to initialize the model.

    Returns
    -------
    pipeline.Pipeline
        Built pipeline that can be used as a model to fit and predict.
    """
    return pipeline.Pipeline(
        [
            ("feature_extractor", FeatureExtractor()),
            ("model", Model(**model_params)),
        ]
    )

def _detect_topics(model, instances, lang, N=20, stopword=True, stemming=True,
                   remove_numbers=True, deasciify=True, remove_punkt=True,
                   lowercase=True, wordngramrange=(1, 1)):
    ndim = 1
    nmaxfeature = 200

    preprocessor = prep.Preprocessor(lang=lang,
                                     stopword=stopword,
                                     more_stopwords=None,
                                     spellcheck=False,
                                     stemming=stemming,
                                     remove_numbers=remove_numbers,
                                     deasciify=deasciify,
                                     remove_punkt=remove_punkt,
                                     lowercase=lowercase)

    tfidfvect = sktext.TfidfVectorizer(tokenizer=prep.identity,
                                       preprocessor=None,
                                       lowercase=False,
                                       use_idf=True,
                                       ngram_range=wordngramrange,
                                       max_features=nmaxfeature)

    topical_transformer = skpipeline.Pipeline([
        ('txtprep', preprocessor),
        ('tfidf_vect', tfidfvect),
        #('normalizer', skprep.Normalizer()),
        ('scaler', skprep.StandardScaler(with_mean=False)),
        ('nmf', model)
    ])

    topical_transformer.fit(instances)
    return utilstopics.get_topic_words(model, tfidfvect, N)

def get_pipeline(numberestimators, minsamplesleaf):
    classifier = get_estimator(numberestimators, minsamplesleaf)

    feature_columns = metadata.FEATURE_COLUMNS
    numerical_names = metadata.NUMERIC_FEATURES
    categorical_names = metadata.CATEGORICAL_FEATURES

    preprocessor = preprocess_utils.get_preprocess_pipeline(
        feature_columns=feature_columns,
        numerical_names=numerical_names,
        categorical_names=categorical_names)

    estimator = pipeline.Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ])
    return estimator

def get_predictions(X, y, X_test, X_cv):
    # Initialize SVD
    svd = TruncatedSVD()
    scl = StandardScaler()
    clf_model = SVC()
    clf = pipeline.Pipeline([('svd', svd), ('scl', scl), ('clf', clf_model)])

    # Create a parameter grid to search for best parameters for everything in the pipeline
    param_grid = {'svd__n_components': [400], 'clf__C': [12]}

    # Kappa Scorer
    kappa_scorer = metrics.make_scorer(quadratic_weighted_kappa, greater_is_better=True)

    # Initialize Grid Search Model
    model = grid_search.GridSearchCV(estimator=clf, param_grid=param_grid,
                                     scoring=kappa_scorer, verbose=10,
                                     n_jobs=-1, iid=True, refit=True, cv=2)

    # Fit Grid Search Model
    model.fit(X, y)
    print("Best score: %0.3f" % model.best_score_)
    print("Best parameters set:")
    best_parameters = model.best_estimator_.get_params()
    for param_name in sorted(param_grid.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    # Get best model
    best_model = model.best_estimator_

    # Fit model with best parameters optimized for quadratic_weighted_kappa
    best_model.fit(X, y)
    preds = best_model.predict(X_test)
    preds_cv = best_model.predict(X_cv)

    return (preds, preds_cv)