def test_bagging_classifier_with_missing_inputs():
    """Check that BaggingClassifier can accept X with missing/infinite data.

    The bagged estimator is a pipeline whose first step passes the data
    through ``replace`` (defined outside this function) via a
    ``FunctionTransformer``.  A second pipeline without that step is then
    used to verify that a ValueError is raised both by the pipeline itself
    and by the BaggingClassifier wrapping it.
    """
    X = np.array([
        [1, 3, 5],
        [2, None, 6],
        [2, np.nan, 6],
        [2, np.inf, 6],
        # ``np.NINF`` was removed in NumPy 2.0; ``-np.inf`` is the same value
        # and is available on every NumPy version.
        [2, -np.inf, 6],
    ])
    y = np.array([3, 6, 6, 6, 6])

    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(
        FunctionTransformer(replace, validate=False),
        classifier
    )
    pipeline.fit(X, y).predict(X)

    bagging_classifier = BaggingClassifier(pipeline)
    bagging_classifier.fit(X, y)
    y_hat = bagging_classifier.predict(X)
    assert_equal(y.shape, y_hat.shape)
    bagging_classifier.predict_log_proba(X)
    bagging_classifier.predict_proba(X)

    # Verify that exceptions can be raised by wrapper classifier
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(classifier)
    assert_raises(ValueError, pipeline.fit, X, y)
    bagging_classifier = BaggingClassifier(pipeline)
    assert_raises(ValueError, bagging_classifier.fit, X, y)
def preprocess(self, any_set, is_train):
    """Vectorise ``any_set`` with a TF-IDF union over two text fields.

    Parameters
    ----------
    any_set
        Data handed to the pipeline; the pipeline selects the keys
        ``'title'`` and ``'description'`` through ``ColumnSelector``.
    is_train : bool
        When true a new pipeline is built, stored on ``self.pipeline`` and
        fitted on ``any_set``.  When false the previously stored pipeline
        is reused, so a training call must have happened first.

    Returns
    -------
    The output of ``fit_transform`` (training) or ``transform`` (otherwise)
    of the feature union.
    """
    if not is_train:
        return self.pipeline.transform(any_set)

    # Catalogue of token patterns; only "match_word1" is used below.
    # Every backslash is escaped explicitly: the original '\w+|[,.?!;]'
    # relied on an invalid escape sequence, which newer Python versions
    # warn about.  The runtime value of the pattern is unchanged.
    dico_pattern = {
        'match_lowercase_only': '\\b[a-z]+\\b',
        'match_word': '\\w{2,}',
        'match_word1': '(?u)\\b\\w+\\b',
        'match_word_punct': '\\w+|[,.?!;]',
        'match_NNP': '\\b[A-Z][a-z]+\\b|\\b[A-Z]+\\b',
        'match_punct': "[,.?!;'-]",
    }

    def build_vectorizer():
        # Both text fields use exactly the same vectoriser settings.
        return TfidfVectorizer(lowercase=True,
                               stop_words='english',
                               token_pattern=dico_pattern["match_word1"],
                               ngram_range=(1, 2),
                               max_df=1.0,
                               min_df=2,
                               max_features=None,
                               vocabulary=None,
                               binary=True,
                               norm=u'l2',
                               use_idf=True,
                               smooth_idf=True,
                               sublinear_tf=True)

    tfv_title = build_vectorizer()
    tfv_desc = build_vectorizer()
    title_pipe = make_pipeline(ColumnSelector(key='title'), tfv_title)
    desc_pipe = make_pipeline(ColumnSelector(key='description'), tfv_desc)
    self.pipeline = make_union(title_pipe, desc_pipe)
    return self.pipeline.fit_transform(any_set)
def test_pipeline_ducktyping():
    """Check which methods a pipeline exposes depending on its steps."""

    def require(obj, *attr_names):
        # Plain attribute access: raises AttributeError when one is missing.
        for attr_name in attr_names:
            getattr(obj, attr_name)

    pipe = make_pipeline(Mult(5))
    require(pipe, 'predict', 'transform', 'inverse_transform')

    pipe = make_pipeline(Transf())
    assert_false(hasattr(pipe, 'predict'))
    require(pipe, 'transform', 'inverse_transform')

    pipe = make_pipeline(None)
    assert_false(hasattr(pipe, 'predict'))
    require(pipe, 'transform', 'inverse_transform')

    # A step without inverse_transform removes it from the whole pipeline,
    # whatever its position.
    for first, second in ((Transf, NoInvTransf), (NoInvTransf, Transf)):
        pipe = make_pipeline(first(), second())
        assert_false(hasattr(pipe, 'predict'))
        require(pipe, 'transform')
        assert_false(hasattr(pipe, 'inverse_transform'))
def get_pipeline(fsmethods, clfmethod):
    """Return a sklearn Pipeline made of feature-selection steps and a
    final estimator.

    When ``fsmethods`` is a non-empty list its items are joined with
    ``make_union`` and the union becomes the first pipeline step.  A single
    object exposing ``transform`` is used directly as the first step.
    ``None`` or an empty list means "no feature-selection step": the
    pipeline then contains ``clfmethod`` only.  (The original code had a
    branch for that case which could never be reached.)

    Parameters
    ----------
    fsmethods: list of estimators, a single transformer, or None
        All estimators must be transformers (i.e. must have a transform
        method).

    clfmethod: classifier
        The last estimator may be any type (transformer, classifier, etc.).

    Returns
    -------
    pipe

    Raises
    ------
    ValueError
        If ``fsmethods`` is neither None, a list, nor an object with a
        ``transform`` attribute.
    """
    is_list = isinstance(fsmethods, list)

    # No feature-selection stage requested.
    if fsmethods is None or (is_list and not fsmethods):
        return make_pipeline(clfmethod)

    if is_list:
        feat_union = make_union(*fsmethods)
    elif hasattr(fsmethods, 'transform'):
        feat_union = fsmethods
    else:
        raise ValueError('fsmethods expected to be either a list or a transformer method')

    return make_pipeline(feat_union, clfmethod)
def test_pipeline_ducktyping():
    """Check which methods a pipeline exposes depending on its steps."""

    def require(obj, *attr_names):
        # Plain attribute access: raises AttributeError when one is missing.
        for attr_name in attr_names:
            getattr(obj, attr_name)

    pipe = make_pipeline(Mult(5))
    require(pipe, 'predict', 'transform', 'inverse_transform')

    pipe = make_pipeline(Transf())
    assert not hasattr(pipe, 'predict')
    require(pipe, 'transform', 'inverse_transform')

    pipe = make_pipeline('passthrough')
    assert pipe.steps[0] == ('passthrough', 'passthrough')
    assert not hasattr(pipe, 'predict')
    require(pipe, 'transform', 'inverse_transform')

    # A step without inverse_transform removes it from the whole pipeline,
    # whatever its position.
    for first, second in ((Transf, NoInvTransf), (NoInvTransf, Transf)):
        pipe = make_pipeline(first(), second())
        assert not hasattr(pipe, 'predict')
        require(pipe, 'transform')
        assert not hasattr(pipe, 'inverse_transform')
def __init__(self, **config):
    """Build the feature pipeline and the classifier from ``config``.

    Every name listed in ``_configuration_options`` must be present in
    ``config``.  The keys read here are ``sparse_features``,
    ``dense_features``, ``classifier`` and ``classifier_args``.

    Raises
    ------
    ValueError
        If a required option is missing, or if ``config["classifier"]`` is
        not a key of ``_valid_classifiers``.
    """
    # Validate options are present
    for option in _configuration_options:
        if option in config:
            continue
        raise ValueError("Missing configuration option {!r}".format(option))

    # Feature extraction: the sparse features are vectorised and passed
    # through ClassifierAsFeature, then joined with the dense features.
    sparse_vectorizer = Vectorizer(parse_features(config["sparse_features"]),
                                   sparse=True)
    densifier = make_pipeline(sparse_vectorizer, ClassifierAsFeature())
    dense_vectorizer = Vectorizer(parse_features(config["dense_features"]),
                                  sparse=False)
    vectorization = make_union(densifier, dense_vectorizer)

    # Classifier
    algorithm = config["classifier"]
    try:
        classifier_factory = _valid_classifiers[algorithm]
    except KeyError:
        raise ValueError("Unknown classification algorithm {!r}".format(algorithm))

    self.pipeline = make_pipeline(vectorization, StandardScaler())
    self.classifier = classifier_factory(**config["classifier_args"])
def analysis(name, typ, condition=None, query=None, title=None):
    """Wrapper to ensure that we attribute the same function for each type
    of analyses: e.g. categorical, regression, circular regression.

    Parameters
    ----------
    name
        Stored under ``'name'``; also used as ``condition`` when
        ``condition`` is None.
    typ : {'categorize', 'regress', 'circ_regress'}
        Selects the estimator, scorer, chance level and univariate function.
    condition, query, title
        Stored unchanged in the returned dict.

    Returns
    -------
    dict
        Keys: name, condition, query, clf, scorer, chance, erf_function,
        cv, typ, title, single_trial.

    Raises
    ------
    ValueError
        If ``typ`` is not one of the supported analysis types.  The
        original fell through to an ``UnboundLocalError`` on ``clf``.
    """
    # Define univariate analysis
    erf_function = None  # Default is fast_mannwhitneyu

    # /!\ for categorical analyses, the contrast is min(y) - max(y)
    # e.g. target_present==False - target_present==True
    if typ == 'categorize':
        # estimator is normalization + l2 Logistic Regression
        clf = make_pipeline(
            StandardScaler(),
            force_predict(LogisticRegression(class_weight='balanced'),
                          axis=1))
        scorer = scorer_auc
        chance = .5
    elif typ == 'regress':
        # estimator is normalization + l2 Ridge
        clf = make_pipeline(StandardScaler(), Ridge())
        scorer = scorer_spearman
        chance = 0.
    elif typ == 'circ_regress':
        # estimator is normalization + Ridge regression on cos and sin
        clf = make_pipeline(StandardScaler(), PolarRegression(Ridge()))
        scorer = scorer_angle
        chance = 0.
        # The univariate analysis needs a different scorer
        erf_function = scorer_circlin
    else:
        raise ValueError(
            "Unknown analysis type %r; expected 'categorize', 'regress' "
            "or 'circ_regress'" % (typ,))

    if condition is None:
        condition = name

    return dict(name=name, condition=condition, query=query, clf=clf,
                scorer=scorer, chance=chance, erf_function=erf_function,
                cv=8, typ=typ, title=title, single_trial=True)
def main(met_fname, gday_outfname, var):
    """Fit a KNN emulator of a GDAY output column on met data and plot it.

    The first 4000 rows are used to fit a grid-searched
    StandardScaler + KNeighborsRegressor pipeline; predictions are made for
    every met row and rows 4000:4383 are plotted against the GDAY values.

    Parameters
    ----------
    met_fname
        Met forcing file, passed through ``remove_comments_from_header``
        before being read as CSV.
    gday_outfname
        GDAY output CSV file.
    var
        Name of the GDAY output column to emulate.
    """
    # Load met data
    cleaned_met = remove_comments_from_header(met_fname)
    df_met = pd.read_csv(cleaned_met, parse_dates=[[0,1]], skiprows=4,
                         index_col=0, sep=",", keep_date_col=True,
                         date_parser=date_converter)

    # Need to build numpy array, so drop year, doy cols
    met_data = df_met.ix[:,2:].values
    met_data_train = df_met.ix[0:4000,2:].values

    # Load GDAY outputs
    df = pd.read_csv(gday_outfname, skiprows=3, sep=",",
                     skipinitialspace=True)
    df['date'] = make_data_index(df)
    df = df.set_index('date')
    target = df[var][0:4000].values

    # BUILD MODELS
    param_KNR = {"n_neighbors": [20], "weights": ['distance']}
    regmod_p = make_pipeline(StandardScaler(), KNeighborsRegressor())

    # Prefix every grid key with the name make_pipeline gave the final step.
    modlab = regmod_p.steps[-1][0]
    par_grid = {}
    for parkey, pardat in param_KNR.iteritems():
        par_grid['{0}__{1}'.format(modlab, parkey)] = pardat

    emulator = GridSearchCV(regmod_p, param_grid=par_grid, cv=5)
    emulator.fit(met_data_train, target)
    predict = emulator.predict(met_data)

    df = pd.DataFrame({'DT': df.index, 'emu': predict, 'gday': df[var]})

    # Plot only the rows that were not used for fitting.
    plot_dates = df.index[4000:4383]
    plt.plot_date(plot_dates, df['emu'][4000:4383], 'o', label='Emulator')
    plt.plot_date(df.index[4000:4383], df['gday'][4000:4383], 'o',
                  label='GDAY')
    plt.ylabel('GPP (g C m$^{-2}$ s$^{-1}$)')
    plt.legend()
    plt.show()
def cross_validation_LR(X, Y, n_folds, C_seq, K_seq, verbose=False):
    '''
    To classify Y using X, we first use a univariate filter (SelectKBest)
    to choose K dimensions in X, where the difference between different Ys
    is highest, then run a logistic regression classifier with
    regularization parameter C on the K dimensions.

    To quantify how well X can classify Y, without specifying training and
    testing partition, we do n_folds cross validation.
    In each fold, during training, we do an inner loop cross validation to
    select C and K that give the best classification accuracy from a given
    range; and then we use this to classify the held-out testing data.

    Inputs:
        X, [n, p], n trials of p dimensional data, used for classification
        Y, [n], class labels
        n_folds, integer, split the data into n_folds for cross validation
        C_seq, a sequence of regularization parameters for logistic
               regression classifiers, smaller values specify stronger
               regularization.
               e.g. C_seq = 10.0** np.arange(-3,1,1)
        K_seq, a sequence of integers,
               e.g. K_seq = (np.floor(np.arange(0.2,1,0.2)*p)).astype(np.int)
        verbose: boolean, if true, print the best C and K chosen
    Output:
        averaged classification accuracy of the n_folds
    '''
    cv0 = StratifiedKFold(Y, n_folds=n_folds)
    cv_acc = np.zeros(n_folds)
    for i in range(n_folds):
        ind_test = cv0.test_folds == i
        ind_train = cv0.test_folds != i
        tmpX_train = X[ind_train, :]
        tmpY_train = Y[ind_train]
        tmpX_test = X[ind_test, :]
        tmpY_test = Y[ind_test]

        # grid search over (C, K) using an inner cross validation
        tmp_cv_score = np.zeros([len(C_seq), len(K_seq)])
        for j in range(len(C_seq)):
            for k in range(len(K_seq)):
                cv1 = StratifiedKFold(tmpY_train, n_folds=n_folds)
                # NOTE(review): f_regression is a regression score
                # function; confirm it is intended here rather than
                # f_classif for class labels.
                anova_filter = SelectKBest(f_regression, k=K_seq[k])
                clf = LogisticRegression(C=C_seq[j], penalty="l2")
                anova_clf = make_pipeline(anova_filter, clf)
                tmp_cv_score[j, k] = cross_val_score(
                    anova_clf, tmpX_train, tmpY_train,
                    scoring="accuracy", cv=cv1).mean()

        best_ind = np.argmax(tmp_cv_score.ravel())
        best_j, best_k = np.unravel_index(best_ind, tmp_cv_score.shape)
        best_C = C_seq[best_j]
        best_K = K_seq[best_k]

        # Refit on the whole training fold with the *selected* parameters.
        # The original reused the loop variables j and k here, i.e. always
        # the last grid point, so the grid search had no effect.
        anova_filter = SelectKBest(f_regression, k=best_K)
        clf = LogisticRegression(C=best_C, penalty="l2")
        anova_clf = make_pipeline(anova_filter, clf)
        tmpY_predict = anova_clf.fit(tmpX_train, tmpY_train).predict(tmpX_test)
        if verbose:
            # Same text as ``print C, K`` and valid on Python 2 and 3.
            print("%s %s" % (best_C, best_K))
        cv_acc[i] = np.mean(tmpY_test == tmpY_predict)
    return np.mean(cv_acc)
def fit(self, X, y):
    """Fit one model per subject, then a final model on their union.

    The last column of ``X`` is read as the subject identifier (the
    "filthy hack" of the original comment); the remaining columns are the
    features.  One ``LogisticRegressionCV`` pipeline is fitted per split
    produced by ``subject_splitter``; the fitted pipelines are wrapped in
    ``FeatureUnionWrapper`` and joined with ``make_union`` in front of a
    final ``LogisticRegressionCV``.  The result is stored in ``self.clf_``.

    Returns
    -------
    self
    """
    # Filthy hack
    sids = X[:, -1]
    features = X[:, :-1]

    all_pipelines = []
    for X_s, y_s in subject_splitter(features, y, sids):
        all_pipelines.append(make_pipeline(LogisticRegressionCV()).fit(X_s, y_s))

    wrapped = [FeatureUnionWrapper(p) for p in all_pipelines]
    f_union = make_union(*wrapped)
    stacked = make_pipeline(f_union, LogisticRegressionCV())
    self.clf_ = stacked.fit(X[:, :-1], y)
    return self
def test_generator_ok(self):
    """FakeGenerator yields a (20, 3) result, reproducible for a fixed seed."""

    def build_pipeline():
        return make_pipeline(FakeGenerator(fakes=['job', 'name', 'address'],
                                           nb_sample=20,
                                           random_state=40))

    result = build_pipeline().fit_transform(None)
    self.assertEqual(result.shape, (20, 3))

    # Testing the seed
    result_2 = build_pipeline().fit_transform(None)
    assert_frame_equal(result, result_2)
def test_make_pipeline_memory():
    """make_pipeline stores the ``memory`` argument and defaults it to None."""
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir=cachedir)
        pipeline = make_pipeline(DummyTransf(), SVC(), memory=memory)
        assert_true(pipeline.memory is memory)
        pipeline = make_pipeline(DummyTransf(), SVC())
        assert_true(pipeline.memory is None)
    finally:
        # Remove the temporary cache directory even when an assertion
        # fails; the original leaked it in that case.
        shutil.rmtree(cachedir)
def test_generator_ok(self):
    """TestGenerator yields a (100, 2) result, reproducible for a fixed seed."""

    def build_pipeline():
        generator = TestGenerator(nb_sample=100, random_state=40,
                                  num_sample=(1, 3),
                                  categ_sample=['green', 'blue'])
        return make_pipeline(generator)

    result = build_pipeline().fit_transform(None)
    self.assertEqual(result.shape, (100, 2))
    numbers = result['number']
    self.assertEqual(numbers.min(), 1)
    self.assertEqual(result['number'].max(), 2)

    # Testing the seed
    result_2 = build_pipeline().fit_transform(None)
    assert_frame_equal(result, result_2)
def test_classes_property():
    """``classes_`` is absent for regressors and unfitted classifiers."""
    iris = load_iris()
    X, y = iris.data, iris.target

    # A fitted regression pipeline never exposes classes_.
    reg = make_pipeline(SelectKBest(k=1), LinearRegression())
    reg.fit(X, y)
    assert_raises(AttributeError, getattr, reg, "classes_")

    # A classification pipeline exposes it only once it has been fitted.
    clf = make_pipeline(SelectKBest(k=1),
                        LogisticRegression(random_state=0))
    assert_raises(AttributeError, getattr, clf, "classes_")
    clf.fit(X, y)
    assert_array_equal(clf.classes_, np.unique(y))
def test_make_pipeline():
    """make_pipeline returns a Pipeline with auto-generated step names."""

    def check(pipe, expected_names):
        assert_true(isinstance(pipe, Pipeline))
        for position, expected in enumerate(expected_names):
            assert_equal(pipe.steps[position][0], expected)

    t1, t2 = Transf(), Transf()

    # Two steps of the same type get numbered names.
    check(make_pipeline(t1, t2), ("transf-1", "transf-2"))

    # A step with a unique type keeps its plain lower-cased class name.
    check(make_pipeline(t1, t2, FitParamT()),
          ("transf-1", "transf-2", "fitparamt"))
def test_make_pipeline_memory():
    """make_pipeline stores the ``memory`` argument and defaults it to None."""
    cachedir = mkdtemp()
    try:
        if LooseVersion(joblib_version) < LooseVersion('0.12'):
            # Deal with change of API in joblib
            memory = Memory(cachedir=cachedir, verbose=10)
        else:
            memory = Memory(location=cachedir, verbose=10)
        pipeline = make_pipeline(DummyTransf(), SVC(), memory=memory)
        assert_true(pipeline.memory is memory)
        pipeline = make_pipeline(DummyTransf(), SVC())
        assert_true(pipeline.memory is None)
    finally:
        # Remove the temporary cache directory even when an assertion
        # fails; the original leaked it in that case.
        shutil.rmtree(cachedir)
def __init__(self):
    """Create four lists of five unfitted regressors each.

    ``clf1``: Imputer + GradientBoostingRegressor pipelines.
    ``clf2``: median Imputer + ExtraTreesRegressor pipelines.
    ``clf3``: Imputer + LinearSVR pipelines.
    ``clf``: plain LinearRegression models.
    """
    n_models = 5

    def boosted():
        return make_pipeline(
            Imputer(),
            GradientBoostingRegressor(n_estimators=5000, max_depth=8))

    def extra_trees():
        return make_pipeline(
            Imputer(strategy='median'),
            ExtraTreesRegressor(n_estimators=5000,
                                criterion='mse',
                                max_depth=8,
                                min_samples_split=10,
                                min_samples_leaf=1,
                                min_weight_fraction_leaf=0.0,
                                max_features='auto',
                                max_leaf_nodes=None,
                                bootstrap=False,
                                oob_score=False,
                                n_jobs=1,
                                random_state=42,
                                verbose=0,
                                warm_start=True))

    def linear_svr():
        return make_pipeline(Imputer(), svm.LinearSVR())

    self.clf1 = [boosted() for _ in range(n_models)]
    self.clf2 = [extra_trees() for _ in range(n_models)]
    self.clf3 = [linear_svr() for _ in range(n_models)]
    self.clf = [linear_model.LinearRegression() for _ in range(n_models)]
def get_results(dataset):
    """Compare regression scores under several missing-value strategies.

    Four cross-validated ``neg_mean_squared_error`` scores are computed
    with a RandomForestRegressor: on the full data, after setting the
    "missing" entries to 0, after mean imputation, and after chained
    imputation (the last two joined with a MissingIndicator).

    Parameters
    ----------
    dataset
        Object exposing ``data`` and ``target`` attributes.

    Returns
    -------
    tuple of four ``(mean, std)`` pairs, in the order
    full, zero-imputed, mean-imputed, chained-imputed.

    Notes
    -----
    Uses the module-level random generator ``rng``.
    """
    X_full, y_full = dataset.data, dataset.target
    n_samples = X_full.shape[0]
    n_features = X_full.shape[1]

    # Estimate the score on the entire dataset, with no missing values
    estimator = RandomForestRegressor(random_state=0, n_estimators=100)
    full_scores = cross_val_score(estimator, X_full, y_full,
                                  scoring='neg_mean_squared_error')

    # Add missing values in 75% of the lines
    missing_rate = 0.75
    n_missing_samples = int(np.floor(n_samples * missing_rate))
    # Builtin ``bool`` instead of the ``np.bool`` alias, which was
    # deprecated in NumPy 1.20 and removed in 1.24.
    missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                          dtype=bool),
                                 np.ones(n_missing_samples,
                                         dtype=bool)))
    rng.shuffle(missing_samples)
    missing_features = rng.randint(0, n_features, n_missing_samples)

    # Estimate the score after replacing missing values by 0
    X_missing = X_full.copy()
    X_missing[np.where(missing_samples)[0], missing_features] = 0
    y_missing = y_full.copy()
    estimator = RandomForestRegressor(random_state=0, n_estimators=100)
    zero_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                         scoring='neg_mean_squared_error')

    # Estimate the score after imputation (mean strategy) of the missing
    # values; 0 is the marker for "missing" in X_missing.
    X_missing = X_full.copy()
    X_missing[np.where(missing_samples)[0], missing_features] = 0
    y_missing = y_full.copy()
    estimator = make_pipeline(
        make_union(SimpleImputer(missing_values=0, strategy="mean"),
                   MissingIndicator(missing_values=0)),
        RandomForestRegressor(random_state=0, n_estimators=100))
    mean_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                         scoring='neg_mean_squared_error')

    # Estimate the score after chained imputation of the missing values
    estimator = make_pipeline(
        make_union(ChainedImputer(missing_values=0, random_state=0),
                   MissingIndicator(missing_values=0)),
        RandomForestRegressor(random_state=0, n_estimators=100))
    chained_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                            scoring='neg_mean_squared_error')

    return ((full_scores.mean(), full_scores.std()),
            (zero_impute_scores.mean(), zero_impute_scores.std()),
            (mean_impute_scores.mean(), mean_impute_scores.std()),
            (chained_impute_scores.mean(), chained_impute_scores.std()))
def build_text_extraction(binary, min_df, ngram, stopwords, useTfIdf):
    """Return a vectoriser + ClassifierOvOAsFeatures pipeline for raw text.

    With ``useTfIdf`` a TfidfVectorizer with a fixed (1, 3) n-gram range is
    used and ``binary``, ``ngram`` and ``stopwords`` are ignored.
    Otherwise a whitespace-tokenising CountVectorizer is built from all
    the arguments.
    """
    if useTfIdf:
        vectorizer = TfidfVectorizer(min_df=min_df,
                                     max_df = 0.8,
                                     sublinear_tf=True,
                                     use_idf=True,
                                     ngram_range=(1,3))
    else:
        vectorizer = CountVectorizer(binary=binary,
                                     tokenizer=lambda x: x.split(),
                                     min_df=min_df,
                                     ngram_range=(1, ngram),
                                     stop_words=stopwords)
    return make_pipeline(vectorizer, ClassifierOvOAsFeatures())
def out_fold_pred(params, X, y_array, y_ix, reps):
    """Return out-of-fold positive-class probabilities for one target column.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``LogisticRegression``.
    X
        Feature matrix, indexed by row with the fold indices.
    y_array
        2-D array of targets; column ``y_ix`` is the one being predicted.
    y_ix
        Index of the target column.
    reps
        Forwarded to ``makeKFold`` together with the fold count (5).

    Returns
    -------
    numpy.ndarray
        One entry per row of ``y_array``; each entry is column 1 of
        ``predict_proba`` from the model whose test fold contained it.
        Rows never placed in a test fold stay 0.
    """
    y = y_array[:, y_ix]
    preds = np.zeros((y_array.shape[0]))

    # cross validation here: a fresh pipeline is built for every fold.
    # The extra pipeline the original created before the loop was never
    # used, so it has been removed.
    for train_ix, test_ix in makeKFold(5, y, reps):
        X_train, y_train = X[train_ix, :], y[train_ix]
        X_test = X[test_ix, :]
        clf = make_pipeline(StandardScaler(), LogisticRegression(**params))
        clf.fit(X_train, y_train)
        preds[test_ix] = clf.predict_proba(X_test)[:, 1]
    return preds
def build_synset_extraction(binary, min_df, ngram, useTfIdf):
    """Return a MapToSynsets + vectoriser + ClassifierOvOAsFeatures pipeline.

    With ``useTfIdf`` a TfidfVectorizer with a fixed (1, 3) n-gram range is
    used and ``binary`` and ``ngram`` are ignored.  Otherwise a
    whitespace-tokenising CountVectorizer is built from the arguments.
    """
    mapper = MapToSynsets()
    if useTfIdf:
        vectorizer = TfidfVectorizer(min_df=min_df,
                                     max_df = 0.8,
                                     sublinear_tf=True,
                                     use_idf=True,
                                     ngram_range=(1,3))
    else:
        vectorizer = CountVectorizer(binary=binary,
                                     tokenizer=lambda x: x.split(),
                                     min_df=min_df,
                                     ngram_range=(1, ngram))
    return make_pipeline(mapper, vectorizer, ClassifierOvOAsFeatures())
def check_pipeline_consistency(name, Estimator):
    """Check that make_pipeline(est) gives the same results as est.

    ``score`` and ``fit_transform`` are compared, whenever the estimator
    has them, between the bare estimator and a one-step pipeline around it.

    Raises
    ------
    SkipTest
        For CCA, LocallyLinearEmbedding and KernelPCA when ``_is_32bit()``
        is true.
    """
    unstable_on_32bit = ('CCA', 'LocallyLinearEmbedding', 'KernelPCA')
    if name in unstable_on_32bit and _is_32bit():
        # Those transformers yield non-deterministic output when executed on
        # a 32bit Python. The same transformers are stable on 64bit Python.
        # FIXME: try to isolate a minimalistic reproduction case only depending
        # scipy and/or maybe generate a test dataset that does not
        # cause such unstable behaviors.
        raise SkipTest(name + ' is non deterministic on 32bit Python')

    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X -= X.min()
    y = multioutput_estimator_convert_y_2d(name, y)

    estimator = Estimator()
    set_fast_parameters(estimator)
    set_random_state(estimator)
    pipeline = make_pipeline(estimator)
    estimator.fit(X, y)
    pipeline.fit(X, y)

    for func_name in ["score", "fit_transform"]:
        func = getattr(estimator, func_name, None)
        if func is None:
            # The estimator does not offer this method: nothing to compare.
            continue
        func_pipeline = getattr(pipeline, func_name)
        result = func(X, y)
        result_pipe = func_pipeline(X, y)
        assert_array_almost_equal(result, result_pipe)
def test_estimators_samples_deterministic():
    """Refitting a bagged pipeline on its stored sample reproduces its fit."""
    # This test is a regression test to check that with a random step
    # (e.g. SparseRandomProjection) and a given random state, the results
    # generated at fit time can be identically reproduced at a later time using
    # data saved in object attributes. Check issue #9524 for full discussion.
    iris = load_iris()
    X, y = iris.data, iris.target

    base_pipeline = make_pipeline(SparseRandomProjection(n_components=2),
                                  LogisticRegression())
    clf = BaggingClassifier(base_estimator=base_pipeline,
                            max_samples=0.5,
                            random_state=0)
    clf.fit(X, y)

    estimator = clf.estimators_[0]
    # Copy the coefficients before the estimator is refitted below.
    pipeline_estimator_coef = estimator.steps[-1][1].coef_.copy()

    # Rebuild the training data from the attributes saved at fit time.
    row_index = clf.estimators_samples_[0]
    column_index = clf.estimators_features_[0]
    X_train = (X[row_index])[:, column_index]
    y_train = y[row_index]

    estimator.fit(X_train, y_train)
    assert_array_equal(estimator.steps[-1][1].coef_, pipeline_estimator_coef)
def scipy_algo(dataset, abstract=False):
    """Cluster the documents hierarchically and return the average f-score.

    The TF-IDF matrix from ``DocumentsProcessor.get_data`` is densified and
    clustered with average linkage; every non-leaf node of the resulting
    tree is treated as a cluster and scored against ``f_score_dict``.

    An LSA pipeline is constructed but not applied (the transform call is
    disabled in the original code).
    """
    doc_proc = dp.DocumentsProcessor(dataset)
    tfidf_matrix, f_score_dict = doc_proc.get_data(abstract)
    n_documents = tfidf_matrix.shape[0]

    svd = TruncatedSVD(n_documents)
    lsa = make_pipeline(svd, Normalizer(copy=False))

    #tfidf_matrix = lsa.fit_transform(tfidf_matrix)

    print('starting clustering after lsa: found %s document and %s features'
          % (tfidf_matrix.shape[0], tfidf_matrix.shape[1]))

    linkage_matrix = hr.average(tfidf_matrix.toarray())
    #linkage_matrix = hr.average(tfidf_matrix)

    t = hr.to_tree(linkage_matrix, rd=True)

    # t[1] is the node list returned because of rd=True.
    clusters = {}
    for node in t[1]:
        if node.is_leaf():
            continue
        l = []
        clusters[node.get_id()] = collect_leaf_nodes(node, l)

    f = f_score(clusters, f_score_dict)
    print_f_score_dict(f)

    avg_f_score = average_f_score(f, tfidf_matrix.shape[0])
    print('average f_score: %s' % avg_f_score)
    return avg_f_score
def train(param_search=False):
    """Train a TF-IDF + logistic regression classifier on the review data.

    Parameters
    ----------
    param_search : bool
        When true, run a 5-fold grid search over the n-gram range and the
        regularisation strength ``C`` and return the refitted best
        estimator.  When false, fit the default pipeline directly.

    Returns
    -------
    The fitted pipeline.
    """
    data = load_files(download())
    y = [data.target_names[t] for t in data.target]

    # The random state on the LR estimator is fixed to the most arbitrary value
    # that I could come up with. It is biased toward the middle number keys on
    # my keyboard.
    clf = make_pipeline(TfidfVectorizer(min_df=2, dtype=float,
                                        sublinear_tf=True,
                                        ngram_range=(1, 2),
                                        strip_accents='unicode'),
                        LogisticRegression(random_state=623, C=5000))

    if not param_search:
        print("Training logistic regression for movie review polarity")
        return clf.fit(data.data, y)

    # make_pipeline names each step after its lower-cased class name, so
    # the grid keys must use 'tfidfvectorizer' and 'logisticregression'.
    # The original 'tfidf__...' / 'lr__...' keys name steps that do not
    # exist and are rejected by GridSearchCV as invalid parameters.
    params = {'tfidfvectorizer__ngram_range': [(1, 1), (1, 2)],
              'logisticregression__C': [1000, 5000, 10000]}

    print("Starting parameter search for review sentiment classification")
    # We ignore the original folds in the data, preferring a simple 5-fold
    # CV instead; this is intended to get a working model, not results for
    # publication.
    gs = GridSearchCV(clf, params, cv=5, refit=True, n_jobs=-1, verbose=2)
    gs.fit(data.data, y)

    print("Parameters found:")
    pprint(gs.best_params_)
    print("Cross-validation accuracy: %.3f" % gs.best_score_)

    return gs.best_estimator_
def cluster_dandelion_2(dataset, gamma=0.91, filter=False):
    """Cluster the Dandelion-enriched documents and return the linkage matrix.

    Duplicate of another routine; it exists only so that the
    ``linkage_matrix`` can be returned.

    The TF-IDF matrix is reduced with LSA (TruncatedSVD + Normalizer),
    clustered with average linkage, and the f-scores of the resulting
    clusters are written into ``params`` under ``'avg_f_score'`` and
    ``'all_fscore'``.

    Parameters
    ----------
    dataset
        Passed to ``DocumentsProcessor``.
    gamma
        When truthy, forwarded with ``filter`` to
        ``get_data_with_dandelion``; otherwise that method is called with
        its own defaults.
    filter
        Forwarded only when ``gamma`` is truthy.
    """
    doc_proc = dp.DocumentsProcessor(dataset)

    if gamma:
        dandelion_data = doc_proc.get_data_with_dandelion(
            gamma=gamma, filter=filter)
    else:
        dandelion_data = doc_proc.get_data_with_dandelion()
    tfidf_matrix, f_score_dict, params = dandelion_data

    svd = TruncatedSVD(tfidf_matrix.shape[0])
    lsa = make_pipeline(svd, Normalizer(copy=False))
    tfidf_matrix = lsa.fit_transform(tfidf_matrix)

    #linkage_matrix = hr.average(tfidf_matrix.toarray())
    linkage_matrix = hr.average(tfidf_matrix)

    t = hr.to_tree(linkage_matrix, rd=True)

    # t[1] is the node list returned because of rd=True.
    clusters = {}
    for node in t[1]:
        if node.is_leaf():
            continue
        l = []
        clusters[node.get_id()] = collect_leaf_nodes(node, l)

    f = f_score(clusters, f_score_dict)
    l = print_f_score_dict(f)

    params['avg_f_score'] = average_f_score(f, tfidf_matrix.shape[0])
    params['all_fscore'] = l
    return linkage_matrix
def test_bagging_with_pipeline():
    """A bagged pipeline's final step ends up with an integer random_state."""
    base_pipeline = make_pipeline(SelectKBest(k=1), DecisionTreeClassifier())
    estimator = BaggingClassifier(base_pipeline, max_features=2)
    estimator.fit(iris.data, iris.target)

    # Last step of the first fitted sub-estimator.
    final_step = estimator[0].steps[-1][1]
    assert_true(isinstance(final_step.random_state, int))
def main():
    """Run the LSA + K-Means corpus indexing pipeline with on-disk caching.

    Every stage first checks whether its output file (paths taken from the
    module-level ``args``) already exists and loads it instead of
    recomputing:

    1. TF-IDF + TruncatedSVD + Normalizer projection of the corpus.
    2. De-duplication of identical rows and K-Means labelling of the
       unique rows.
    3. One distance matrix per label.
    4. The corpus index, written as JSON.
    """
    if os.path.exists(args.out_svd_result_matrix):
        print("Loading SVD matrix from file")
        X = np.load(args.out_svd_result_matrix)
        print("Loading corpus")
        _, file_index = LoadCorpus(args.training_dir)
    else:
        print("Loading corpus")
        corpus, file_index = LoadCorpus(args.training_dir)
        print("Building TF-IDF")
        tf_idf = TfidfVectorizer(input="content", lowercase=False)
        X = tf_idf.fit_transform(corpus)
        # The raw corpus is no longer needed once vectorised.
        del corpus
        print("Running LSA")
        svd = TruncatedSVD(args.dimentionality)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X)
        print("Saving SVD results")
        np.save(args.out_svd_result_matrix, X)

    if (
        os.path.exists(args.out_inv_idx)
        and os.path.exists(args.out_unique_kmeans_labels)
        and os.path.exists(args.out_idx)
    ):
        print("Loading labels")
        unique_labels = np.load(args.out_unique_kmeans_labels)
        inv_idx = np.load(args.out_inv_idx)
        idx = np.load(args.out_idx)
        unique_X = X[idx]
    else:
        print("Unique matrix")
        # View each row as one opaque item so np.unique compares whole rows.
        b = np.ascontiguousarray(X).view(
            np.dtype((np.void, X.dtype.itemsize * X.shape[1])))
        _, idx, inv_idx = np.unique(b, return_index=True,
                                    return_inverse=True)
        print("Saving inv_idx")
        np.save(args.out_inv_idx, inv_idx)
        print("Saving idx")
        np.save(args.out_idx, idx)
        unique_X = X[idx]
        print("Running K-Means")
        unique_labels, _ = KMeans(unique_X)
        print("Save unique K-Means labels")
        np.save(args.out_unique_kmeans_labels, unique_labels)

    print("Re-label non-unique")
    labels = unique_labels[inv_idx]

    for l in range(unique_labels.max() + 1):
        out_filename = args.out_unique_distance_matrix_prefix + str(l) + ".npy"
        if os.path.exists(out_filename):
            continue
        print("Calculating distance matrix for label:", l)
        D = CalcDistances(unique_labels, l, unique_X)
        print("Saving to distance matrix to file")
        np.save(out_filename, D)

    if not os.path.exists(args.out_corpus_index):
        print("Calculating corpus index")
        corpus_index = GetCorpusIndex(file_index, labels, unique_labels,
                                      inv_idx)
        print("Saving corpus index")
        # Context manager so the file is flushed and closed
        # deterministically; the original never closed the handle.
        with open(args.out_corpus_index, "w") as index_file:
            json.dump(corpus_index, index_file)
def get_input_vector(fields, vec_name, data):
    """Transform the input and create a 2D vector to cluster.

    The transformer from ``create_input_transformer(fields, vec_name)`` is
    chained with a default ``TruncatedSVD`` and the pipeline is fitted on,
    and applied to, ``data``.
    """
    input_transformer = create_input_transformer(fields, vec_name)
    reducer = TruncatedSVD()
    pipeline = make_pipeline(input_transformer, reducer)
    log_info('Transformation pipeline complete.')
    reduced = pipeline.fit_transform(data)
    return reduced
def build_lex_extraction(binary, min_df, ngram):
    """Return an InquirerLexTransform + CountVectorizer + Densifier pipeline.

    The CountVectorizer tokenises on whitespace and counts n-grams from 1
    up to ``ngram``; ``binary`` and ``min_df`` are forwarded to it.
    """
    lexicon_step = InquirerLexTransform()
    count_step = CountVectorizer(binary=binary,
                                 tokenizer=lambda x: x.split(),
                                 min_df=min_df,
                                 ngram_range=(1, ngram))
    return make_pipeline(lexicon_step, count_step, Densifier())
# Finally We want to find Data Series of late X and early X (train) for model generation and evaluation X_lately = X[-forecast_out:] X_train = X[:-forecast_out] # Separate label and identify it as y y = np.array(dfreg['label']) y_train = y[:-forecast_out] #Model Generation # Linear regression clfreg = LinearRegression(n_jobs=-1) clfreg.fit(X_train, y_train) # Quadratic Regression 2 clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge()) clfpoly2.fit(X_train, y_train) # Quadratic Regression 3 clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge()) clfpoly3.fit(X_train, y_train) # KNN Regression clfknn = KNeighborsRegressor(n_neighbors=2) clfknn.fit(X_train, y_train) #Evaluation #confidencereg = clfreg.score(X_test, y_test) #confidencepoly2 = clfpoly2.score(X_test,y_test) #confidencepoly3 = clfpoly3.score(X_test,y_test) #confidenceknn = clfknn.score(X_test, y_test)
def create_pipeline(search_space):
    """Return a SimpleImputer + liblinear LogisticRegression pipeline.

    ``search_space`` must contain the keys ``'simpleimputer'`` and
    ``'logisticregression'``; each maps to the keyword arguments for the
    corresponding step.
    """
    imputer = SimpleImputer(**search_space['simpleimputer'])
    classifier = LogisticRegression(solver='liblinear',
                                    **search_space['logisticregression'])
    return make_pipeline(imputer, classifier)
# epochs = epochs.apply_baseline(baseline=(-0.2, 0)) ###################### # MVPA # Init Cross-validation instance logo = LeaveOneGroupOut() # Init Xdawn, classifier and time_decoder xdawn = mne.preprocessing.Xdawn(n_components=6, reg='diagonal_fixed') # Init classifier class_weight = 'balanced' # None or 'balanced' lr = LogisticRegression(solver='lbfgs', class_weight=class_weight) svm = svm.SVC(gamma='scale', kernel='rbf', class_weight=class_weight) # Init decoder clf = make_pipeline(StandardScaler(), svm) # Raw 3-D data decoder raw_decoder = make_pipeline(mne.decoding.Vectorizer(), StandardScaler(), clf) # Time resolution data decoder time_decoder = SlidingEstimator(clf, n_jobs=n_jobs, scoring='f1', verbose=1) # Init time information times = epochs.times n_times = len(times) y_true = epochs.events[:, 2] n_samples = len(y_true) # Time window information sfreq = epochs.info['sfreq'] window_info = {} y_pred_timewindow = {} for window_length in [0.1, 0.2, 0.3, 0.4]:
def test_in_pipeline():
    """LogisticRegression can be fitted as the last step of a sklearn pipeline."""
    from sklearn.pipeline import make_pipeline

    features, target = make_classification(n_samples=100, n_features=5,
                                           chunksize=10)
    steps = (DoNothingTransformer(), LogisticRegression())
    pipe = make_pipeline(*steps)
    pipe.fit(features, target)
def train_classifier(input_features, test_features, classifier_type,
                     classifier_output, feat_names):
    """Train a classifier, save it, and histogram its test probabilities.

    Parameters
    ----------
    input_features, test_features
        Iterables of tab-separated lines; every field but the last is
        parsed as a float feature and the last as an integer label.
    classifier_type : str
        One of "svm", "mlp", "extra_trees", "nn", "nn1", "adaboost",
        "random_forest".  Any other value logs an error and exits with
        status 1.  "extra_trees" and "random_forest" are wrapped in a
        grid search.
    classifier_output
        Destination handed to ``joblib.dump``.
    feat_names
        Feature names, paired with the feature importances for logging.

    Returns
    -------
    (list, list)
        Ten-bin histogram counts of the second ``predict_proba`` column
        for test rows labelled 1, and for all other test rows.
    """

    def read_rows(lines):
        # Parse "f1<TAB>f2<TAB>...<TAB>label" lines into features / labels.
        vectors = []
        targets = []
        for row in lines:
            cols = row.rstrip("\n").split("\t")
            vectors.append([float(v) for v in cols[:-1]])
            targets.append(int(cols[-1]))
        return vectors, targets

    # Load features and labels and format them as numpy array
    feats, labels = read_rows(input_features)
    dataset = dict()
    dataset['data'] = np.array(feats)
    dataset['target'] = np.array(labels)

    # Grid shared by the two forest classifiers; stays None otherwise.
    parameters = None
    forest_grid = {
        'criterion': ('gini', 'entropy'),
        'n_estimators': [100, 200, 300, 400, 500],
    }
    # Both forest classifiers are created with the same settings.
    forest_kwargs = dict(bootstrap=True,
                         class_weight=None,
                         criterion='gini',
                         max_depth=None,
                         max_features='auto',
                         max_leaf_nodes=None,
                         min_impurity_decrease=0.0,
                         min_impurity_split=None,
                         min_samples_leaf=1,
                         min_samples_split=2,
                         min_weight_fraction_leaf=0.0,
                         n_estimators=200,
                         n_jobs=1,
                         oob_score=False,
                         random_state=0,
                         verbose=0,
                         warm_start=False)

    # Train classifier
    if classifier_type == "svm":
        clf = make_pipeline(MinMaxScaler(),
                            svm.SVC(gamma=0.001, C=100., probability=True))
    elif classifier_type == "mlp":
        clf = MLPClassifier(verbose=True,
                            solver='adam',
                            alpha=1e-5,
                            hidden_layer_sizes=(100, ),
                            random_state=1,
                            shuffle=True,
                            early_stopping=True,
                            validation_fraction=0.1)
    elif classifier_type == "extra_trees":
        parameters = forest_grid
        clf = ExtraTreesClassifier(**forest_kwargs)
    elif classifier_type == "nn":
        clf = neighbors.KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
    elif classifier_type == "nn1":
        clf = neighbors.KNeighborsClassifier(n_neighbors=1, n_jobs=-1)
    elif classifier_type == "adaboost":
        clf = AdaBoostClassifier(n_estimators=100)
    elif classifier_type == "random_forest":
        parameters = forest_grid
        clf = RandomForestClassifier(**forest_kwargs)
    else:
        logging.error("Unknown classifier: " + classifier_type)
        sys.exit(1)

    # If parameters is defined perform grid search
    if parameters is not None:
        clf = GridSearchCV(clf, parameters, n_jobs=-1)

    clf.fit(dataset['data'], dataset['target'])
    searched = isinstance(clf, GridSearchCV)

    # Log sorted feature importances with their names
    if classifier_type in ('random_forest', 'adaboost', 'extra_trees'):
        fitted_model = clf.best_estimator_ if searched else clf
        feat_dict = dict(zip(feat_names, fitted_model.feature_importances_))

        logging.info("Top 10 important features: ")
        sorted_f = sorted(feat_dict.items(),
                          key=lambda item: item[1],
                          reverse=True)
        for feat_name, importance in sorted_f[:10]:
            logging.info("\t{:.5f}: {}".format(importance, feat_name))

    # Save classifier and log best params found by grid search
    if searched:
        joblib.dump(clf.best_estimator_, classifier_output, compress=3)
        logging.info('Best classifier parameters found:')
        for k, v in clf.best_params_.items():
            logging.info('\t {}: {}'.format(k, v))
    else:
        joblib.dump(clf, classifier_output, compress=3)

    # Score the test set and split the probabilities by true label.
    feats, labels = read_rows(test_features)
    dataset = np.array(feats)
    prediction = clf.predict_proba(dataset)

    good = []
    wrong = []
    for label, pred in zip(labels, prediction):
        if label == 1:
            good.append(pred[1])
        else:
            wrong.append(pred[1])

    hgood = np.histogram(good, bins=np.arange(0, 1.1, 0.1))
    hwrong = np.histogram(wrong, bins=np.arange(0, 1.1, 0.1))

    return hgood[0].tolist(), hwrong[0].tolist()
print 'Accuracy = %s' % (float(equal)/len(Y_pred)) # Loading the dataset dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv' data = pd.read_csv(dataset_url, sep=';') # Splitting into training and testing datasets Y = data.quality X = data.drop('quality', axis=1) X_train, X_test, Y_train, Y_test = model_selection.train_test_split( X, Y, test_size=0.25, random_state=123, stratify=Y) # Preprocessing the Data pipeline = make_pipeline(preprocessing.StandardScaler(), RandomForestClassifier(n_estimators=500, criterion='entropy')) # Setting the HyperParameters hyperparameters = {'randomforestclassifier__max_features': [ 'auto', 'sqrt', 'log2'], 'randomforestclassifier__max_depth': [None, 5, 3, 1, 7]} # Fitting the classfier clf = GridSearchCV(pipeline, hyperparameters, cv=5) clf.fit(X_train, Y_train) # Making prediction on the test set Y_pred = clf.predict(X_test) # Calculating Accuracy accuracy(Y_test.tolist(), Y_pred)
from sklearn.linear_model import LogisticRegression import mglearn import matplotlib.pyplot as plt from sklearn.model_selection import GridSearchCV from sklearn.pipeline import make_pipeline data = pd.read_excel("Data/Data_TrainProcessed.xlsx", error_bad_lines=False, encoding='utf-8') y_score = (data['Rate'].values).reshape(-1, 1) binaray = Binarizer(threshold=3) y = binaray.fit_transform(y_score) y_train = np.array(y).flatten() pipe = make_pipeline( TfidfVectorizer(min_df=5, max_df=0.8, max_features=3000, sublinear_tf=True), LogisticRegression()) param_grid = {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10]} grid = GridSearchCV(pipe, param_grid, cv=5) grid.fit(data['Review'].values.astype('U'), y_train) vectorizer = grid.best_estimator_.named_steps["tfidfvectorizer"] # transform the training dataset X_train = vectorizer.transform(data['Review'].values.astype('U')) # find maximum value for each of the features over the dataset max_value = X_train.max(axis=0).toarray().ravel() sorted_by_tfidf = max_value.argsort() # get feature names feature_names = np.array(vectorizer.get_feature_names()) mglearn.tools.visualize_coefficients( grid.best_estimator_.named_steps["logisticregression"].coef_, feature_names,
X_array = np.asarray(X) X_array = np.asarray(X) is_all_zero = np.all(X_array == 0) if is_all_zero: print('array is all zeros') else: print('Array is good') choice_length = np.count_nonzero(~np.isnan(labels)) X, y = shuffle(X_array, labels) X = X[:choice_length] y = y[:choice_length].fillna(0) scaler = MinMaxScaler(feature_range=(-1, 1)) mm = make_pipeline(MinMaxScaler(), Normalizer()) X = mm.fit_transform(X) rbf_feature = RBFSampler(gamma=1.5, random_state=10) ps = PolynomialCountSketch(degree=11, random_state=1) X_rbf_features = rbf_feature.fit_transform(X) X_poly_features = ps.fit_transform(X) # We want to get TSNE embedding with 2 dimensions n_components = 3 tsne = TSNE(n_components) tsne_result = tsne.fit_transform(X_rbf_features) locationFileName = os.path.join( figuresDestination, str(sorted(symbols)[symbolIdx]) + '_idx_' + str(idx) + 'date_' + str(dateIdx) + '_' + str(labelName) + '_tsne_rbf_kernelised.png')
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn import metrics
import os

# Directories holding the CSV feature files. File names are appended by
# plain string concatenation, so the trailing slash is required.
train_prefix = "temp/final/train_5/"
test_prefix = "temp/final/test/"

# Each CSV row is parsed as: every column but the last is a feature value,
# the last column is the label. All values are converted to float.
input = []
label = []
for file in os.listdir(train_prefix):
    # Use a context manager so every file is closed once it has been parsed;
    # a bare open() here leaves one handle open per training file.
    with open(train_prefix + file, "r") as data:
        data_read = csv.reader(data)
        for lines in data_read:
            if not lines:
                # csv.reader yields an empty list for a blank line; there is
                # no label column to read, so skip it.
                continue
            input.append([float(elem) for elem in lines[0:-1]])
            label.append(float(lines[-1]))

X = np.array(input)
Y = np.array(label)

h = .02  # step size in the mesh

# Hold out 20% of the rows for evaluation (the split is not seeded, so it
# differs from run to run).
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

# Standardise the features, then fit a logistic regression with C=1e5.
std_clf = make_pipeline(StandardScaler(), linear_model.LogisticRegression(C=1e5))
std_clf.fit(X_train, y_train)
pred_test_std = std_clf.predict(X_test)

# Report accuracy on the held-out rows as a percentage.
print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test_std)))
data_cfg, **repr_cfg[utils_section]) # change y in case of classification if 'classification' == task_cfg[utils_section]['task']: log_scale = True if 'log' == data_cfg[csv_section]['scale'].lower().strip( ) else False y = task_cfg[utils_section]['cutoffs'](y, log_scale) test_y = task_cfg[utils_section]['cutoffs'](test_y, log_scale) training_features = x training_target = y testing_features = test_x # Average CV score on the training set was: 0.8734423037820145 exported_pipeline = make_pipeline( Binarizer(threshold=0.15000000000000002), ExtraTreesClassifier(bootstrap=False, criterion="entropy", max_depth=20, max_features=0.15000000000000002, max_samples=None, min_samples_leaf=2, min_samples_split=7, n_estimators=500)) # Fix random state for all the steps in exported pipeline set_param_recursive(exported_pipeline.steps, 'random_state', 666) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) print('Success.')
def calculate_score(deg, X, y):
    """Fit a scaled polynomial Bayesian ridge model and score it on its own data.

    The pipeline standardises the inputs, expands them with
    ``PolynomialFeatures(deg)`` and fits a ``BayesianRidge`` regressor.

    Parameters
    ----------
    deg
        Value passed to ``PolynomialFeatures`` (the polynomial degree).
    X
        Training inputs, forwarded to ``Pipeline.fit`` / ``Pipeline.score``.
    y
        Training targets, forwarded to ``Pipeline.fit`` / ``Pipeline.score``.

    Returns
    -------
    The value of ``pipe.score(X, y)``, i.e. the score on the SAME data the
    pipeline was fitted on (a training score, not a validation score).
    """
    # ``normalize=False`` is intentionally not passed: False was always the
    # default, and the parameter no longer exists in scikit-learn >= 1.2, so
    # passing it raises TypeError there. Scaling is done by StandardScaler.
    pipe = make_pipeline(
        StandardScaler(),
        PolynomialFeatures(deg),
        BayesianRidge(),
    )  # type: Pipeline
    pipe.fit(X, y)
    return pipe.score(X, y)
#Validation function n_folds = 5 def rmsle_cv(model): kf = KFold( n_folds, shuffle=True, random_state=42).get_n_splits(train.values) rmse = np.sqrt(-cross_val_score( model, train.values, y_train, scoring="neg_mean_squared_error", cv=kf)) print("rmse", rmse) return (rmse) # 模型 # LASSO Regression : lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1)) # Elastic Net Regression ENet = make_pipeline( RobustScaler(), ElasticNet( alpha=0.0005, l1_ratio=.9, random_state=3)) # Kernel Ridge Regression KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5) # Gradient Boosting Regression GBoost = GradientBoostingRegressor( n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10, loss='huber',
# Every rated row that is not part of the training sample becomes test data.
test = rated_df.loc[~rated_df.index.isin(train.index)]
print("Train rows: {}".format(len(train.index)))
print("Test rows: {}".format(len(test.index)))

# In[# Train Model with Data
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import LeaveOneOut
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

sc = StandardScaler()
mlc = MLPClassifier(activation='relu', random_state=1, nesterovs_momentum=True)
loo = LeaveOneOut()  # created here but not used by the grid search below (cv=5)
pipe = make_pipeline(sc, mlc)

# Train the model and check which combination of parameters works best.
# Keys are prefixed with the pipeline step name that make_pipeline derives
# from the estimator class ("mlpclassifier").
parameters = {
    "mlpclassifier__hidden_layer_sizes": [(300, ), (500, )],
    "mlpclassifier__solver": ("sgd", "lbfgs"),
    "mlpclassifier__max_iter": [500, 1000, 2000],
    "mlpclassifier__learning_rate_init": [0.001, 0.1]
}
MLPClassifierModel = GridSearchCV(pipe, parameters, n_jobs=-1, cv=5)
MLPClassifierModel.fit(train[features], train[target])

# Save the fitted search object to a file so it can be used later. The
# context manager guarantees the file is flushed and closed even if
# pickle.dump raises part-way through.
with open("test3_k_t_o_MLPClassifierModel_10_comp.pkl", 'wb') as file:
    pickle.dump(MLPClassifierModel, file)
# %%
# Scikit-learn provides an estimator called
# :class:`~sklearn.linear_model.LassoLarsIC` that uses either Akaike's
# information criterion (AIC) or the Bayesian information criterion (BIC) to
# select the best model. Before fitting this model, we will scale the
# dataset.
#
# In the following, we are going to fit two models to compare the values
# reported by AIC and BIC.
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoLarsIC
from sklearn.pipeline import make_pipeline

lasso_lars_ic = make_pipeline(
    StandardScaler(),
    LassoLarsIC(criterion="aic", normalize=False),
)
lasso_lars_ic.fit(X, y)


# %%
# To be in line with the definition in [ZHT2007]_, we need to rescale the
# AIC and the BIC. Indeed, Zou et al. are ignoring some constant terms
# compared to the original definition of AIC derived from the maximum
# log-likelihood of a linear model. You can refer to
# :ref:`mathematical detail section for the User Guide <lasso_lars_ic>`.
def zou_et_al_criterion_rescaling(criterion, n_samples, noise_variance):
    """Rescale the information criterion to follow the definition of Zou et al."""
    # Constant term: n * log(2 * pi * sigma^2).
    log_term = n_samples * np.log(2 * np.pi * noise_variance)
    return criterion - log_term - n_samples
data_path = os.path.join(os.path.expanduser('~'), 'kaggle/Caterpillar/data') dataset = 'br6lin' train = pd.read_csv(os.path.join(stage1, dataset, 'training.csv')) target = train.cost train.drop(['cost'], axis=1, inplace=True) tube_assembly_ids = pd.read_csv(os.path.join(data_path, 'train_set.csv'), usecols=['tube_assembly_id']) train['tube_assembly_id'] = tube_assembly_ids test = pd.read_csv(os.path.join(stage1, dataset, 'testing.csv')) test.drop(['cost'], axis=1, inplace=True) preprocess = make_pipeline( ScaleContinuousOnly(), # StandardScaler(), ) cwd = os.getcwd() refit_train_val_dir = 'refit_train_val' refit_train_dir = 'refit_train' #epoch_save_range = range(30, 101, 5) epoch_save_range = list(range(10, 11, 5)) t0 = time.time() n_folds = 3 do_folds = 3 def KLabelFold(labels, n_folds=3, shuffle=False, random_state=None): kfold = KFold(labels.nunique(), n_folds=n_folds, shuffle=shuffle, random_state=random_state)
# Centre the target around zero.
y -= y.mean()

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)

##############################################################################
# Partial Dependence computation for multi-layer perceptron
# ---------------------------------------------------------
#
# Let's fit a MLPRegressor and compute single-variable partial dependence
# plots

print("Training MLPRegressor...")
tic = time()
est = make_pipeline(
    QuantileTransformer(),
    MLPRegressor(
        hidden_layer_sizes=(50, 50),
        learning_rate_init=0.01,
        early_stopping=True,
    ),
)
est.fit(X_train, y_train)
print("done in {:.3f}s".format(time() - tic))
print("Test R2 score: {:.2f}".format(est.score(X_test, y_test)))

##############################################################################
# We configured a pipeline to scale the numerical input features and tuned the
# neural network size and learning rate to get a reasonable compromise between
# training time and predictive performance on a test set.
#
# Importantly, this tabular dataset has very different dynamic ranges for its
# features. Neural networks tend to be very sensitive to features with varying
# scales and forgetting to preprocess the numeric feature would lead to a very
# poor model.
#
"classifier__solver":['newton-cg','saga','sag','liblinear'] ##This solvers don't allow L1 penalty }, {"classifier": [RandomForestClassifier()], "classifier__n_estimators": [10, 100, 1000], "classifier__max_depth":[5,8,15,25,30,None], "classifier__min_samples_leaf":[1,2,5,10,15,100], "classifier__max_leaf_nodes": [2, 5,10]}] # create a gridsearch of the pipeline, the fit the best model gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1) # Fit grid search best_model = gridsearch.fit(X_train,y_train) print(best_model.best_estimator_) print("The mean accuracy of the model is:",best_model.score(X_test,y_test)) #MakePipelines In SKLearn from sklearn.pipeline import make_pipeline # Create a pipeline pipe = make_pipeline((RandomForestClassifier())) # Create dictionary with candidate learning algorithms and their hyperparameters grid_param = [ {"randomforestclassifier": [RandomForestClassifier()], "randomforestclassifier__n_estimators": [10, 100, 1000], "randomforestclassifier__max_depth":[5,8,15,25,30,None], "randomforestclassifier__min_samples_leaf":[1,2,5,10,15,100], "randomforestclassifier__max_leaf_nodes": [2, 5,10]}] # create a gridsearch of the pipeline, the fit the best model gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1) # Fit grid search best_model = gridsearch.fit(X_train,y_train) best_model.score(X_test,y_test)
# Report the size of the corpus.
print("%d sentences" % len(dataset.data))
print("%d categories" % len(dataset.target_names))
print()

print(
    "Extracting features from the training dataset using a sparse vectorizer")
t0 = time()

# Pick the vectorizer according to the command-line options:
#   no hashing           -> TfidfVectorizer
#   hashing with IDF     -> HashingVectorizer followed by TfidfTransformer
#   hashing without IDF  -> HashingVectorizer alone
if not opts.use_hashing:
    vectorizer = TfidfVectorizer(
        max_df=0.5,
        max_features=opts.n_features,
        min_df=2,
        stop_words='english',
        use_idf=opts.use_idf,
        norm=opts.norm,
    )
elif opts.use_idf:
    # Perform an IDF normalization on the output of HashingVectorizer;
    # the hasher itself is left un-normalised (norm=None) and the
    # requested norm is applied by the TfidfTransformer instead.
    hasher = HashingVectorizer(
        n_features=opts.n_features,
        stop_words='english',
        alternate_sign=False,
        norm=None,
        binary=False,
    )
    vectorizer = make_pipeline(hasher, TfidfTransformer(norm=opts.norm))
else:
    vectorizer = HashingVectorizer(
        n_features=opts.n_features,
        stop_words='english',
        alternate_sign=False,
        norm=opts.norm,
        binary=False,
    )

X = vectorizer.fit_transform(dataset.data)
# The TPOT search itself was disabled after it had been run once:
#pipeline_optimizer.fit(train, train_labels)

# ... and so was the export of the optimized code:
#pipeline_optimizer.export('tpot_titanic_pipeline.py')

# import libraries
from sklearn.pipeline import make_pipeline

# Re-create the pipeline found by TPOT.
# The original pipeline included a Binarizer and RBFSampler, which scored
# only 0.78947.
exported_pipeline = make_pipeline(
    RandomForestClassifier(
        bootstrap=False,
        criterion="gini",
        max_features=0.45,
        min_samples_leaf=14,
        min_samples_split=13,
        n_estimators=100,
    )
)

# Fit the pipeline on the train data, then predict on the test data
# (fit returns the pipeline itself, so the two calls can be chained).
results = exported_pipeline.fit(train, train_labels).predict(test)

# In[ ]:

# Build the submission dataframe: passenger ids plus the predicted label.
submit = df_test.loc[:, ['PassengerId']]
submit.loc[:, 'Survived'] = results
km.labels_, sample_size=1000, random_state=random_state) logger.debug("Silhouette Coefficient: %0.3f" % silhouette_score) logger.debug("Homogeneity: %0.3f" % homogeneity) text_to_display = 'homogeneity: %.2f\nsilhouette: %.2f' % ( homogeneity, silhouette_score) elif do_lsa: topic_model_name = 'LSA' n_components = 300 n_components = min(100, tfidf.shape[1] - 1) svd = TruncatedSVD(n_components, random_state=random_state) normalizer = Normalizer(copy=False) lsa = make_pipeline(svd, normalizer) lsa_results = lsa.fit_transform(tfidf) explained_variance = svd.explained_variance_ratio_.sum() logger.debug('Explained variance of the SVD step: %d' % int(explained_variance * 100)) true_k = tfidf.shape[0] * tfidf.shape[1] / tfidf.nnz logger.debug('we are looking for %d clusters' % true_k) verbose = False if minibatch: km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1, init_size=1000,
# 3. Load red wine data.
dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(dataset_url, sep=';')

# 4. Split data into training and test sets
#    ("quality" is the target; every other column is a feature).
y = data.quality
X = data.drop('quality', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=123,
                                                    stratify=y)

# 5. Declare data preprocessing steps
pipeline = make_pipeline(preprocessing.StandardScaler(),
                         RandomForestRegressor(n_estimators=100))

# 6. Declare hyperparameters to tune
#    Keys use the step name make_pipeline derives from the estimator class.
#    max_features=1.0 (consider every feature at each split) is what the
#    former 'auto' alias meant for regressors; 'auto' itself is rejected by
#    scikit-learn >= 1.3, so the explicit value keeps the same search space
#    while working on old and new releases alike.
hyperparameters = {
    'randomforestregressor__max_features': [1.0, 'sqrt', 'log2'],
    'randomforestregressor__max_depth': [None, 5, 3, 1]
}

# 7. Tune model using cross-validation pipeline
clf = GridSearchCV(pipeline, hyperparameters, cv=10)
clf.fit(X_train, y_train)

# 8. Refit on the entire training set
# No additional code needed if clf.refit == True (default is True)
# or indices provided. We will obtain as many subsets as the number of # transformers passed into the `ColumnTransformer`. # * It **transforms each subsets**. A specific transformer is applied to # each subset: it will internally call `fit_transform` or `transform`. The # output of this step is a set of transformed datasets. # * It then **concatenate the transformed datasets** into a single dataset. # The important thing is that `ColumnTransformer` is like any other # scikit-learn transformer. In particular it can be combined with a classifier # in a `Pipeline`: # %% from sklearn.linear_model import LogisticRegression from sklearn.pipeline import make_pipeline model = make_pipeline(preprocessor, LogisticRegression(max_iter=500)) # %% [markdown] # Starting from `scikit-learn 0.23`, the notebooks can display an interactive # view of the pipelines. # %% from sklearn import set_config set_config(display='diagram') model # %% [markdown] # The final model is more complex than the previous models but still follows # the same API (the same set of methods that can be called by the user): # # - the `fit` method is called to preprocess the data and then train the
def PolynomialLasso(degree=1, alpha=1):
    """Return a pipeline: polynomial expansion, standardisation, then Lasso.

    ``degree`` is forwarded to ``PolynomialFeatures`` (built without the bias
    column) and ``alpha`` to ``Lasso``.
    """
    expansion = PolynomialFeatures(degree=degree, include_bias=False)
    scaler = StandardScaler()
    regressor = Lasso(alpha=alpha)
    return make_pipeline(expansion, scaler, regressor)
def PolynomialRegression(degree=1):
    """Return a pipeline: polynomial expansion followed by linear regression.

    ``degree`` is forwarded to ``PolynomialFeatures``, which is built without
    the bias column.
    """
    expansion = PolynomialFeatures(degree=degree, include_bias=False)
    regressor = LinearRegression()
    return make_pipeline(expansion, regressor)
# Create separate object for target variable y = df.<feature> # Create separate object for input features X = df.drop('<target feature>', axis = 1) #Use to split data into train and test data train_test_split # Function for creating model pipelines from sklearn.pipeline import make_pipeline # For standardization from sklearn.preprocessing import StandardScaler #Good practice: create dictionary holding different algos pipelines = { 'lasso': make_pipeline(StandardScaler(), lasso()), ... } #Do same thing for hyperparameters grid; one per algo lasso_hyperparameters = { 'lasso__alpha' : [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10] } # Create hyperparameters dictionary hyperparameters = { 'lasso': lasso_hyperparameters, ... } # List tuneable hyperparameters of our Lasso pipeline
# The last column of the events array is used as the class label.
labels = epochs.events[:, -1]
evoked = epochs.average()

###############################################################################
# Decoding in tangent space with a logistic regression

n_components = 2  # pick some components

# Cross-validation generator: 10 shuffled folds with a fixed seed.
cv = KFold(n_splits=10, shuffle=True, random_state=42)
epochs_data = epochs.get_data()

clf = make_pipeline(
    XdawnCovariances(n_components),
    TangentSpace(metric="riemann"),
    LogisticRegression(),
)

# Out-of-fold predictions: every sample is predicted by the model that was
# fitted on the folds it does not belong to.
preds = np.zeros(len(labels))

for train_idx, test_idx in cv.split(epochs_data):
    y_train = labels[train_idx]
    y_test = labels[test_idx]

    # fit returns the pipeline itself, so predict can be chained onto it.
    preds[test_idx] = clf.fit(epochs_data[train_idx], y_train).predict(
        epochs_data[test_idx])

# Printing the results
acc = np.mean(preds == labels)
print("Classification accuracy: %f " % (acc))
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import Normalizer
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file.
# The path and the separator below are placeholders that must be filled in.
tpot_data = pd.read_csv(
    'PATH/TO/DATA/FILE',
    sep='COLUMN_SEPARATOR',
    dtype=np.float64,
)

# Everything except the 'target' column is a feature.
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.032380308137753055
# Steps: stacked random forest -> L1 row normalisation -> k-nearest neighbours.
exported_pipeline = make_pipeline(
    StackingEstimator(
        estimator=RandomForestClassifier(
            bootstrap=False,
            criterion="gini",
            max_features=0.55,
            min_samples_leaf=2,
            min_samples_split=4,
            n_estimators=100,
        )
    ),
    Normalizer(norm="l1"),
    KNeighborsClassifier(n_neighbors=45, p=1, weights="uniform"),
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
def test_bagging_with_pipeline():
    """Check that BaggingClassifier can be fitted with a Pipeline as base estimator."""
    # Each bagged pipeline receives 2 features (max_features=2) and then keeps
    # the single best one (k=1) before the tree is fitted.
    base_pipeline = make_pipeline(SelectKBest(k=1), DecisionTreeClassifier())
    estimator = BaggingClassifier(base_pipeline, max_features=2)
    estimator.fit(iris.data, iris.target)
# Random forest whose feature importances drive the feature selection.
rf = RandomForestClassifier(
    n_estimators=400,
    verbose=100,
    n_jobs=-1,
    random_state=0,
    max_samples=5000,
)
rf.fit(X_train_rf, y_train)

# prefit=True: reuse the forest fitted above instead of refitting it.
feature_selector = SelectFromModel(rf, prefit=True, max_features=100000)

# Draw a fixed-seed sample of 100000 rows to train the SVC on.
svc_set = pd.concat([X_train, y_train], axis=1).sample(100000, random_state=0)
svc_y = svc_set['label']

# Vectorise the sampled reviews, then keep only the selected features.
svc_X = feature_selector.transform(vectorizer.transform(svc_set['review']))

svc = SVC(cache_size=1000, random_state=0)
svc.fit(svc_X, svc_y)

# Chain the already-fitted steps so raw reviews can be scored directly.
final_pipe = make_pipeline(vectorizer, feature_selector, svc)

# 95.92% accuracy
final_pipe.score(X_test, y_test)

dump(final_pipe, 'trained_models/good_svc_pruned')
df = pd.read_csv('https://archive.ics.uci.edu/ml/' 'machine-learning-databases' '/breast-cancer-wisconsin/wdbc.data', header=None) X = df.loc[:, 2:].values y = df.loc[:, 1].values le = LabelEncoder() y = le.fit_transform(y) le.classes_ le.transform(['M', 'B']) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=1) pipe_lr = make_pipeline(StandardScaler(), LogisticRegression(solver='liblinear', penalty='l2', random_state=1)) train_sizes, train_scores, test_scores = learning_curve(estimator=pipe_lr, X=X_train, y=y_train, train_sizes=np.linspace(0.1, 1.0, 10), cv=10, n_jobs=1) train_mean = np.mean(train_scores, axis=1) train_std = np.std(train_scores, axis=1) test_mean = np.mean(test_scores, axis=1) test_std = np.std(test_scores, axis=1) plt.plot(train_sizes, train_mean,