def test_sparse_to_dense():
    """Check that DenseTransformer densifies a sparse TF-IDF matrix correctly."""
    sparse_tfidf = TfidfTransformer().fit_transform([[1, 2, 3]])
    # The check below is only meaningful if the input really is sparse.
    assert issparse(sparse_tfidf)

    densified = DenseTransformer().transform(sparse_tfidf)
    expected_row = np.array([[0.26726124, 0.53452248, 0.80178373]])
    assert np.allclose(densified, expected_row)
def getProcessing(self, config):
    """Return the named preprocessing step selected by ``config``.

    Args:
        config: Lookup key; 0 selects ``Normalizer``, 1 selects
            ``OneHotEncoder`` and 2 selects ``DenseTransformer``.

    Returns:
        A ``(name, transformer)`` tuple where ``name`` is
        ``'pre_' + str(config)`` and ``transformer`` is a fresh instance.

    Raises:
        KeyError: If ``config`` is not one of the known keys.
    """
    # Keep classes (not instances) in the table so that only the selected
    # transformer is constructed, and do not shadow the builtin ``map``.
    transformer_classes = {
        0: Normalizer,
        1: OneHotEncoder,
        2: DenseTransformer,
    }
    transformer = transformer_classes[config]()
    return ('pre_' + str(config), transformer)
def test_pipeline():
    """Grid-search a scaler -> DenseTransformer -> random-forest pipeline."""
    forest = RandomForestClassifier(n_estimators=10)
    search_space = [{'randomforestclassifier__n_estimators': [1, 5, 10]}]
    pipe = make_pipeline(StandardScaler(), DenseTransformer(), forest)

    # ``iid`` is only passed when scikit-learn is older than 0.24.1.
    search_kwargs = {'cv': 3, 'n_jobs': 1}
    if Version(sklearn_version) < Version("0.24.1"):
        search_kwargs['iid'] = False

    grid = GridSearchCV(pipe, search_space, **search_kwargs)
    grid.fit(X, y)
def stacking_classifier(classifiers, random_state=42):
    """Build a vectorize -> densify -> stacking-classifier pipeline.

    Args:
        classifiers: Iterable of indexable entries; the estimator is taken
            from index 1 of each entry.
        random_state: Seed passed to the logistic-regression meta classifier.

    Returns:
        A ``Pipeline`` with the steps ``'vect'``, ``'denser'`` and ``'sclf'``.
    """
    base_estimators = [entry[1] for entry in classifiers]
    meta = LogisticRegression(solver='lbfgs',
                              multi_class='auto',
                              random_state=random_state)
    sclf = StackingCVClassifier(classifiers=base_estimators,
                                meta_classifier=meta,
                                use_features_in_secondary=True)

    stages = [
        ('vect', init_vectorizer()),
        # Densify first: per the original author's note, StackingCV does not
        # work with sparse matrices (and they suspected this hurts results).
        ('denser', DenseTransformer()),
        ('sclf', sclf),
    ]
    return Pipeline(stages)
def fit(self, dataset, train_data):
    """Grid-search a feature-encoding + Keras classifier pipeline.

    Labels are obtained via ``dataset.labels_from(train_data)``. Categorical
    columns are one-hot encoded, numerical columns are standard-scaled and at
    most one textual column is hashed into 10000 n-gram features.

    Returns:
        The ``GridSearchCV`` object returned by fitting on ``train_data``.

    Raises:
        Exception: If ``dataset`` declares more than one textual column.
    """
    y_train = dataset.labels_from(train_data)

    if len(dataset.textual_columns) > 1:
        raise Exception(
            'Can only handle one textual column at the moment.')

    # With a textual column the transformer is forced to emit dense output
    # (threshold 0.0); otherwise the threshold is 0.3.
    if len(dataset.textual_columns) > 0:
        sparse_threshold = 0.0
        textual_column = dataset.textual_columns[0]
    else:
        sparse_threshold = 0.3
        textual_column = []

    encoders = [
        ('categorical_features',
         OneHotEncoder(handle_unknown='ignore'),
         dataset.categorical_columns),
        ('scaled_numeric',
         StandardScaler(),
         dataset.numerical_columns),
        ('textual_features',
         HashingVectorizer(ngram_range=(1, 3), n_features=10000),
         textual_column),
    ]
    feature_transformation = ColumnTransformer(
        transformers=encoders, sparse_threshold=sparse_threshold)

    make_keras_picklable()
    nn_model = keras.wrappers.scikit_learn.KerasClassifier(
        build_fn=self.create_model)

    pipeline = Pipeline([
        ('features', feature_transformation),
        ('todense', DenseTransformer()),
        ('learner', nn_model),
    ])

    # Only the two layer sizes are actually varied by the search.
    param_grid = {
        'learner__epochs': [50],
        'learner__batch_size': [1024],
        'learner__size_1': [4, 8],
        'learner__size_2': [2, 4],
        'learner__verbose': [1],
    }

    search = GridSearchCV(pipeline,
                          param_grid,
                          scoring=self.scoring,
                          cv=5,
                          verbose=2)
    return search.fit(train_data, y_train)
def build_sentiment(classifier, transformer, name, with_proba = True, **pmml_options):
    """Fit a sentiment PMML pipeline and store its model and predictions.

    The pipeline is transformer -> DenseTransformer -> SelectKBest(k=500)
    -> classifier, fitted on ``sentiment_X`` / ``sentiment_y``. The fitted
    pipeline is written with ``store_pmml`` and its predictions (plus class
    probabilities when ``with_proba`` is true) with ``store_csv``.
    """
    stages = [
        ("transformer", transformer),
        ("densifier", DenseTransformer()),
        ("selector", SelectKBest(f_classif, k = 500)),
        ("classifier", classifier),
    ]
    pipeline = PMMLPipeline(stages)
    pipeline.fit(sentiment_X, sentiment_y)
    pipeline.configure(**pmml_options)
    store_pmml(pipeline, name)

    predicted = DataFrame(pipeline.predict(sentiment_X), columns = ["Score"])
    if not with_proba:
        store_csv(predicted, name)
        return

    probabilities = DataFrame(pipeline.predict_proba(sentiment_X),
                              columns = ["probability(0)", "probability(1)"])
    store_csv(pandas.concat((predicted, probabilities), axis = 1), name)
def train_pipeline(X, y):
    """Assemble the feature union plus balanced random forest and fit it.

    Numerical, categorical, binary and two text branches are combined with
    ``make_union`` and fed to a ``BalancedRandomForestClassifier``. The
    fitted pipeline is returned.
    """
    numeric_features = [
        'Num nights', 'Adults', 'Children', 'Session duration', 'Sessions',
        'Avg. session length (sec)', 'Avg. pageviews per session',
        'Pageviews', 'Hits', 'Created to arrival',
    ]
    nominal_features = [
        'Language', 'Website', 'Enquiry type', 'Enquiry status',
        'Client budget', 'Country code', 'GA source', 'GA medium', 'Device',
        'Created month',
    ]
    flag_features = [
        'Flights booked', 'User agent', 'User repeat', 'User referral',
    ]
    # Carried over unchanged; neither list is referenced further down.
    text_col = ['Click path', 'GA keyword']
    target = ['is booking']

    def text_branch(column, vectorizer):
        # Select one text column, blank out missing values, reshape, then
        # vectorize and densify.
        return make_pipeline(
            ColumnSelector(cols=[column]),
            SimpleImputer(strategy='constant', fill_value=''),
            ReshapeTransformer(),
            vectorizer,
            DenseTransformer())

    numeric_branch = make_pipeline(
        ColumnSelector(cols=numeric_features),
        SimpleImputer(strategy="median"),
        StandardScaler())
    nominal_branch = make_pipeline(
        ColumnSelector(cols=nominal_features),
        SimpleImputer(strategy="constant", fill_value='None'),
        OneHotEncoder())
    flag_branch = make_pipeline(
        ColumnSelector(cols=flag_features),
        SimpleImputer(strategy="most_frequent"),
        BinaryEncoder())
    click_path_branch = text_branch('Click path',
                                    HashingVectorizer(n_features=2**11))
    keyword_branch = text_branch('GA keyword', TfidfVectorizer())

    feature_union = make_union(numeric_branch, nominal_branch, flag_branch,
                               click_path_branch, keyword_branch)

    forest_settings = dict(
        bootstrap=False,
        class_weight=None,
        criterion='gini',
        max_depth=60,
        max_features='sqrt',
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_samples_leaf=1,
        min_samples_split=5,
        min_weight_fraction_leaf=0.0,
        n_estimators=472,
        n_jobs=1,
        oob_score=False,
        random_state=None,
        replacement=False,
        sampling_strategy='auto',
        verbose=0,
        warm_start=False,
    )
    estimator = BalancedRandomForestClassifier(**forest_settings)

    predictive_pipeline = make_pipeline(feature_union, estimator)
    predictive_pipeline.fit(X, y)
    return predictive_pipeline
# Script fragment: split the data and declare two text-classification
# pipelines (count vectorizer -> tf-idf -> dense -> classifier).
print("_________________________________")
# Hold out 10% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.1, random_state=42)
# count_vect = CountVectorizer(max_features=5000, lowercase=True, ngram_range=(3, 3), analyzer="word")
count_vect = CountVectorizer(max_features=10000, min_df=1, tokenizer=nltk.word_tokenize)
# selectKBest = SelectKBest(chi2, k=1000)
# truncatedSVD = TruncatedSVD(n_components=5000, n_iter=15, random_state=42)
# combined_features = FeatureUnion([("chi2", truncatedSVD), ("univ_select", selectKBest)])
tfidf_transformer = TfidfTransformer()
# NOTE(review): `dense_transformer` is not used in the visible code; both
# pipelines below create their own DenseTransformer().
dense_transformer = DenseTransformer()
# NOTE(review): despite the `clf_LG` / 'lgc' names, the final step here is a
# RandomForestClassifier.
clf_LG = Pipeline([
    ('count_v', count_vect),
    ('tfidf', tfidf_transformer),
    # ('features', combined_features),
    ('to_dens', DenseTransformer()),
    ('lgc', RandomForestClassifier(max_depth=100, random_state=0))
])
# NOTE(review): this pipeline shares the same `count_vect` and
# `tfidf_transformer` instances as `clf_LG`.
# The literal continues past the end of the visible source.
clf_NB = Pipeline([
    ('count_v', count_vect),
    ('tfidf', tfidf_transformer),
    # ('features', combined_features),
    ('to_dens', DenseTransformer()),
    ('lnb', GaussianNB())
import re

import numpy as np
from mlxtend.preprocessing import DenseTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

# ``sklearn.grid_search`` no longer exists in current scikit-learn; prefer
# ``sklearn.model_selection`` and fall back only on installations that do
# not provide it.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

# Tiny toy corpus: four documents with binary labels.
X_train = np.array(['abc def ghi',
                    'this is a test',
                    'this is a test',
                    'this is a test'])
y_train = np.array([0, 0, 1, 1])

# The count vectorizer output is densified before it reaches the forest.
pipe_1 = Pipeline([
    ('vect', CountVectorizer()),
    ('to_dense', DenseTransformer()),
    ('clf', RandomForestClassifier()),
])

# Hyper-parameters addressed through the 'clf' step name.
parameters_1 = dict(
    clf__n_estimators=[50, 100, 200],
    clf__max_features=['sqrt', 'log2', None],
)

grid_search_1 = GridSearchCV(pipe_1,
                             parameters_1,
                             n_jobs=1,
                             verbose=1,
                             scoring='accuracy',
                             cv=2)
def test_pipeline():
    """Fit a grid search over a scaler -> DenseTransformer -> forest pipeline."""
    search_space = [{'randomforestclassifier__n_estimators': [1, 5, 10]}]
    forest = RandomForestClassifier()
    stages = (StandardScaler(), DenseTransformer(), forest)
    pipe = make_pipeline(*stages)

    # The test passes as long as fitting completes without raising.
    GridSearchCV(pipe, search_space, cv=3, n_jobs=1).fit(X, y)
# Predictions of the already-fitted classifier on the test and train splits.
ypred = knc.predict(xtest)
ypred1 = knc.predict(xtrain)
print(ypred)
print(list(le.inverse_transform(ypred)))
print(knc.predict_proba(xtest))
print(knc.score(xtrain, ytrain))
print(knc.kneighbors())
print(knc.kneighbors_graph())
# NOTE(review): r2_score is a regression metric; confirm it is intended here.
print(r2_score(ytest, ypred))

from sklearn.pipeline import make_pipeline
from sklearn.neighbors import NeighborhoodComponentsAnalysis
nca = NeighborhoodComponentsAnalysis(random_state=42)
from mlxtend.preprocessing import DenseTransformer
# The NCA + k-NN pipeline is only constructed and printed; it is not fitted.
nca_pipe = make_pipeline(NeighborhoodComponentsAnalysis(), KNeighborsClassifier())
print(nca_pipe)
dense = DenseTransformer()
print(dense.fit(xtrain, ytrain))
##xtrain,ytrain=dense.transform(xtrain,ytrain)
##print(nca.fit(xtrain,ytrain))
##knc.fit(nca.transform(xtrain,ytrain))
##print(knc.score(nca.transform(xtest,ytest))
##print(nca_pipe.fit(xtrain,ytrain))
##print(nca_pipe.score(xtrain,ytrain))

print(classification_report(ytest, ypred))
print(accuracy_score(ytest, ypred))
print(accuracy_score(ytrain, ypred1))
# scikit-learn metrics take (y_true, y_pred); the arguments used to be
# swapped here, which transposed the matrix (rows must be the true labels).
confusionmatrix = confusion_matrix(ytest, ypred)
print(confusionmatrix)
# Squared error is symmetric; the order only matches the calls above.
rmse = math.sqrt(mean_squared_error(ytest, ypred))
print(rmse)
plt.plot(ypred)
def test_dense_to_dense():
    """Check that transform() with return_copy=False yields values equal to X."""
    passthrough = DenseTransformer(return_copy=False)
    result = passthrough.transform(X)
    np.testing.assert_array_equal(X, result)
}
# The brace above closes a literal that starts before the visible source.

# Grid-search every model with its own parameter grid and report the best
# accuracy configuration.
for model_name, parameters in model_parameters.items():
    model = models[model_name]
    # Column-wise preprocessing: one-hot encode the categorical columns and
    # min-max scale the numerical ones.
    steps = list()
    steps.append(
        ('c', OneHotEncoder(handle_unknown='ignore'), cat_columns_train))
    steps.append(('n', MinMaxScaler(), num_columns_train))
    ct = ColumnTransformer(steps)
    # Wrap preprocessing, densification and the model in one pipeline; the
    # model step is registered under the model's own name.
    pipeline = Pipeline(steps=[('t', ct), ('to_dense', DenseTransformer()), (model_name, model)])
    # Run the grid search (accuracy scorer) and keep the best results.
    grid_search_acc = evaluate_model_gridsearch(X_train, y_train.values.ravel(), pipeline, scorer=scoring_method_accuracy, parameters=parameters)
    acc_best_model = grid_search_acc.best_estimator_
    acc_best_score = grid_search_acc.best_score_
    acc_best_params = grid_search_acc.best_params_
    grid_accuracy_scores.append(acc_best_score)
    print(model_name)
    print("- acc_best_score =", acc_best_score)
    print("acc_best parameters:")
    # The body of this loop lies beyond the visible source.
    for k, v in acc_best_params.items():
def gridsearch_with_classifiers_baseline(self):
    """Grid-search every configured classifier on count and tf-idf features.

    For ``CountVectorizer`` and ``TfidfVectorizer`` in turn, each
    ``(name, classifier, params)`` triple taken from ``self.names``,
    ``self.classifiers`` and ``self.parameters`` is wrapped in a
    vectorizer -> DenseTransformer -> classifier pipeline, tuned with a
    2-fold ``GridSearchCV`` on ``self.X_train`` / ``self.y_train`` and
    evaluated on ``self.X_test`` / ``self.y_test``.

    Returns:
        tuple: ``(class_report, results)``. ``class_report`` is a list of
        classification-report dicts extended with the keys ``'classifier'``,
        ``'parameters'``, ``'vectorizer'`` and ``'model'``. ``results`` is a
        list of one-element lists, each holding a dict with the predicted and
        actual labels for one vectorizer/classifier combination.
    """
    class_report = []
    results = []
    vectorizers = zip([CountVectorizer(), TfidfVectorizer()],
                      ["Count", "Tfidf"])
    for vec, n in vectorizers:
        # The format string used to contain the invalid escape "\{", which
        # printed a stray backslash.
        print("loaded the vectorizer: {}\n\n{}".format(n, vec))
        for name, classifier, params in zip(self.names, self.classifiers,
                                            self.parameters):
            logging.info("Starting gridsearch CV..")
            logging.info(
                "Classifier name: {}\n classifier:{}\n params{}\n".format(
                    name, classifier, params))

            clf_pipe = Pipeline([
                ('vect', vec),
                ('to_dense', DenseTransformer()),
                ('clf', classifier),
            ])
            gs_clf = GridSearchCV(clf_pipe, param_grid=params, cv=2)
            clf = gs_clf.fit(self.X_train, self.y_train)

            score = clf.score(self.X_test, self.y_test)
            logging.info("{} score: {}".format(name, score))
            logging.info("{} are the best estimators".format(
                clf.best_estimator_))

            # Predict once; the fitted search delegates to its best estimator.
            y_hats = clf.predict(self.X_test)

            # classification_report expects (y_true, y_pred). Passing them the
            # other way round swaps precision and recall in the report.
            results_to_dict = classification_report(self.y_test,
                                                    y_hats,
                                                    output_dict=True)
            results_to_dict['classifier'] = name
            results_to_dict['parameters'] = clf.best_params_
            results_to_dict['vectorizer'] = n
            results_to_dict['model'] = "baseline"
            logging.info(
                "Created dictionary with classification report: \n\n{}".
                format(results_to_dict))
            class_report.append(results_to_dict)

            # Kept as a one-element list to preserve the returned shape.
            results.append([{
                "predicted": y_hats,
                "actual": self.y_test.values,
                "classifier": name,
                "vectorizer": n,
                "model": "baseline"
            }])
    return class_report, results
def fit(self, dataset, train_data):
    """Grid-search a feature-encoding + small Keras network pipeline.

    Labels are obtained via ``dataset.labels_from(train_data)``. Categorical
    columns are one-hot encoded, numerical columns are standard-scaled and at
    most one textual column is hashed into 10000 n-gram features. The network
    is built by the nested ``create_model`` factory.

    Returns:
        The ``GridSearchCV`` object returned by fitting on ``train_data``.

    Raises:
        Exception: If ``dataset`` declares more than one textual column.
    """
    y_train = dataset.labels_from(train_data)

    if len(dataset.textual_columns) > 1:
        raise Exception(
            'Can only handle one textual column at the moment.')

    # With a textual column the transformer is forced to emit dense output
    # (threshold 0.0); otherwise the threshold is 0.3.
    if len(dataset.textual_columns) > 0:
        sparse_threshold = 0.0
        textual_column = dataset.textual_columns[0]
    else:
        sparse_threshold = 0.3
        textual_column = []

    encoders = [
        ('categorical_features',
         OneHotEncoder(handle_unknown='ignore'),
         dataset.categorical_columns),
        ('scaled_numeric',
         StandardScaler(),
         dataset.numerical_columns),
        ('textual_features',
         HashingVectorizer(ngram_range=(1, 3), n_features=10000),
         textual_column),
    ]
    feature_transformation = ColumnTransformer(
        transformers=encoders, sparse_threshold=sparse_threshold)

    def create_model(size_1, size_2):
        # Two ReLU hidden layers followed by a 2-unit softmax output.
        layers = [
            keras.layers.Dense(size_1, activation=tf.nn.relu),
            keras.layers.Dense(size_2, activation=tf.nn.relu),
            keras.layers.Dense(2, activation=tf.nn.softmax),
        ]
        nn = keras.Sequential(layers)
        # TODO figure out how to use roc_auc here...
        nn.compile(optimizer='adam',
                   loss='sparse_categorical_crossentropy',
                   metrics=['accuracy'])
        return nn

    nn_model = keras.wrappers.scikit_learn.KerasClassifier(
        build_fn=create_model)

    pipeline = Pipeline([
        ('features', feature_transformation),
        ('todense', DenseTransformer()),
        ('learner', nn_model),
    ])

    # Only the two hidden-layer sizes are actually varied by the search.
    param_grid = {
        'learner__epochs': [50],
        'learner__batch_size': [1024],
        'learner__size_1': [4, 8],
        'learner__size_2': [2, 4],
        'learner__verbose': [1],
    }

    search = GridSearchCV(pipeline,
                          param_grid,
                          scoring=self.scoring,
                          cv=5,
                          verbose=2)
    return search.fit(train_data, y_train)
# NOTE(review): the original indentation of this fragment is lost; the layout
# below (only `text.out()` inside the `if`) is an assumption to be confirmed.
if args.pred: #do i even need the if statement if it is a required argument
    text.out()
#
#report F1 score
# NOTE(review): `text.report_f1` is referenced without being called; if it is
# a method this prints the bound method, not a score -- confirm.
print("F1 Score is:", text.report_f1)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from mlxtend.preprocessing import DenseTransformer
# TF-IDF features are densified before being passed to the XGBoost classifier.
pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer()),
    ("densifier", DenseTransformer()),
    ("classifier", XGBClassifier(random_state = 13))
])
# NOTE(review): X_train / y_train are only assigned further down in this
# fragment; presumably they are defined earlier as well -- verify.
pipeline.fit(X_train, y_train)
# NOTE(review): `self` and `vectorizer` are not defined in the visible code;
# `vectorizer` may have been meant to be `tfidf` -- confirm.
tfidf = TfidfVectorizer(max_features=self.max_feat)
train['vector']=vectorizer.fit_transform(train['item_name'])
train=train.drop('item_name',axis=1)
# Separate the target column from the features.
y=train.category_id
train=train.drop('category_id',axis=1)
# Stratified 90/10 split.
X_train, X_test, y_train, y_test = train_test_split(train,y, test_size=0.10,stratify=y,random_state=42)
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from dataprep_1 import *
from dataprep_2 import cat_columns_train, num_columns_train

# Column-wise preprocessing: one-hot encode the categorical columns and
# min-max scale the numerical ones.
steps = [
    ('c', OneHotEncoder(handle_unknown='ignore'), cat_columns_train),
    ('n', MinMaxScaler(), num_columns_train),
]
ct = ColumnTransformer(steps)

# THIS IS TO DO
# WHat model?
# WHat parameters?
# TODO: Include tuned parameters
final_clf = LogisticRegression(C=0.1, penalty='none', random_state=42)

# Preprocess, densify, then classify.
pipeline = Pipeline(steps=[
    ('t', ct),
    ('to_dense', DenseTransformer()),
    ('insert-modelname', final_clf),
])
pipeline.fit(X_train, y_train.values.ravel())

final_prediction = pipeline.predict(X_test)
# TODO replace this with you own prediction
prediction = np.array(final_prediction)

# One predicted class per line, without index or header.
submission = pd.DataFrame(prediction)
submission.to_csv("GROUP_classes_problem_census.txt",
                  index=False,
                  header=False)
# Script fragment: subsample the training data, fit a Naive Bayes text
# pipeline and report its scores.
data, label, class_names = data_set.get_train_data_set()
# Draw 50000 distinct row indices.
# NOTE(review): if `random` is the stdlib module this raises ValueError when
# fewer than 50000 rows are available -- confirm the data size.
indexs = random.sample(range(len(data)), 50000)
data = data[indexs]
label = label[indexs]
# Hold out 33% of the subsample for testing.
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.33, random_state=42)
# NOTE(review): `count_vect`, `truncatedSVD` and `dense_transformer` are not
# used in the visible code; the pipeline below builds its own instances.
count_vect = CountVectorizer()
selectKBest = SelectKBest(k=2000)
truncatedSVD = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
# NOTE(review): `chi2` is called with no arguments here; if it is
# scikit-learn's chi2 score function this call will fail -- confirm what was
# intended as the first union member.
combined_features = FeatureUnion([("chi2", chi2()), ("univ_select", selectKBest)])
dense_transformer = DenseTransformer()
clf_NB = GaussianNB()
# Count features -> feature union -> dense matrix -> Gaussian Naive Bayes.
pipeline_NB = Pipeline([('count_v', CountVectorizer()), ('features', combined_features), ('to_dens', DenseTransformer()), ('clf', clf_NB)])
pipeline_NB = pipeline_NB.fit(X_train, y_train)
y_pred = pipeline_NB.predict(X_test)
# The test set is predicted again for each metric below instead of reusing
# `y_pred`.
print("F1 score - NB:", f1_score(y_test, pipeline_NB.predict(X_test), average='micro'))
print("Accuracy Score - NB:", accuracy_score(y_test, pipeline_NB.predict(X_test)))
cnf_matrix = confusion_matrix(y_test, y_pred)
plt.figure()
# NOTE(review): this assignment rebinds the name `plt`.
# The call continues past the end of the visible source.
plt = plot_confusion_matrix(cnf_matrix,
cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=1, random_state=42) # evaluate model scores = cross_val_score(model, X, y, scoring=scorer, cv=cv, n_jobs=6) return scores # evaluate each model for name, model in models.items(): # define steps steps = list() steps.append(('c', OneHotEncoder(handle_unknown='ignore'), cat_columns_train)) steps.append(('n', MinMaxScaler(), num_columns_train)) # one hot encode categorical, normalize numerical ct = ColumnTransformer(steps) # wrap the model i a pipeline pipeline = Pipeline(steps=[('t', ct), ('to_dense', DenseTransformer()), ('m', model)]) # evaluate the model and store results acc_score = evaluate_model(X_train, y_train.values.ravel(), pipeline, scorer=scoring_method_accuracy) accuracy_scores.append(np.mean(acc_score)) f1 = evaluate_model(X_train, y_train.values.ravel(), pipeline, scorer=scoring_method_f1) f1_scores.append(np.mean(f1)) auc_sco = evaluate_model(X_train, y_train.values.ravel(), pipeline, scorer=scoring_method_roc_auc) roc_auc_scores.append(np.mean(auc_sco)) model_names.append(name) # summarize performance print("acc score") print('>%s %.3f (%.3f)' % (name, mean(acc_score), std(acc_score))) print("f1 score") print('>%s %.3f (%.3f)' % (name, mean(f1), std(f1))) print("auc-roc score") print('>%s %.3f (%.3f)' % (name, mean(auc_sco), std(auc_sco)))