def classic_rsearch(x, y):
    """Randomized hyperparameter search over four classic models (LR-l1, LR-l2, SVM, RF).

    :param x: feature matrix
    :param y: labels
    :return: tuple ``(results, models)`` — ``results`` maps model name to
        ``cv_results_``; ``models`` holds the fitted search objects in the
        same order as the estimator list.
    """
    from scipy.stats import uniform as sp_rand
    from scipy.stats import randint as sp_randint

    lr1 = LR(warm_start=True, penalty='l1', verbose=100, max_iter=5000)
    lr2 = LR(warm_start=True, penalty='l2', verbose=100, max_iter=5000)
    svm = SVM(verbose=True, probability=False, max_iter=5000)
    rf = RF(warm_start=True, verbose=100)

    # random search params
    lr_params = {'C': sp_rand(1, 1e5)}
    rf_params = {'criterion': ['gini', 'entropy'],
                 'n_estimators': sp_randint(10, 200),
                 'max_features': ['auto', 'sqrt', 'log2', None]}
    svm_params = {'kernel': ['rbf', 'poly'],
                  'C': sp_rand(1, 1e5),
                  'gamma': sp_rand(1e-5, 1)}
    # NOTE: an unused mlp_params grid was removed (no MLP estimator here).

    results = {}
    models = []
    lst = [lr1, lr2, svm, rf]
    # BUG FIX: the original had only 3 names for 4 estimators, so the loop
    # raised IndexError on the last model; both LR variants now get a name.
    names = ['LR1', 'LR2', 'SVM', 'RF']
    params = [lr_params, lr_params, svm_params, rf_params]
    for idx in range(len(lst)):
        n_iter_search = 60
        start = time.time()
        rsearch = random_search(estimator=lst[idx],
                                param_distributions=params[idx],
                                n_iter=n_iter_search, scoring='roc_auc',
                                fit_params=None, n_jobs=1, iid=True,
                                refit=True, cv=5, verbose=0, random_state=8)
        rsearch.fit(x, y)
        models.append(rsearch)
        results[names[idx]] = rsearch.cv_results_
        print(names[idx] + " results complete.")
        print("RandomizedSearchCV took %.2f seconds for %d candidates"
              " parameter settings." % ((time.time() - start), n_iter_search))
    # BUG FIX: the original returned an undefined variable `data`.
    return (results, models)
def pick_best_features(df):
    """Randomized search to find the best SGD alpha per output column. TODO refactor.

    :param df: full training dataframe; only the first 25% of rows are used
    :return: dict mapping each output name to its best fitted estimator
    """
    #X = sample_data_random(df, .25)
    X = df[0:int(df.shape[0] * .25)]
    overfit_models = dict()
    for out in outputs:
        print(out)  # FIX: was a Python-2 print statement
        pipe_clf = CustomPipeline.get_transforms()
        clf = SGDClassifier(loss='log')
        tuned_parameters = {'alpha': sp_rand()}
        # NOTE(review): current scikit-learn names this scorer 'neg_log_loss';
        # the bare 'log_loss' string only works on very old versions — confirm
        # the pinned sklearn version before changing it.
        score = 'log_loss'
        tran_x = pipe_clf.fit_transform(X)
        grid = RandomizedSearchCV(clf, tuned_parameters, cv=5, scoring=score)
        grid.fit(tran_x, X[out])
        print(grid.best_estimator_)  # FIX: was a Python-2 print statement
        overfit_models[out] = grid.best_estimator_
    return overfit_models
def set_hyperparameters(self):
    """Populate ``self.p_grid`` with the logistic-regression search space."""
    grid = dict(
        C=sp_rand(),
        penalty=['l1', 'l2'],
        solver=['liblinear', 'saga'],
        max_iter=sp_randint(1, 3000),
    )
    self.p_grid = grid
def svmClassifier(x, y, folds):
    """Tune a linear SVM's C via randomized search, then evaluate the tuned model.

    :param x: features
    :param y: labels
    :param folds: number of stratified CV folds
    """
    n_iter_search = 50
    skf = StratifiedKFold(folds)
    clf = SVC(kernel='linear', random_state=7)
    pipe = make_pipeline(preprocessing.StandardScaler(), clf)
    # Hyper parameters
    C = sp_rand()
    param_dist = dict(svc__C=C)
    random_search = RandomizedSearchCV(pipe, param_distributions=param_dist,
                                       n_iter=n_iter_search, cv=skf,
                                       scoring='roc_auc')
    random_search.fit(x, y)
    tuned_params = random_search.best_params_
    print(tuned_params)  # FIX: was a Python-2 print statement
    # Tuned Hyper parameters
    C = tuned_params['svc__C']
    # print('\nSVM(Linear) Best Tuned Model')
    tuned_clf = SVC(kernel='linear', C=C, random_state=7)
    modelEvaluator(tuned_clf, x, y, folds)
def Predict(self, userId):
    """Fit a per-user genre model and return predicted ratings for unrated movies.

    :param userId: id used to select this user's rows from ``my_ratings``
    :return: dataframe of unrated movies sorted by predicted rating ('YOU'),
        best first, merged with movie titles
    """
    param_grid = {'alpha': sp_rand()}
    search = RandomizedSearchCV(estimator=AnalysisappConfig.model,
                                param_distributions=param_grid,
                                n_iter=200, cv=20,
                                random_state=42)  # model: Lasso
    # Select this user's rating rows by userId.
    YOU = my_ratings[my_ratings['userId'] == userId]
    search.fit(YOU[genre_cols], YOU['rating'])  # genre columns as features
    #rsearch.best_estimator_.alpha
    intercept = search.best_estimator_.intercept_
    coef = search.best_estimator_.coef_
    """
    you_profile = pd.DataFrame([intercept, *coef], # per-genre coefficients as a user profile
                               index=['intercept', *genre_cols],
                               columns=['score'])
    """
    scores = search.best_estimator_.predict(genres)
    genres['YOU'] = scores
    unseen = genres[~genres.index.isin(YOU['movieId'])]
    rating_predictions = unseen.sort_values('YOU', ascending=False)
    rating_predictions = rating_predictions.merge(
        movies[['movieId', 'title']], left_index=True, right_on='movieId')
    # Predicted ratings — caller can slice best, worst, or whatever it needs.
    return rating_predictions
def tune_logistic_regression(X_train, y_train):
    """Tune a wrapped LogisticRegression via the shared tuning helper.

    The ``estimator__``-prefixed keys target the inner estimator of whatever
    wrapper ``tune_hyper_parameters`` builds around the model.
    """
    search_space = dict(
        estimator__penalty=['l1', 'l2'],
        estimator__class_weight=['balanced', None],
        estimator__C=sp_rand(),
    )
    base_model = LogisticRegression(solver='liblinear')
    tune_hyper_parameters(base_model, search_space, X_train, y_train)
def RandTest():
    """Randomized-search demo: sample Ridge ``alpha`` from U(0,1) on globals X, y."""
    # Uniform distribution to sample for the alpha parameter.
    alpha_space = {'alpha': sp_rand()}
    # Fit a ridge regression model while testing random alpha values.
    ridge = Ridge()
    searcher = RandomizedSearchCV(estimator=ridge,
                                  param_distributions=alpha_space,
                                  n_iter=100)
    searcher.fit(X, y)
    print(searcher)
    # Summarize the results of the random parameter search.
    print(searcher.best_score_)
    print(searcher.best_estimator_.alpha)
def train_classiifer(X_train, y_train, to_tune, classifier):
    """Train a regressor, optionally tuning SVR hyperparameters first.

    :param X_train: training features
    :param y_train: training targets
    :param to_tune: when True (and no explicit classifier is given), run a
        randomized search over C/gamma before the final fit
    :param classifier: optional pre-built estimator; when truthy it is used
        as-is and tuning is disabled
    :return: the fitted estimator
    """
    # Initialize Classifier.
    # BUG FIX: the original assigned `clf = BayesianRidge()` and a matching
    # {'alpha_1', 'alpha_2'} param grid and immediately overwrote both —
    # those dead stores are removed here.
    clf = SVR(kernel='rbf', C=1e3, gamma=0.1)
    #clf = RandomForestRegressor()
    if classifier:
        clf = classifier
        to_tune = False
    if to_tune:
        # Randomized search: find optimal SVR parameters.
        param_grid = {'C': sp_rand(), 'gamma': sp_rand()}
        rsearch = RandomizedSearchCV(estimator=clf,
                                     param_distributions=param_grid,
                                     n_iter=5000)
        rsearch.fit(X_train, y_train)
        # Use tuned classifier.
        clf = rsearch.best_estimator_
    # Trains Classifier
    clf.fit(X_train, y_train)
    return clf
def r_search(x, y):
    """Randomized search over LR/RF/SVM/GB on a balanced subsample.

    :return: dict mapping model label to its ``cv_results_``
    """
    # random search params
    lr_params = {'penalty': ['l1', 'l2'], 'C': sp_rand(1e-5, .1)}
    svm_params = {'kernel': ['rbf', 'linear'], 'C': sp_rand(10, 1e5)}
    rf_params = {'criterion': ['gini', 'entropy'],
                 'n_estimators': sp_randint(50, 200),
                 'bootstrap': [True, False]}
    gbc_params = {'learning_rate': sp_rand(1e-6, 1e-1),
                  'n_estimators': sp_randint(50, 200),
                  'loss': ['deviance', 'exponential']}
    data = {}
    xs, ys = balanced_subsample(x, y)
    estimators = [LR(verbose=1), RF(verbose=1), SVM(verbose=True), GBC(verbose=1)]
    labels = ['LR', 'RF', 'SVM', 'GB']
    grids = [lr_params, rf_params, svm_params, gbc_params]
    n_iter_search = 60
    for label, model, grid in zip(labels, estimators, grids):
        start = time.time()
        rsearch = random_search(estimator=model,
                                param_distributions=grid,
                                n_iter=n_iter_search, scoring='roc_auc',
                                fit_params=None, n_jobs=1, iid=True,
                                refit=True, cv=5, verbose=0, random_state=8)
        rsearch.fit(xs, ys)
        data[label] = rsearch.cv_results_
        print(label + " results complete.")
        print("RandomizedSearchCV took %.2f seconds for %d candidates"
              " parameter settings." % ((time.time() - start), n_iter_search))
    return (data)
def r_search(x, y, input_shape):
    """Randomized search over MLP/LSTM/CNN builders on a balanced subsample.

    :param input_shape: shape passed to each network builder
    :return: dict mapping model label to its ``cv_results_``
    """
    # random search params
    mlp_params = {'units': [64, 128, 256, 512], 'rate': sp_rand(.2, .9)}
    lstm_params = {'units': [64, 128, 256, 512], 'rate': sp_rand(.2, .9)}
    cnn_params = {'filters': [32, 64, 128, 256, 512],
                  'filter_length': [2, 3, 4, 5, 6],
                  'pool_size': [2, 3]}
    data = {}
    xs, ys = balanced_subsample(x, y)
    builders = [mlp_train(input_shape),
                lstm_train(input_shape),
                cnn_train(input_shape)]
    labels = ['MLP', 'LSTM', 'CNN']
    grids = [mlp_params, lstm_params, cnn_params]
    n_iter_search = 60
    for label, model, grid in zip(labels, builders, grids):
        start = time.time()
        rsearch = random_search(estimator=model,
                                param_distributions=grid,
                                n_iter=n_iter_search, scoring='roc_auc',
                                fit_params=None, n_jobs=1, iid=True,
                                refit=True, cv=3, verbose=10, random_state=8)
        rsearch.fit(xs, ys)
        data[label] = rsearch.cv_results_
        print(label + " results complete.")
        print("RandomizedSearchCV took %.2f seconds for %d candidates"
              " parameter settings." % ((time.time() - start), n_iter_search))
    return (data)
def cross_validate(train_features, targets_train, iters):
    """Randomized search over MultinomialNB's ``alpha``.

    :param iters: number of sampled parameter settings
    :return: the best alpha value found (a float, not the model)
    """
    print('starting cross validation')
    distributions = {'alpha': sp_rand()}
    nb_model = MultinomialNB()
    searcher = RandomizedSearchCV(estimator=nb_model,
                                  param_distributions=distributions,
                                  n_iter=iters)
    searcher.fit(train_features, targets_train)
    print('finished cross validation')
    print('best model has a score of {} using alpha={}'.format(
        searcher.best_score_, searcher.best_estimator_.alpha))
    return searcher.best_estimator_.alpha
def test_random_parameters(X, y):
    """Randomly sample Ridge ``alpha`` values instead of exhaustive search.

    Sometimes it is more efficient to draw a parameter at random from a
    range, score the model, and keep the best draw.
    """
    # Uniform distribution to sample for the alpha parameter.
    distributions = {'alpha': sp_rand()}
    # Fit a ridge regression model while testing random alpha values.
    ridge = Ridge()
    searcher = RandomizedSearchCV(estimator=ridge,
                                  param_distributions=distributions,
                                  n_iter=100)
    searcher.fit(X, y)
    print(searcher)
    # Summarize the results of the random parameter search.
    print(searcher.best_score_)
    print(searcher.best_estimator_.alpha)
def optimize_params():
    """Demonstrate hyperparameter selection for Ridge regression on iris,
    first with an exhaustive grid search, then with randomized search."""
    import numpy as np
    from sklearn import datasets
    from sklearn.linear_model import Ridge
    # BUG FIX: sklearn.grid_search was removed in scikit-learn 0.20; the
    # search classes live in sklearn.model_selection.
    from sklearn.model_selection import GridSearchCV
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    # prepare a range of alpha values to test
    alphas = np.array([1, 0.1, 0.01, 0.001, 0.0001, 0])
    # create and fit a ridge regression model, testing each alpha
    model = Ridge()
    grid = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))
    # print(dict(alpha=alphas))
    grid.fit(X, y)
    print(grid)
    # summarize the results of the grid search
    print(grid.best_score_)
    print(grid.best_estimator_.alpha)

    # Randomly sampling a parameter from a given interval, scoring, and
    # keeping the best draw can be more efficient than a full grid.
    from scipy.stats import uniform as sp_rand
    from sklearn.model_selection import RandomizedSearchCV
    # prepare a uniform distribution to sample for the alpha parameter
    param_grid = {"alpha": sp_rand()}
    # create and fit a ridge regression model, testing random alpha values
    model = Ridge()
    rsearch = RandomizedSearchCV(estimator=model,
                                 param_distributions=param_grid,
                                 n_iter=100)
    rsearch.fit(X, y)
    print(rsearch)
    # summarize the results of the random parameter search
    print(rsearch.best_score_)
    print(rsearch.best_estimator_.alpha)
def contentsbased1(user_id, movie_id, genres_p):
    """Predict a user's score for every movie from genre features via a
    Lasso model tuned with randomized search.

    :param user_id: id of the user whose ratings are loaded from MySQL
    :param movie_id: movie whose predicted score is printed
    :param genres_p: genre feature dataframe indexed by movie id
    :return: JSON string of the per-movie 'predict' column
    """
    print('======== 전체영화 예상평점 - 장르 ===========')
    print('START TIME : ', str(datetime.now())[10:19])
    start = time.time()
    conn = pymysql.connect(host=config('HOST'), port=3306,
                           user=config('USER'),
                           password=config('PASSWORD'), db=config('DB'))
    # SECURITY FIX: parameterize the query instead of concatenating user_id
    # into the SQL string (injection-prone and type-unsafe).
    sql = 'SELECT * FROM wouldyouci.accounts_rating where user_id=%s'
    ratings = pd.read_sql_query(sql, conn, params=(user_id,))
    genres = genres_p
    conn.close()
    user_profile = ratings.merge(genres, left_on='movie_id', right_index=True)
    model = Lasso()
    param_grid = {'alpha': sp_rand()}
    research = RandomizedSearchCV(estimator=model,
                                  param_distributions=param_grid,
                                  n_iter=20, cv=5, random_state=406)
    research.fit(user_profile[genres.columns], user_profile['score'])
    predictions = research.best_estimator_.predict(genres)
    # BUG FIX: removed `genres.reset_index()` — the result was discarded
    # (reset_index is not in-place), so the call was a no-op.
    genres['predict'] = predictions
    predicted_score = genres.at[movie_id, 'predict']
    print('END TIME : ', str(datetime.now())[10:19])
    end = time.time()
    print('TOTAL TIME : ', end - start)
    print('PREDICTED SCORE : ', predicted_score)
    print()
    return pd.DataFrame.to_json(genres['predict'])
def get_params(classifier_name):
    """Return the randomized-search parameter space for a classifier code.

    :param classifier_name: one of 'DT', 'RF', 'AB', 'GB', 'LR'
    :return: dict of parameter name -> distribution or value list
    :raises KeyError: for an unknown code
    """
    dt_space = dict(
        max_features=sp_rand(0.1, 0.9),
        min_samples_leaf=sp_randint(1, 21),
        min_samples_split=sp_randint(2, 21),
        criterion=['entropy', 'gini'],
    )
    rf_space = dict(
        n_estimators=sp_randint(1, 100),
        bootstrap=[True, False],
        max_features=sp_rand(0.1, 0.9),
        min_samples_leaf=sp_randint(1, 21),
        min_samples_split=sp_randint(2, 21),
        criterion=['entropy', 'gini'],
    )
    ab_space = dict(
        base_estimator__max_depth=sp_randint(1, 11),
        algorithm=['SAMME', 'SAMME.R'],
        n_estimators=sp_randint(50, 501),
        learning_rate=sp_rand(0.01, 2),
    )
    gb_space = dict(
        learning_rate=sp_rand(0.01, 1),
        criterion=['friedman_mse', 'mse'],
        n_estimators=sp_randint(50, 501),
        max_depth=sp_randint(1, 11),
        min_samples_split=sp_randint(2, 21),
        min_samples_leaf=sp_randint(1, 21),
        max_features=sp_rand(0.1, 0.9),
    )
    lr_space = dict(
        penalty=['l2', 'none'],
        solver=['newton-cg', 'lbfgs', 'sag', 'saga'],
        C=sp_rand(2**(-5), 2**15),
    )
    return {
        'DT': dt_space,
        'RF': rf_space,
        'AB': ab_space,
        'GB': gb_space,
        'LR': lr_space,
    }[classifier_name]
def guessYouLikeIt(self, userId):
    """Fit a per-user Lasso model over genre features and print the top-5
    recommended (unrated) movies for that user.

    :param userId: id used to select this user's rows from ``my_ratings``
    """
    model = Lasso()  # base model
    param_grid = {'alpha': sp_rand()}
    rsearch = RandomizedSearchCV(estimator=model,
                                 param_distributions=param_grid,
                                 n_iter=200, cv=20, random_state=42)
    # Select this user's rating rows by userId.
    YOU = my_ratings[my_ratings['userId'] == userId]
    rsearch.fit(YOU[genre_cols], YOU['rating'])  # genre columns as features
    # BUG FIX: removed the bare no-op expression statement
    # `rsearch.best_estimator_.alpha` (its value was discarded).
    intercept = rsearch.best_estimator_.intercept_
    coef = rsearch.best_estimator_.coef_
    # User profile: per-genre coefficients. NOTE(review): currently unused
    # beyond construction — kept for parity with the original.
    you_profile = pd.DataFrame(
        [intercept, *coef],
        index=['intercept', *genre_cols],
        columns=['score'])
    predictions = rsearch.best_estimator_.predict(genres)
    genres['YOU'] = predictions
    rating_predictions = genres[~genres.index.isin(YOU['movieId']
                                                   )].sort_values(
        'YOU', ascending=False)
    rating_predictions = rating_predictions.merge(
        movies[['movieId', 'title']], left_index=True, right_on='movieId')
    Top5 = rating_predictions.sort_values(by='YOU',
                                          ascending=False)[:5]  # top-5 picks
    Worst5 = rating_predictions.sort_values(
        by='YOU', ascending=True)[:5]  # bottom-5 (computed but not printed)
    print(Top5)  # predicted ratings — could show best, worst, etc.
def logisicRegression(x, y, folds):
    """Tune LogisticRegression (C, penalty, solver, max_iter) by randomized
    search inside a scaling pipeline, then evaluate the tuned model.

    :param folds: number of stratified CV folds
    """
    n_iter_search = 50
    skf = StratifiedKFold(folds)
    clf = LogisticRegression(random_state=7)
    pipe = make_pipeline(preprocessing.StandardScaler(), clf)
    # Hyper parameters
    C_range = sp_rand()
    penalty_options = ['l1', 'l2']
    solver = ['liblinear', 'saga']
    max_iter = sp_randint(1, 3000)
    param_dist = dict(logisticregression__C=C_range,
                      logisticregression__penalty=penalty_options,
                      logisticregression__solver=solver,
                      logisticregression__max_iter=max_iter)
    random_search = RandomizedSearchCV(pipe, param_distributions=param_dist,
                                       n_iter=n_iter_search, cv=skf,
                                       scoring='roc_auc')
    random_search.fit(x, y)
    tuned_params = random_search.best_params_
    print(tuned_params)  # FIX: was a Python-2 print statement
    # Tuned Hyper parameters
    penalty_options = tuned_params['logisticregression__penalty']
    C_range = tuned_params['logisticregression__C']
    solver = tuned_params['logisticregression__solver']
    max_iter = tuned_params['logisticregression__max_iter']
    print('\nLR Best Tuned Model')
    tuned_clf = LogisticRegression(penalty=penalty_options, C=C_range,
                                   random_state=7, solver=solver,
                                   max_iter=max_iter)
    modelEvaluator(tuned_clf, x, y, folds)
def contentsbased_onscreen(user_id):
    """Return up to 10 currently-screening movie ids ranked by the user's
    predicted genre-based score (highest first).

    :param user_id: id used to fetch this user's ratings
    :return: list of movie ids
    """
    genres = MoviesConfig.genre_pickle
    ratings = pd.DataFrame(
        list(Rating.objects.filter(user=user_id).values('score', 'movie_id')))
    user_profile = ratings.merge(genres, left_on='movie_id', right_index=True)
    model = Lasso()
    param_grid = {'alpha': sp_rand()}
    research = RandomizedSearchCV(estimator=model,
                                  param_distributions=param_grid,
                                  n_iter=30, cv=5, random_state=406)
    research.fit(user_profile[genres.columns], user_profile['score'])
    predictions = research.best_estimator_.predict(genres)
    # BUG FIX: removed `genres.reset_index()` — the result was discarded
    # (reset_index is not in-place), so the call was a no-op.
    genres['predict'] = predictions
    onscreen_id_set = Movie.objects.exclude(onscreens=None).exclude(
        genres=None)
    onscreen_id_set = onscreen_id_set.values_list('id', flat=True)
    score_info = []
    for _id in onscreen_id_set:
        try:
            score_info.append((_id, genres.at[_id, 'predict']))
        except KeyError:
            # Movie missing from the genre pickle — pickle needs updating.
            print(f'{_id}가 피클에 없어요. genre 피클을 업데이트 해야합니다.')
    # Keep the 10 highest-scoring onscreen movies.
    score_info = sorted(score_info, key=lambda x: -x[1])[:10]
    onscreen_id_set = [x for x, y in score_info]
    return onscreen_id_set
def main():
    """Build a genre-based Lasso profile for user 3 from perfume ratings and
    print predicted ratings for perfumes the user has not rated."""
    # print(sys.argv[0])
    # print(sys.argv[1])
    # print("cani cani? hi cani?")
    perfumes = pd.read_csv('perfumes-test-re.csv')
    types_dummies = perfumes['type'].str.get_dummies(sep="|")
    types_dummies.to_pickle('types.p')
    my_ratings = pd.read_csv('added-rating.csv')
    types = pd.read_pickle('types.p')
    merged = my_ratings.merge(perfumes, on='perfumeId')
    my_ratings_came = merged.merge(types, left_on='perfumeId',
                                   right_index=True)
    my_ratings = my_ratings_came
    user3 = my_ratings_came[my_ratings_came['userId'] == 3]
    type_cols = types.columns
    model = Lasso()
    param_grid = dict(alpha=sp_rand())
    searcher = RandomizedSearchCV(estimator=model,
                                  param_distributions=param_grid,
                                  n_iter=100, cv=13, random_state=42)
    searcher.fit(user3[type_cols], user3['rating'])
    intercept = searcher.best_estimator_.intercept_
    coef = searcher.best_estimator_.coef_
    user3_profile = pd.DataFrame([intercept, *coef],
                                 index=['intercept', *type_cols],
                                 columns=['score'])
    predictions = searcher.best_estimator_.predict(types)
    types['user3'] = predictions  # add a 'user3' column to the type table
    unrated = types[~types.index.isin(user3['perfumeId'])]  # perfumes user 3 has not rated
    rating_predictions = unrated.sort_values('user3', ascending=False)
    ratings_predictions = rating_predictions.merge(
        perfumes[['perfumeId', 'name']], left_index=True,
        right_on='perfumeId')
    print(ratings_predictions.head())
    #test = pd.read_csv('test12.csv')
    #print(test)
    # print("data_comp")
    sys.stdout.flush()
def adaBoostClassifier(x, y, folds):
    """Tune AdaBoost (n_estimators, learning_rate, algorithm) by randomized
    search inside a scaling pipeline, then evaluate the tuned model.

    :param folds: number of stratified CV folds
    """
    print("\nAda Boost Classifier -Best Tuned Model")
    clf = AdaBoostClassifier(random_state=7)
    pipe = make_pipeline(preprocessing.StandardScaler(), clf)
    skf = StratifiedKFold(folds)
    n_iter_search = 50
    # Hyper parameters
    n_estimators = sp_randint(1, 300)
    learning_rate = sp_rand()
    algorithm = ['SAMME', 'SAMME.R']
    param_dist = dict(adaboostclassifier__n_estimators=n_estimators,
                      adaboostclassifier__learning_rate=learning_rate,
                      adaboostclassifier__algorithm=algorithm)
    random_search = RandomizedSearchCV(pipe, param_distributions=param_dist,
                                       n_iter=n_iter_search, cv=skf,
                                       scoring='roc_auc', verbose=1)
    random_search.fit(x, y)
    tuned_params = random_search.best_params_
    print(tuned_params)  # FIX: was a Python-2 print statement
    # Tuned Hyper parameters
    n_estimators = tuned_params['adaboostclassifier__n_estimators']
    learning_rate = tuned_params['adaboostclassifier__learning_rate']
    algorithm = tuned_params['adaboostclassifier__algorithm']
    print('\nAB Best Tuned Model')
    tuned_clf = AdaBoostClassifier(n_estimators=n_estimators,
                                   learning_rate=learning_rate,
                                   algorithm=algorithm,
                                   random_state=7)
    modelEvaluator(tuned_clf, x, y, folds)
def search(examples, fd_train, eg_train, iterations):
    """Beginnings of a hyperparameter search for an SVM: generate training
    pairs, then randomized-search over C.

    :param examples: number of generated examples
    :param iterations: number of sampled parameter settings
    """
    param_grid = {'C': sp_rand()}
    lessons_train = []
    outcomes_train = []
    # Build labeled training data: good matches -> 1, bad matches -> 0.
    for _ in tnrange(examples):
        cameras_train = eg_train.generate()
        match_id = get_match_id(cameras_train)
        goods, bads = make_good_bad(cameras_train, match_id)
        make_work(fd_train, lessons_train, outcomes_train, goods, 1)
        make_work(fd_train, lessons_train, outcomes_train, bads, 0)
    estimator = svm.SVC()
    print('searching')
    start = time.time()
    searcher = RandomizedSearchCV(estimator=estimator,
                                  param_distributions=param_grid,
                                  n_iter=iterations)
    searcher.fit(lessons_train, outcomes_train)
    end = time.time()
    print('searching took {} seconds'.format(end - start))
    print(searcher.best_score_)
    print(searcher.best_estimator_.C)
def tune_parameters(self, texts, classes):
    """Randomized search over the SGD classifier's alpha inside the
    vectorizer+classifier pipeline; logs the best score and parameters.

    :param texts: raw training documents
    :param classes: target labels
    """
    # Help:
    # http://scikit-learn.org/stable/modules/grid_search.html#tuning-the-hyper-parameters-of-an-estimator
    # http://machinelearningmastery.com/how-to-tune-algorithm-parameters-with-scikit-learn/
    self.logger.debug("Start parameter tuning...")
    # NOTE(review): SGDClassifier's `n_iter` was renamed `max_iter` in newer
    # scikit-learn — confirm the pinned version before changing it.
    model = Pipeline([
        ('vect', self.vectorizer),
        ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              n_iter=5, random_state=42, n_jobs=-1)),
    ])
    # Implement more!
    #parameters = {'clf__alpha': (1e-1, 1e-4) }
    #gs_clf = GridSearchCV(model, parameters, n_jobs=-1)
    #gs_clf = gs_clf.fit(texts, classes)
    #param_grid = {'alpha': sp_rand()}
    parameters = {'clf__alpha': sp_rand()}
    # create and fit a ridge regression model, testing random alpha values
    # use all Cores!
    # BUG FIX: n_iter was passed positionally as the 3rd argument; in current
    # scikit-learn all RandomizedSearchCV args after param_distributions are
    # keyword-only, so this raised TypeError.
    gs_clf = RandomizedSearchCV(model, parameters, n_iter=1000, n_jobs=-1)
    gs_clf = gs_clf.fit(texts, classes)
    self.logger.info("Best Score: " + str(gs_clf.best_score_))
    self.logger.info("Best fitting Parameters: ")
    for param_name in sorted(parameters.keys()):
        self.logger.info("%s: %r" % (param_name,
                                     gs_clf.best_params_[param_name]))
    self.logger.debug("Finished parameter tuning...")
# NOTE(review): this fragment begins mid-way through a results-reporting
# loop — `i`, `candidates`, `results`, `report`, `time`, `x_df`, `y_df`
# come from code not visible here; confirm against the full notebook.
for candidate in candidates:
    print("Model with rank: {0}".format(i))
    print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
        results['mean_test_score'][candidate],
        results['std_test_score'][candidate]))
    print("Parameters: {0}".format(results['params'][candidate]))
    print("")

# In[ ]:

# Randomized search over logistic-regression regularization settings.
logistic = linear_model.LogisticRegression(penalty='l2')
#the hyper-parameters include the number of features, type of regularization
#and the alpha parameter of the regularizer
param_grid = {'penalty': ['l1', 'l2'], 'C': sp_rand()}
num_iter = 2000
rand_search_cv = RandomizedSearchCV(logistic,
                                    param_distributions=param_grid,
                                    n_iter=num_iter)
start = time()
rand_search_cv.fit(x_df, y_df)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), num_iter))
report(rand_search_cv.cv_results_)
# Apply the best-found parameters to the bare estimator.
logistic.set_params(**rand_search_cv.best_params_)
#note the ** means to unpack all the iterations in the named argument
#(either in dictionary form or in named pair eg "c" = 0.1231)
"regression__n_estimators": sp_randint(100, 200), "regression__max_features": ["auto", "sqrt", "log2"], "regression__loss": ["ls", "lad", "huber", "quantile"], "regression__warm_start": [False, True], }, # GradientBoostingRegressor { "regression__n_estimators": sp_randint(50, 150), "regression__base_estimator": [DecisionTreeRegressor(), GradientBoostingRegressor(), SVR()], "regression__loss": ["linear", "square", "exponential"], }, # AdaBoostRegressor {"regression__kernel": ["linear", "poly", "rbf", "sigmoid"], "regression__C": sp_randint(1, 100)}, # SVR ] feature_sel_parameters = [ dict( feat_sel__alpha=["aic", "bic"], feat_sel__selection_threshold=sp_rand(0.15, 0.20), feat_sel__scaling=sp_rand(0.35, 0.4), feat_sel__sample_fraction=sp_rand(0.6, 0.3), ), dict(feat_sel__k=sp_randint(10, 90)), dict(feat_sel__n_estimators=sp_randint(10, 90)), dict(feat_sel__n_features_to_select=sp_randint(10, 90), feat_sel__step=sp_randint(1, 10)), ] else: raise Exception("seach_method must be grid or randomized!") models = [ RandomForestRegressor(random_state=rand_state), ExtraTreesRegressor(random_state=rand_state), GradientBoostingRegressor(random_state=rand_state), AdaBoostRegressor(random_state=rand_state), SVR(),
def optimize_params(self):
    """Randomized search over the net's learning rate, momentum, and epoch
    count using a small sample of roof images."""
    distributions = {
        "update_learning_rate": sp_rand(0.001, 0.01),
        "momentum": sp_rand(0.9, 2.0),
        "epochs": randint(50, 300),
    }
    searcher = RandomizedSearchCV(estimator=self.net,
                                  param_distributions=distributions,
                                  n_iter=15, n_jobs=1)
    X, _, y, _ = self.roof_loader.load(max_roofs=100, test_percent=0,
                                       non_roofs=self.non_roofs)
    searcher.fit(X, y)
    Experiment.report_grid(searcher.grid_scores_)
# Ridge hyperparameter demo: exhaustive grid search, then randomized search.
import numpy as np
from sklearn.linear_model import Ridge
# BUG FIX: sklearn.grid_search was removed in scikit-learn 0.20; the search
# classes live in sklearn.model_selection. Duplicate numpy/Ridge imports in
# the second half were also deduplicated.
from sklearn.model_selection import GridSearchCV

# prepare a range of alpha values to test
alphas = np.array([1, 0.1, 0.01, 0.001, 0.0001, 0])
# create and fit a ridge regression model, testing each alpha
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))
grid.fit(X, y)
print(grid)
# summarize the results of the grid search
print(grid.best_score_)
print(grid.best_estimator_.alpha)

from scipy.stats import uniform as sp_rand
from sklearn.model_selection import RandomizedSearchCV
# prepare a uniform distribution to sample for the alpha parameter
param_grid = {"alpha": sp_rand()}
# create and fit a ridge regression model, testing random alpha values
model = Ridge()
rsearch = RandomizedSearchCV(estimator=model,
                             param_distributions=param_grid,
                             n_iter=100)
rsearch.fit(X, y)
print(rsearch)
# summarize the results of the random parameter search
print(rsearch.best_score_)
print(rsearch.best_estimator_.alpha)
def main():
    """Load (or build) the MIMIC train/test pickles and Word2Vec dictionary,
    then run either a grid search or a randomized search over the selected
    model options ('d_lstm', 'd_cnn')."""
    np.random.seed(8)
    options = ['d_lstm', 'd_cnn']
    #'/home/andy/Desktop/MIMIC/temp/pretrain/...'
    # Fast path: load previously pickled splits and the saved Word2Vec model.
    try:
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/x_train.pkl',
                  'rb') as f:
            X_train = pickle.load(f)
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/x_test.pkl',
                  'rb') as f:
            X_test = pickle.load(f)
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/y_train.pkl',
                  'rb') as f:
            Y_train = pickle.load(f)
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/y_test.pkl',
                  'rb') as f:
            Y_test = pickle.load(f)
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/v_train.pkl',
                  'rb') as f:
            V_train = pickle.load(f)
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/v_test.pkl',
                  'rb') as f:
            V_test = pickle.load(f)
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/t_train.pkl',
                  'rb') as f:
            t_train = pickle.load(f)
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/t_test.pkl',
                  'rb') as f:
            t_test = pickle.load(f)
        SG = gensim.models.Word2Vec.load(
            '/home/andy/Desktop/MIMIC/temp/pretrain/SG')
        print("Training sets loaded.")
    # NOTE(review): bare `except:` — treats ANY failure (including typos /
    # KeyboardInterrupt) as "pickles missing" and rebuilds everything.
    except:
        # Slow path: rebuild the split from raw pickles and re-pickle it.
        with open('/home/andy/Desktop/MIMIC/temp/admits.pkl', 'rb') as f:
            admits = pickle.load(f)
        with open('/home/andy/Desktop/MIMIC/temp/d.pkl', 'rb') as f:
            d = pickle.load(f)
        with open('/home/andy/Desktop/MIMIC/temp/lib.pkl', 'rb') as f:
            lib = pickle.load(f)
        with open('/home/andy/Desktop/MIMIC/temp/sentences.pkl', 'rb') as f:
            sentences = pickle.load(f)
        print("Splitting dataset...")
        X_train, X_test, V_train, V_test, t_train, t_test, Y_train, Y_test = get_split(
            admits=admits, sentences=sentences, lib=lib, dz=d)
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/x_train.pkl',
                  'wb') as f:
            pickle.dump(X_train, f)
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/x_test.pkl',
                  'wb') as f:
            pickle.dump(X_test, f)
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/v_train.pkl',
                  'wb') as f:
            pickle.dump(V_train, f)
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/v_test.pkl',
                  'wb') as f:
            pickle.dump(V_test, f)
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/y_train.pkl',
                  'wb') as f:
            pickle.dump(Y_train, f)
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/y_test.pkl',
                  'wb') as f:
            pickle.dump(Y_test, f)
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/t_train.pkl',
                  'wb') as f:
            pickle.dump(t_train, f)
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/t_test.pkl',
                  'wb') as f:
            pickle.dump(t_test, f)
        print("Making Dictionary...")
        #V_test = [np.ndarray.tolist(i) for i in V_test]
        #exons = [i[2] for i in sentences if i[2] not in V_test]
        del sentences
        V_train = [np.ndarray.tolist(i) for i in V_train]
        #Do NOT forget the previous step; it is very important to convert sentence to regular python list... otherwise it'll take forever.
        #SG = gensim.models.Word2Vec(sentences = exons, sg = 1, size = 300, window = 10, min_count = int(len(exons)*.001), hs = 1, negative = 0)
        SG = gensim.models.Word2Vec(sentences=V_train, sg=1, size=300,
                                    window=10, hs=1, negative=0)
        print("...saving dictionary...")
        SG.save("/home/andy/Desktop/MIMIC/temp/pretrain/SG")
    #fixed embedding layers
    weights = SG.wv.syn0
    vocab = dict([(k, v.index) for k, v in SG.wv.vocab.items()])
    w2i, i2w = vocab_index(vocab)
    # Map each token to its vocab index (0 for out-of-vocabulary tokens).
    w_train = [
        list(map(lambda i: w2i[i] if i in w2i.keys() else 0, vv))
        for vv in V_train
    ]
    # NOTE(review): `random` shadows the stdlib module name; it is a local
    # flag choosing randomized search (True) vs grid search (False).
    random = True
    if random == False:
        # Grid-search branch (currently disabled by the flag above).
        Data = []
        preset = {'optimizer': 'Adam', 'learn_rate': .005}
        optimizer = ['SGD', 'RMSprop', 'Adam']
        learn_rate = [.0001, .0005, .001, .005, .01]
        #momentum = np.arange(.5, .9, .1).tolist()
        #neurons = [100]
        dropout_W = [0.001, .01, .1, .2, .4]
        dropout_U = [0.001, .01, .1, .2, .4]
        #W_regularizer = [l1(.0001), l1(.001), l1(.01), l2(.0001), l2(.001), l2(.01), None]
        #U_regularizer = [l1(.0001), l1(.001), l1(.01), l2(.0001), l2(.001), l2(.01), None]
        init_mode = [
            'uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal',
            'glorot_uniform', 'he_normal', 'he_uniform'
        ]
        for o in options:
            t1 = TIME.time()
            param_grid = dict(learn_rate=learn_rate)
            data = grid_search(x=X_train, y=Y_train, v=V_train, t=t_train,
                               SG=SG, option=o, nb_epoch=20, cv=3, n_jobs=1,
                               param_grid=param_grid, preset=preset)
            with open(
                    "/home/tangfeng/MIMIC/results/randgrid_" + str(o) +
                    ".pkl", 'wb') as f:
                pickle.dump(data, f)
            print("Pickle successful!")
            t2 = TIME.time()
            print("Training completed in " + str((t2 - t1) / 3600) + " hours")
            Data += data
        # Data = pd.DataFrame([pd.Series(dd) for dd in Data])
        with open("/home/tangfeng/MIMIC/results/gridsearch.pkl", 'wb') as f:
            pickle.dump(Data, f)
        print("Done.")
    else:
        # Randomized-search branch: distributions instead of value lists.
        Data = []
        optimizer = ['SGD', 'RMSprop', 'Adam']
        learn_rate = sp_rand(.0001, .01)
        momentum = sp_rand(.5, .9)
        dropout_W = sp_rand(0, .5)
        dropout_U = sp_rand(0, .5)
        #W_regularizer = [l1(.0001), l1(.001), l1(.01), l2(.0001), l2(.001), l2(.01), None]
        #U_regularizer = [l1(.0001), l1(.001), l1(.01), l2(.0001), l2(.001), l2(.01), None]
        init_mode = [
            'uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal',
            'glorot_uniform', 'he_normal', 'he_uniform'
        ]
        for o in options:
            preset = {}
            # Per-option parameter spaces; deep models get the full space.
            if o == 'lr':
                param_grid = {
                    'C': sp_rand(.0001, 1000),
                    'penalty': ('l1', 'l2')
                }
            elif o == 'svm':
                param_grid = {'C': sp_rand(.0001, 1000)}
            elif o == 'rf':
                param_grid = {
                    'criterion': ['gini', 'entropy'],
                    'n_estimators': sp_randint(10, 50),
                    'bootstrap': [True, False]
                }
            else:
                param_grid = dict(optimizer=optimizer,
                                  learn_rate=learn_rate,
                                  momentum=momentum,
                                  dropout_W=dropout_W,
                                  dropout_U=dropout_U,
                                  init_mode=init_mode)
            # Pre-trained embeddings stay frozen for the 'd_*' options.
            if o == 'd_lstm' or o == 'd_cnn':
                trainable = False
            else:
                trainable = True
            t1 = TIME.time()
            data = random_search(x=X_train, y=Y_train, v=w_train, t=t_train,
                                 weights=weights, option=o, nb_epoch=16,
                                 cv=3, n_jobs=1, param_grid=param_grid,
                                 preset=preset, n_iter=40,
                                 trainable=trainable)
            t2 = TIME.time()
            with open(
                    "/home/andy/Desktop/MIMIC/results/random_" + str(o) +
                    ".pkl", 'wb') as f:
                pickle.dump(data, f)
            print("Pickle successful!")
            print("Training completed in " + str((t2 - t1) / 3600) + " hours")
            Data += data
        with open("/home/andy/Desktop/MIMIC/results/randomsearch.pkl",
                  'wb') as f:
            pickle.dump(Data, f)
        print("Done.")
# NOTE(review): this fragment begins mid-way through a NeuralNet(...) style
# constructor call — its opening (and `net1`, `num_features`, `num_classes`,
# `X`, `y`, `X_test`, `ids`) are outside this view. `print` without parens
# below is Python-2 syntax; the snippet predates Python 3.
    input_shape=(None, num_features),
    hidden_num_units=200,  # number of units in hidden layer #!200-600
    output_nonlinearity=lasagne.nonlinearities.softmax,  # output layer
    output_num_units=num_classes,  # 10 target values
    dropout_p=0.2,  #!dropout 0.2-0.7
    # optimization method:
    update=nesterov_momentum,
    update_learning_rate=0.01,  #!0.001-0.01
    update_momentum=0.9,  #!0.6-0.9
    regression=False,  # flag to indicate we're dealing with regression problem
    max_epochs=500,  # we want to train this many epochs
    verbose=1,
)

# Randomized search over the annotated (#!) hyperparameter ranges above.
random_search = RandomizedSearchCV(net1, {
    'hidden_num_units': sp_randint(200, 600),
    "dropout_p": sp_rand(0.2, 0.7),
    "update_learning_rate": sp_rand(0.001, 0.01),
    "update_momentum": sp_rand(0.6, 0.9),
})
random_search.fit(X, y)
print random_search.grid_scores_
# Probability of the positive class for the submission file.
preds = random_search.predict_proba(X_test)[:, 1]
submission = pd.DataFrame(preds, index=ids, columns=['target'])
submission.to_csv('Keras_BTB.csv')
def set_hyperparameters(self):
    """Populate ``self.p_grid``: only the inverse-regularization strength C."""
    self.p_grid = dict(C=sp_rand())
def Random_classifier(a, b):
    """Fit a suite of classifiers via randomized/grid search and return the best.

    :param a: feature matrix (X) passed to each search's ``fit``.
    :param b: target vector (y).
    :return: dict with keys ``'Best_Estimator'`` and ``'Accuracy'`` for the
        highest-scoring search that fitted successfully.
    :raises ValueError: if every candidate search failed to fit.
    """
    # Continuous / integer hyperparameter distributions.  The categorical
    # choices (criterion_list_TREE, solver_LOG, ...) are module-level
    # globals defined elsewhere in this file.
    N_range_KNN = sp_randint(1, 31)
    C_LOGSVM = sp_rand()
    n_estimators_RF = sp_randint(10, 10000)
    hidden_layer_sizes_MLP = sp_randint(100, 1000)
    gamma_range_Random_SVM = sp_rand()
    alpha_NB = sp_rand()

    params_Random_KNN = dict(n_neighbors=N_range_KNN)
    params_Random_tree = dict(criterion=criterion_list_TREE,
                              splitter=splitter_TREE,
                              min_samples_split=min_samples_split_TRF,
                              max_features=max_features_TRF)
    params_Random_SGD = dict(loss=loss_list_SGD, penalty=penalty_list_SGD)
    params_Random_LOG = dict(penalty=penalty_LOG, C=C_LOGSVM,
                             solver=solver_LOG, multi_class=multi_class_LOG)
    params_Random_RF = dict(n_estimators=n_estimators_RF,
                            criterion=criterion_RF,
                            min_samples_split=min_samples_split_TRF,
                            max_features=max_features_TRF)
    params_Random_MLP = dict(hidden_layer_sizes=hidden_layer_sizes_MLP,
                             activation=activation_MLP, solver=solver_MLP,
                             learning_rate=learning_rate_MLP)
    params_Random_SVM = dict(gamma=gamma_range_Random_SVM, C=C_LOGSVM,
                             kernel=kernel_list_SVM)
    params_Random_NB = dict(alpha=alpha_NB)
    params_Random_GB = dict(loss=loss_GB, criterion=criterion_GB,
                            min_samples_split=min_samples_split_TRF)
    params_Random_ADA = dict(algorithm=algorithm_ADA)
    # NOTE(review): XGBClassifier has no `C` parameter -- this search most
    # likely fails on fit and is silently skipped below; confirm intent.
    params_Random_XG = dict(C=C_XG)
    params_Random_BAG = dict(max_samples=max_samples_BAG,
                             max_features=max_features_TRF)
    params_Random_LDA = dict(solver=solver_LDA)
    params_Random_ET = dict(criterion=criterion_list_TREE,
                            max_features=max_features_TRF,
                            min_samples_split=min_samples_split_TRF)

    # Grids containing distributions go through RandomizedSearchCV; the
    # purely categorical ones are searched exhaustively with GridSearchCV.
    searches = [
        RandomizedSearchCV(KNeighborsClassifier(), params_Random_KNN),
        RandomizedSearchCV(DecisionTreeClassifier(), params_Random_tree),
        RandomizedSearchCV(SGDClassifier(), params_Random_SGD),
        RandomizedSearchCV(LogisticRegression(), params_Random_LOG),
        RandomizedSearchCV(RandomForestClassifier(), params_Random_RF),
        RandomizedSearchCV(MLPClassifier(), params_Random_MLP),
        RandomizedSearchCV(SVC(), params_Random_SVM),
        RandomizedSearchCV(BernoulliNB(), params_Random_NB),
        GridSearchCV(GradientBoostingClassifier(), params_Random_GB),
        GridSearchCV(AdaBoostClassifier(), params_Random_ADA),
        GridSearchCV(XGBClassifier(), params_Random_XG),
        GridSearchCV(BaggingClassifier(), params_Random_BAG),
        GridSearchCV(LinearDiscriminantAnalysis(), params_Random_LDA),
        GridSearchCV(ExtraTreesClassifier(), params_Random_ET),
    ]

    list_Random = []
    for search in searches:
        # Best-effort sweep: one model failing to fit must not abort the
        # others.  Narrowed from the original bare `except: pass` so that
        # KeyboardInterrupt/SystemExit still propagate.
        try:
            search.fit(a, b)
        except Exception:
            continue
        list_Random.append({'Best_Estimator': search.best_estimator_,
                            'Accuracy': search.best_score_})

    # Fail with a clear message instead of max()'s opaque
    # "max() arg is an empty sequence" when every fit failed.
    if not list_Random:
        raise ValueError("Random_classifier: every candidate model failed to fit")
    return max(list_Random, key=lambda entry: entry['Accuracy'])
'max_depth': 6, 'gamma': 2.213, 'learning_rate': 0.273, 'max_delta_step': 1.444, 'subsample': 0.847 } num_round = 75 plst = param.items() # specify validations set to watch performance watchlist = [(dcv, 'eval'), (dtrain, 'train')] bst_search = xgb.XGBClassifier() clf = RandomizedSearchCV(bst_search, { 'max_depth': sp_randint(1, 13), 'learning_rate': sp_rand(0, 1), 'gamma': sp_rand(0, 3), 'subsample': sp_rand(0, 1), 'max_delta_step': sp_rand(0, 3), 'n_estimators': [ 5, 10, 15, 20, 25, 30, 35, 40,
def random_search(x, y, v, t, weights,
                  top_words=9444,
                  max_review_length=1000,
                  embedding_length=300,
                  batch_size=128,
                  nb_epoch=16,
                  cv=3,
                  n_jobs=1,
                  option='d_cnn',
                  param_grid=None,
                  preset=None,
                  n_iter=40,
                  trainable=False):
    """Randomized hyperparameter search with stratified k-fold CV.

    Samples ``n_iter`` hyperparameter settings from ``param_grid`` and
    evaluates each with ``cv``-fold cross-validation over one of:
    Keras LSTM/CNN models ('lstm', 'd_lstm', 'cnn', 'd_cnn') or classic
    sklearn models ('lr', 'svm', 'rf').

    :param x, v: token-id sequences; padded to ``max_review_length``.
    :param y: labels.
    :param t: per-sample sequences of timedeltas used to build
        exponential time-decay factors.
    :param weights: pretrained embedding matrix, used when ``trainable``
        is False (its shape supplies top_words / embedding_length).
    :param param_grid: name -> list of choices OR scipy frozen
        distribution (anything with ``.rvs``).
    :param preset: fixed hyperparameters merged into every sampled
        setting (not mutated; a private copy is used).
    :param n_iter: number of sampled settings ("sessions").
    :return: list of dicts, one per session, containing the sampled
        params plus 'model', 'decay', 'mean_score' and 'std'.
    :raises ValueError: if ``option`` is not one of the known models
        (the original code died later with a NameError instead).
    """
    # FIX: the original used mutable default arguments ({}) that were
    # mutated below and therefore shared/accumulated across calls.
    param_grid = {} if param_grid is None else param_grid
    # Copy so the caller's dict is not clobbered by the updates below.
    preset = dict(preset) if preset is not None else {}

    x = sequence.pad_sequences(x, maxlen=max_review_length)
    v = sequence.pad_sequences(v, maxlen=max_review_length)
    data = []
    # Convert to numpy form before splitting / fancy indexing.
    x = np.array(x)
    v = np.array(v)
    y = np.array(y)
    t = np.array(t)

    for sess in range(n_iter):
        print("Session: {0}".format(sess))
        # Sample one value per grid entry: sequences via random.choice,
        # scipy frozen distributions via .rvs() (choice raises TypeError
        # on objects without len(), which the original caught bare).
        for key, value in param_grid.items():
            try:
                preset.update({key: random.choice(value)})
            except TypeError:
                preset.update({key: value.rvs(1)[0]})

        # Random exponential decay rate; one decay factor per timestep,
        # with elapsed time expressed in hours.
        decay = sp_rand(0, 0.00001).rvs(1)[0]
        decay_factors = [[math.exp(-1 * decay * elapse.total_seconds() / 3600)
                          for elapse in tt] for tt in t]
        decay_factors = sequence.pad_sequences(decay_factors,
                                               maxlen=max_review_length)
        shape = decay_factors.shape
        decay_factors = decay_factors.reshape(shape[0], shape[1], 1)

        if option in ('lstm', 'd_lstm'):
            if trainable:
                preset.update({'top_words': top_words,
                               'max_length': max_review_length,
                               'embedding_length': embedding_length})
            else:
                preset.update({'top_words': weights.shape[0],
                               'max_length': max_review_length,
                               'embedding_length': weights.shape[1]},
                              trainable=False, weights=weights)
            model = lstm_train(**preset)
            classic = False
        elif option in ('cnn', 'd_cnn'):
            if trainable:
                preset.update({'top_words': top_words,
                               'max_length': max_review_length,
                               'embedding_length': embedding_length},
                              trainable=True)
            else:
                preset.update({'top_words': weights.shape[0],
                               'max_length': max_review_length,
                               'embedding_length': weights.shape[1]},
                              trainable=False, weights=weights)
            model = cnn_train(**preset)
            classic = False
        elif option == 'lr':
            preset.update({'verbose': 1})
            model = LogisticRegression(**preset)
            classic = True
        elif option == 'svm':
            preset.update({'verbose': True})
            model = SVC(**preset)
            classic = True
        elif option == 'rf':
            preset.update({'verbose': 1})
            model = RandomForestClassifier(**preset)
            classic = True
        else:
            # FIX: fail fast with a clear error; the original fell through
            # and raised NameError on `classic` below.
            raise ValueError("random_search: unknown option {0!r}".format(option))

        print(preset)
        skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=8)
        cvscore = []
        for train, test in skf.split(x, y):
            x_train, x_test = x[train], x[test]
            y_train, y_test = y[train], y[test]
            t_train, t_test = t[train], t[test]
            v_train, v_test = v[train], v[test]
            if classic:
                # Classic models consume decay-normalized count features.
                model.fit(decay_norm(x=v_train, t_stamps=t_train, decay=decay),
                          y_train)
                score = model.score(
                    decay_norm(x=v_test, t_stamps=t_test, decay=decay),
                    y_test)
                print("%s: %.2f%%" % ("accuracy", score * 100))
                cvscore.append(score * 100)
            elif trainable:
                model.fit(x_train, y_train,
                          batch_size=batch_size,
                          nb_epoch=nb_epoch,
                          verbose=1)
                score = model.evaluate(x_test, y_test,
                                       batch_size=batch_size,
                                       verbose=1)
                print("%s: %.2f%%" % (model.metrics_names[1], score[1] * 100))
                cvscore.append(score[1] * 100)
            else:
                # NOTE(review): decay_factors is NOT subset by `train` here,
                # and evaluate() receives x_test rather than
                # [v_test, decay_factors[test]] -- this looks inconsistent
                # with the two-input fit; confirm against the model builders.
                model.fit(x=[v_train, decay_factors],
                          y=y_train,
                          batch_size=batch_size,
                          nb_epoch=nb_epoch,
                          verbose=1)
                score = model.evaluate(x_test, y_test,
                                       batch_size=batch_size,
                                       verbose=1)
                print("%s: %.2f%%" % (model.metrics_names[1], score[1] * 100))
                cvscore.append(score[1] * 100)

        # Record this session's sampled setting and its CV statistics.
        temp = {'model': option, 'decay': decay}
        temp.update(preset)
        temp.update({'mean_score': np.mean(cvscore), 'std': np.std(cvscore)})
        data.append(temp)
    return (data)
input_shape=(None, num_features), hidden_num_units=200, # number of units in hidden layer #!200-600 output_nonlinearity=lasagne.nonlinearities.softmax, # output layer output_num_units=num_classes, # 10 target values dropout_p=0.2, #!dropout 0.2-0.7 # optimization method: update=nesterov_momentum, update_learning_rate=0.01, #!0.001-0.01 update_momentum=0.9, #!0.6-0.9 regression= False, # flag to indicate we're dealing with regression problem max_epochs=500, # we want to train this many epochs verbose=1, ) random_search = RandomizedSearchCV( net1, { 'hidden_num_units': sp_randint(200, 600), "dropout_p": sp_rand(0.2, 0.7), "update_learning_rate": sp_rand(0.001, 0.01), "update_momentum": sp_rand(0.6, 0.9), }) random_search.fit(X, y) print random_search.grid_scores_ preds = random_search.predict_proba(X_test)[:, 1] submission = pd.DataFrame(preds, index=ids, columns=['target']) submission.to_csv('Keras_BTB.csv')
def set_hyperparameters(self):
    """Build the AdaBoost randomized-search grid on ``self.p_grid``."""
    grid = {}
    # Integer distribution over ensemble size.
    grid['n_estimators'] = sp_randint(1, 300)
    # learning_rate ~ Uniform(0, 1): sp_rand() defaults to loc=0, scale=1.
    grid['learning_rate'] = sp_rand()
    # Categorical choice between the discrete and real boosting variants.
    grid['algorithm'] = ['SAMME', 'SAMME.R']
    self.p_grid = grid
# Raw feature matrix for the held-out test set, and the original bidder ids
# recovered through inv_bid_id_dict (both built earlier in this script).
test_data = df_test_df.values
test_ids = df_test['bidder_id'].map(lambda x: inv_bid_id_dict[x]).values
# Wrap the train / cv / test splits in XGBoost's DMatrix container.
dtrain = xgb.DMatrix( train_data, label=outcome_train)
dcv = xgb.DMatrix( cv_data, label=outcome_cv)
dtest = xgb.DMatrix( test_data, )
# Hand-tuned booster parameters used by the final xgb.train call below.
param = {'objective':'binary:logistic','eval_metric':'auc','max_depth':6, 'gamma': 2.213, 'learning_rate':0.273,'max_delta_step': 1.444, 'subsample': 0.847}
num_round = 75
# NOTE(review): on Python 3 .items() returns a view; xgb.train also accepts
# the dict itself, so this works, but list(param.items()) would be clearer.
plst = param.items()
# specify validations set to watch performance
watchlist = [(dcv,'eval'), (dtrain,'train')]
# Randomized search over the sklearn-style XGBClassifier wrapper:
# 1000 sampled settings, scored by ROC AUC with 4-fold CV.
bst_search = xgb.XGBClassifier()
clf = RandomizedSearchCV(bst_search, {'max_depth': sp_randint(1,13), 'learning_rate':sp_rand(0,1), 'gamma':sp_rand(0,3), 'subsample':sp_rand(0,1),'max_delta_step':sp_rand(0,3), 'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40, 50, 75, 100, 125, 150,200,]}, verbose=1, n_jobs=2, cv = 4, scoring='roc_auc', n_iter = 1000)
clf.fit(train_data, outcome_train)
print('best clf score',clf.best_score_)
print('best params:', clf.best_params_)
# Train the final booster with the hand-tuned `param` settings.
# NOTE(review): clf.best_params_ found above are printed but never fed into
# this final model -- confirm whether that is intentional.
bst = xgb.train(plst, dtrain, num_round, watchlist)
# this is prediction
preds = bst.predict(dcv)
pred_test = bst.predict(dtest)
labels = dcv.get_label()
# Misclassification rate on the cv split at a 0.5 probability threshold.
print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))
# Header for the feature-importance table emitted by the code that follows.
print('{0:<25} {1:>5}'.format('Feature','Importance'))
print("--------------------------------------")
# Confusion matrix for the model evaluated just above this chunk.
print(metrics.confusion_matrix(expected, predicted))
# Section banner (Russian for "=== Optimizing algorithm parameters ===").
# This is a Python 2 print statement -- the script targets Python 2.
print '=== Оптимизация параметров алгоритма ==='
# prepare a range of alpha values to test
alphas = np.array([1,0.1,0.01,0.001,0.0001,0])
# create and fit a ridge regression model, testing each alpha
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))
grid.fit(X, y)
print(grid)
# summarize the results of the grid search
print(grid.best_score_)
print(grid.best_estimator_.alpha)
# prepare a uniform distribution to sample for the alpha parameter
# (sp_rand() with no arguments is Uniform(0, 1))
param_grid = {'alpha': sp_rand()}
# create and fit a ridge regression model, testing random alpha values
model = Ridge()
rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100)
rsearch.fit(X, y)
print(rsearch)
# summarize the results of the random parameter search
print(rsearch.best_score_)
print(rsearch.best_estimator_.alpha)
'regression__n_estimators': sp_randint(50, 150), 'regression__base_estimator': [DecisionTreeRegressor(), GradientBoostingRegressor(), SVR()], 'regression__loss': ['linear', 'square', 'exponential'] }, # AdaBoostRegressor { 'regression__kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'regression__C': sp_randint(1, 100) } # SVR ] feature_sel_parameters = [ dict(feat_sel__alpha=['aic', 'bic'], feat_sel__selection_threshold=sp_rand(.15, .20), feat_sel__scaling=sp_rand(.35, .4), feat_sel__sample_fraction=sp_rand(.6, .3)), dict(feat_sel__k=sp_randint(10, 90)), dict(feat_sel__n_estimators=sp_randint(10, 90)), dict(feat_sel__n_features_to_select=sp_randint(10, 90), feat_sel__step=sp_randint(1, 10)) ] else: raise Exception('seach_method must be grid or randomized!') models = [ RandomForestRegressor(random_state=rand_state), ExtraTreesRegressor(random_state=rand_state), GradientBoostingRegressor(random_state=rand_state), AdaBoostRegressor(random_state=rand_state), SVR()