def grid_search(X,y):
    """Grid-search C and gamma for an RBF-kernel SVC using 10-fold CV.

    Args:
        X: training features handed to ``GridSearchCV.fit``.
        y: training targets handed to ``GridSearchCV.fit``.

    Returns:
        A ``(C, gamma)`` tuple read from the fitted search's ``best_params_``.
    """
    search_space = {
        'C': [0.001, 0.01, 0.1, 1, 10],
        'gamma': [0.001, 0.01, 0.1, 1],
    }
    searcher = GridSearchCV(SVC(kernel='rbf'), search_space, cv=10)
    searcher.fit(X, y)
    winner = searcher.best_params_
    return winner.get('C'), winner.get('gamma')
def train_model(feature_data, target_data, search_parameters):
    """
    Grid-search a scale -> select-percentile -> ExtraTrees pipeline.

    The classifier is an ExtraTreesClassifier with bootstrap sampling and the
    entropy split criterion:
    http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html

    :param feature_data: feature matrix passed to ``fit``
    :param target_data: target labels; also used to build the stratified folds
    :param search_parameters: parameter grid forwarded to ``GridSearchCV``
    :return: the fitted ``GridSearchCV`` object
    """
    # F-beta (beta=1) scorer whose positive label is the most frequent target.
    majority_label = pandas.Series(target_data).value_counts().idxmax()
    f_scorer = sklearn.metrics.make_scorer(
        sklearn.metrics.fbeta_score, beta=1.0, pos_label=majority_label)

    # Stratified 10-fold split built from the targets.
    folds = sklearn.cross_validation.StratifiedKFold(target_data, n_folds=10)

    steps = [
        ('scale', sklearn.preprocessing.StandardScaler()),
        ('feature_select',
         sklearn.feature_selection.SelectPercentile(sklearn.feature_selection.f_classif)),
        ('classify',
         sklearn.ensemble.ExtraTreesClassifier(bootstrap=True, criterion='entropy')),
    ]
    searcher = sklearn.grid_search.GridSearchCV(
        sklearn.pipeline.Pipeline(steps),
        search_parameters,
        cv=folds,
        verbose=0,
        n_jobs=2,
        scoring=f_scorer,
    )
    searcher.fit(feature_data, target_data)
    return searcher
def optimize_svm(train_data_path='training-data-small.txt.bz2'):
    """Run grid search to determine best C or gamma for svm

    Two candidate families are searched: an RBF kernel over C and gamma, and
    a linear kernel over C only. C takes the powers of two 2**0 .. 2**9 and
    gamma takes 2**-8 .. 2**-3.5 in half-exponent steps.

    :type train_data_path: str
    :params train_data_path: the path to training data file

    :type return: dict
    :params return: best params obtained by grid search
    """
    from sklearn import grid_search
    from sklearn import metrics

    # load training data
    features, labels = load_data(data_path=train_data_path)

    c_values = [2**exp for exp in range(0, 10, 1)]             # 1 <= C <= 1000
    gamma_values = [2**exp for exp in np.arange(-8, -3, 0.5)]  # 0 < gamma <= 0.1
    candidates = [
        {'C': c_values, 'gamma': gamma_values, 'kernel': ['rbf']},
        {'C': list(c_values), 'kernel': ['linear']},
    ]

    searcher = grid_search.GridSearchCV(SVC(), candidates, scoring='f1', n_jobs=9)
    searcher.fit(features, labels)
    return searcher.best_params_
def gradient_boosting_exp(X, y, data, split_iterator, base_classifier=None):
    """Tune a GradientBoostingClassifier and print its scores and importances.

    Args:
        X: feature matrix passed to ``fit``.
        y: targets passed to ``fit``.
        data: frame whose columns, minus "IsBlueWinner", name the features
            in the importance report.
        split_iterator: cross-validation splitter forwarded as ``cv``.
        base_classifier: optional estimator used as the booster's ``init``.

    Returns:
        None; results are printed.
    """
    # Each list holds a single value, so the search fits one configuration.
    # Ranges explored previously:
    #   learning_rate:     numpy.linspace(0.1, 0.9, 5)
    #   min_samples_leaf:  numpy.linspace(5, 40, 5).astype(int)
    #   min_samples_split: numpy.linspace(20, 100, 5).astype(int)
    #   n_estimators:      100, 200, etc (default = 100)
    #   max_depth:         2, 3, 4, 5 (default = 3)
    #   subsample:         0.5, 0.8, 0.9, 1. (default = 1)
    hyperparameter_space = {
        "learning_rate": [0.2167],
        "min_samples_leaf": [10],
        "n_estimators": [300],
        "subsample": [0.9],
    }

    model = sklearn.ensemble.GradientBoostingClassifier(init=base_classifier)
    grid_search = sklearn.grid_search.GridSearchCV(
        model, hyperparameter_space, n_jobs=N_JOBS, cv=split_iterator, verbose=1)
    grid_search.fit(X, y)

    # Compare against None explicitly: truth-testing an estimator object can
    # call its __len__ (ensembles define one), which is not the question here.
    if base_classifier is not None:
        print("Gradient boosting with base classifier")
    else:
        print("Gradient boosting classifier")

    print_tuning_scores(grid_search)
    print_feature_importances(data.drop("IsBlueWinner", axis=1).columns,
                              grid_search.best_estimator_)
def svc_param_selection(self,X, y, nfolds):
    """Grid-search C and gamma for an RBF-kernel SVC.

    Args:
        X: training features passed to ``GridSearchCV.fit``.
        y: training targets passed to ``GridSearchCV.fit``.
        nfolds: value forwarded as the ``cv`` argument.

    Returns:
        The ``best_params_`` of the fitted search.
    """
    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    param_grid = {'C': Cs, 'gamma': gammas}
    grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid,
                               cv=nfolds, n_jobs=4, verbose=1)
    grid_search.fit(X, y)
    return grid_search.best_params_
def get_svm_param(X, y, nfolds):
    """Tune C and gamma of an RBF-kernel SVC by exhaustive grid search.

    ``nfolds`` is forwarded as the ``cv`` argument; the fitted search's
    ``best_params_`` is returned.
    """
    searcher = GridSearchCV(
        svm.SVC(kernel='rbf'),
        {
            'C': [0.001, 0.01, 0.1, 1, 10],
            'gamma': [0.001, 0.01, 0.1, 1],
        },
        cv=nfolds,
    )
    searcher.fit(X, y)
    return searcher.best_params_
def svc_paramter_selection(X, y, nfolds):
    """Grid-search C and gamma for an RBF-kernel SVC.

    Args:
        X: training features.
        y: training targets.
        nfolds: value forwarded as ``cv`` to ``GridSearchCV``.

    Returns:
        The ``best_params_`` of the fitted search.
    """
    cs = [0.001, .01, 0.1, 1, 10]
    gammas = [.001, .01, .1, 1]
    param_grid = {'C': cs, 'gamma': gammas}
    grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    return grid_search.best_params_
def svc_param_selection(X, y, nfolds):
    """Grid-search C and gamma for an RBF-kernel SVC.

    Args:
        X: training features.
        y: training targets.
        nfolds: value forwarded as ``cv`` to ``GridSearchCV``.

    Returns:
        A ``(fitted_search, best_params)`` tuple.
    """
    Cs = [0.001, 0.01, 0.1, 1, 1.1, 2, 3, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    # Only the RBF kernel is searched; the kernel is fixed on the estimator.
    param_grid = {'C': Cs, 'gamma': gammas}
    grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    return grid_search, grid_search.best_params_
def svc_param_selection(X, y, nfolds):
    """Grid-search C and gamma for a sigmoid-kernel SVC.

    Args:
        X: training features.
        y: training targets.
        nfolds: value forwarded as ``cv`` to ``GridSearchCV``.

    Returns:
        A ``(best_params, cv_results)`` tuple from the fitted search.
    """
    Cs = [0.1, 1, 10, 100]
    gammas = [0.001, 0.01, 0.1]
    param_grid = {'C': Cs, 'gamma': gammas}
    grid_search = GridSearchCV(svm.SVC(kernel='sigmoid'), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    return grid_search.best_params_, grid_search.cv_results_
def validation(data, target, constant):
    """Tune a polynomial-kernel NuSVR and report cross-validated RMSE.

    A grid search over ``C`` and ``nu`` picks the best estimator, which is
    then re-fitted on ``CVSize`` contiguous hold-out folds to compute an
    average RMSE, and finally on the whole data set.

    Args:
        data: feature matrix (sliced by row and stacked with ``np.vstack``).
        target: target values (sliced and joined with ``np.append``).
        constant: unused by this function.

    Returns:
        The mean RMSE over the ``CVSize`` manual folds.
    """
    regressor = svm.NuSVR(kernel="poly")
    param_grid = {
        'C': np.linspace(20.0, 40.0, 10),
        'nu': np.linspace(0.0001, 1, 5)
    }
    # greater_is_better=False makes the search minimise mean squared error.
    mse_scorer = sklearn.metrics.make_scorer(
        sklearn.metrics.mean_squared_error, greater_is_better=False)
    grid_search = sklearn.grid_search.GridSearchCV(
        regressor, param_grid, scoring=mse_scorer, cv=5, n_jobs=-1)
    grid_search.fit(data, target)
    clf = grid_search.best_estimator_
    print(clf)

    # Floor division: the fold boundaries are used as slice indices, which
    # must be integers (true division yields a float on Python 3).
    chunk_size = len(data) // CVSize
    score = 0
    for x in range(CVSize):
        # Boundaries of the hold-out chunk for this fold
        first_step = x * chunk_size
        second_step = (x + 1) * chunk_size

        # Train on everything outside the hold-out chunk
        cross_data = np.vstack((data[:first_step], data[second_step:]))
        cross_target = np.append(target[:first_step], target[second_step:])
        clf.fit(cross_data, cross_target)

        # Score on the hold-out chunk
        sample_data = data[first_step:second_step]
        sample_target = target[first_step:second_step]
        pred = clf.predict(sample_data)
        RMSE = mean_squared_error(sample_target, pred)**0.5
        score += RMSE

    score = score / CVSize
    print("Cross-Validation RMSE: {} ".format(score))

    # Training-set score after fitting on all data
    clf.fit(data, target)
    pred = clf.predict(data)
    RMSE = mean_squared_error(target, pred)**0.5
    print("RMSE on whole dataset {}".format(RMSE))
    return score
def main():
    """Build the training tables, grid-search a TF-IDF + SVC pipeline, report.

    Reads the input CSV files, writes two intermediate CSV files, trains a
    grid search that predicts ``userName`` and prints the accuracy, confusion
    matrix and classification report on a 40% hold-out split.
    """
    # Data pre-processing: join the username table and the service log table
    df1 = pd.read_csv("NewForm1.csv")
    df2 = pd.read_csv("serviceExecutionLog_dataset2.csv")
    df3 = pd.merge(df1, df2, on=['userName', 'executionStartTime'], how='left')
    # Uppercase transformation
    df3['model'] = df3['model'].map(str.upper)
    df3.to_csv("NewForm1WithExecutionTime.csv")

    # Data pre-processing: load the climate data set table used for training
    df4 = pd.read_csv("/Users/dennis/Documents/SVM-Tasks/Climate_Datasets.csv")
    # Encoding: grouping
    df4['Dataset Group'] = df4['Dataset Group'].map(datasetgrouping)
    # Duplicate & fillna
    df4['userName'] = df4['userName'].fillna('Unknown')
    df4['Users Group'] = df4['userName']
    df4['Users Group'] = df4['Users Group'].map(usergrouping)
    df4.to_csv("FeaturesForTrain.csv")

    # The vectoriser needs one text document per sample, so every row is
    # collapsed into a single whitespace-joined string of its stripped cells.
    # The label column and the column derived from it are left out so the
    # target does not leak into the features.
    # NOTE(review): confirm that excluding these two columns matches intent.
    feature_frame = df4.drop(['userName', 'Users Group'], axis=1).astype(str)
    X = feature_frame.apply(
        lambda row: ' '.join(cell.strip() for cell in row), axis=1)
    y = df4['userName'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

    # SVC accepts a single kernel name, so the kernels to compare are searched
    # through the parameter grid rather than passed as a list.
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(stop_words='english', lowercase=False)),
        ('clf', SVC(gamma=0.01, C=100, max_iter=100)),
    ])

    # Check the training data shape
    print(X_train.shape)

    parameters = {
        'clf__kernel': ('rbf', 'linear'),
        'clf__gamma': (0.01, 0.02, 0.1, 0.3, 1),
        'clf__C': (0.1, 0.3, 1, 3, 10, 30),
    }
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=3, verbose=1,
                               scoring='accuracy')
    # Keep this fitted object: re-creating the search would discard the fit.
    grid_search.fit(X_train, y_train)

    predictions = grid_search.predict(X_test)

    # Single-argument print calls give the same output on Python 2 and 3.
    print('Accuracy: {}'.format(accuracy_score(y_test, predictions)))
    print('Confusion Matrix: {}'.format(confusion_matrix(y_test, predictions)))
    print('Classification Report: {}'.format(
        classification_report(y_test, predictions)))
def svc_param_selection(self, X, y, nfolds):
    """Grid-search C and gamma for an RBF-kernel SVC and print the winner.

    Args:
        X: training features.
        y: training targets.
        nfolds: value forwarded as ``cv`` to ``GridSearchCV``.

    Returns:
        The ``best_params_`` of the fitted search.
    """
    Cs = [0.001, 0.01, 0.1, 1, 10, 100]
    gammas = [0.001, 0.01, 0.1, 1, 10]
    param_grid = {'C': Cs, 'gamma': gammas}
    grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    print(grid_search.best_params_)
    return grid_search.best_params_
def svc_param_selection(X, y, nfolds):
    """Grid-search alpha, coef0 and degree for a polynomial KernelRidge.

    Despite the name, the estimator being tuned is ``KernelRidge``.

    Args:
        X: training features.
        y: training targets.
        nfolds: value forwarded as ``cv`` to ``GridSearchCV``.

    Returns:
        The ``best_params_`` of the fitted search.
    """
    a = [0.4, 0.5, 0.6, 0.7]
    c = [1, 1.5, 2, 2.5, 3]
    d = [1, 2, 3]
    param_grid = {'alpha': a, 'coef0': c, 'degree': d}
    grid_search = GridSearchCV(KernelRidge(kernel='polynomial'), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    return grid_search.best_params_
def validation(data, target, constant):
    """Fit a LassoLarsCV regressor and report cross-validated RMSE.

    The regressor goes through a grid search with an empty parameter grid,
    is then re-fitted on ``CVSize`` contiguous hold-out folds to compute an
    average RMSE, and finally on the whole data set.

    Args:
        data: feature matrix (sliced by row and stacked with ``np.vstack``).
        target: target values (sliced and joined with ``np.append``).
        constant: unused by this function.

    Returns:
        The mean RMSE over the ``CVSize`` manual folds.
    """
    #regressor = svm.NuSVR(kernel = "rbf", cache_size=1500, tol=1e-2)
    regressor = sklearn.linear_model.LassoLarsCV()
    #regressor = GradientBoostingClassifier()
    #param_grid = {'C':np.linspace(25.0 , 20000.0, num = 4), 'gamma':np.linspace(0.01/15.0, 0.5/15.0, 16)}
    #param_grid = {'eps':np.linspace(0.0001, 0.002, 4)}
    param_grid = {}
    # greater_is_better=False makes the search minimise mean squared error.
    mse_scorer = sklearn.metrics.make_scorer(
        sklearn.metrics.mean_squared_error, greater_is_better=False)
    grid_search = sklearn.grid_search.GridSearchCV(
        regressor, param_grid, scoring=mse_scorer, n_jobs=9)
    grid_search.fit(data, target)
    clf = grid_search.best_estimator_
    print(clf)

    # Floor division: the fold boundaries are used as slice indices, which
    # must be integers (true division yields a float on Python 3).
    chunk_size = len(data) // CVSize
    score = 0
    for x in range(CVSize):
        # Boundaries of the hold-out chunk for this fold
        first_step = x * chunk_size
        second_step = (x + 1) * chunk_size

        # Train on everything outside the hold-out chunk
        cross_data = np.vstack((data[:first_step], data[second_step:]))
        cross_target = np.append(target[:first_step], target[second_step:])
        clf.fit(cross_data, cross_target)

        # Score on the hold-out chunk
        sample_data = data[first_step:second_step]
        sample_target = target[first_step:second_step]
        pred = clf.predict(sample_data)
        RMSE = mean_squared_error(sample_target, pred)**0.5
        score += RMSE

    score = score / CVSize
    print("Cross-Validation RMSE: {} ".format(score))

    # Training-set score after fitting on all data
    clf.fit(data, target)
    pred = clf.predict(data)
    RMSE = mean_squared_error(target, pred)**0.5
    print("RMSE on whole dataset {}".format(RMSE))
    return score
def svc_param_selection(self, X, y, nfolds):
    """Grid-search C and gamma for an RBF-kernel SVC.

    Args:
        X: training features passed to ``GridSearchCV.fit``.
        y: training targets passed to ``GridSearchCV.fit``.
        nfolds: value forwarded as the ``cv`` argument.

    Returns:
        The ``best_params_`` of the fitted search.
    """
    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    param_grid = {'C': Cs, 'gamma': gammas}
    grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid,
                               cv=nfolds, n_jobs=4, verbose=1)
    grid_search.fit(X, y)
    return grid_search.best_params_
def svc_param_selection(X, y, nfolds):
    """Grid-search C and gamma for an RBF-kernel SVC over log-spaced ranges.

    C covers 13 values from 1e-2 to 1e10 and gamma 13 values from 1e-9 to
    1e3. Progress messages and the best parameters are printed.

    Returns:
        The ``best_params_`` of the fitted search.
    """
    print('Using GridSearchCV for tuning the hyperparameters...')
    print()
    print('This might take some time (approx 5-10 mins)...')

    search_space = {
        'C': np.logspace(-2, 10, 13),
        'gamma': np.logspace(-9, 3, 13),
    }
    searcher = GridSearchCV(svm.SVC(kernel='rbf'), search_space, cv=nfolds)
    searcher.fit(X, y)

    print("Best Params")
    print(searcher.best_params_)
    return searcher.best_params_
def svc_param_selection(dataset, nfolds):
    """Grid-search C, gamma and kernel for an SVC on a row-oriented data set.

    Each row of ``dataset`` is split at the module-level index ``yPos``: the
    entries before it are the features and the entry at it is the outcome.

    Args:
        dataset: iterable of indexable rows.
        nfolds: value forwarded as ``cv`` to ``GridSearchCV``.

    Returns:
        The ``best_params_`` of the fitted search.
    """
    # Materialise once so a one-shot iterable is not exhausted after the
    # first pass.
    rows = list(dataset)

    # outcome and parameters
    y = np.array([row[yPos] for row in rows])
    X = np.array([row[0:yPos] for row in rows])

    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    kernels = ["linear", "rbf"]
    param_grid = {'C': Cs, 'gamma': gammas, 'kernel': kernels}
    grid_search = GridSearchCV(svm.SVC(), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    return grid_search.best_params_
def random_forest(X, y, data, split_iterator):
    """Fit a RandomForestClassifier via grid search and print its report.

    Every hyper-parameter list holds a single value, so one configuration is
    fitted. ``data`` supplies the feature names (its columns without
    "IsBlueWinner") for the importance report; ``split_iterator`` is the
    cross-validation splitter.
    """
    searcher = sklearn.grid_search.GridSearchCV(
        sklearn.ensemble.RandomForestClassifier(),
        {
            "n_estimators": [150],
            "min_samples_split": [50],
            "min_samples_leaf": [7]
        },
        n_jobs=N_JOBS,
        cv=split_iterator,
        verbose=1,
    )
    searcher.fit(X, y)

    print("Random forest")
    print_tuning_scores(searcher)
    feature_names = data.drop("IsBlueWinner", axis=1).columns
    print_feature_importances(feature_names, searcher.best_estimator_)
def neural_network(X, y, data, split_iterator):
    """Grid-search the project's NnWrapper on scaled features and print scores.

    ``data`` and ``split_iterator`` are accepted but not used here; the
    search runs with the library's default cross-validation.
    """
    scaled_features = sklearn.preprocessing.scale(X)
    print("Neural network")

    search_space = {
        "hidden_layer_sizes": [(75,)],
        "dropout": [0.5]
    }
    network = classifiers.NnWrapper(
        dropout=0.5,
        show_accuracy=True,
        batch_spec=((100, 1024), (100, -1)),
    )
    searcher = sklearn.grid_search.GridSearchCV(
        network, search_space, n_jobs=3, verbose=1)
    searcher.fit(scaled_features, y)
    print_tuning_scores(searcher)
def SVM_Ranking_Model_Extraction_And_Encoding():
    """Train an RBF SVC grid search on the encoded table and build the recommender.

    Reads "FeatureToTrainWithoutTester.csv", drops bookkeeping columns,
    encodes the features with ``transform_features``, grid-searches an SVC
    that predicts ``userName`` and prints accuracy, confusion matrix and
    classification report on a 40% hold-out split.

    Returns:
        The object produced by ``Accumulation`` from the raw table, the
        encoded table and the fitted grid search.
    """
    # Read the training samples and drop the columns not used for training
    Training_Table_Raw = pd.read_csv("FeatureToTrainWithoutTester.csv")
    Training_Table_Raw = Training_Table_Raw.drop(
        ['Unnamed: 0', 'Unnamed: 0.1', 'Dataset Start Time', 'Dataset End Time',
         'executionStartTime', 'Dataset Group', 'Users Group'],
        axis=1)

    # Feature encoding works on a copy so the raw table stays untouched
    Training_Table = Training_Table_Raw.copy()
    Training_Table = transform_features(Training_Table)

    # Training/testing data set split
    Train_Test_Split = Training_Table.copy()
    X, y = Train_Test_Split.drop('userName', axis=1), Train_Test_Split['userName']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

    # SVM configuration
    parameters = {
        'clf__gamma': (0.01, 0.02, 0.1, 0.3, 1),
        'clf__C': (0.1, 0.3, 1, 3, 10, 30),
    }
    pipeline = Pipeline([
        ('clf', SVC(kernel='rbf', gamma=0.01, C=100, max_iter=100, probability=True)),
    ])
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=2, verbose=1,
                               scoring='accuracy')
    Grid_Fit = grid_search.fit(X_train, y_train)
    predictions = grid_search.predict(X_test)
    Top_N_Recommendder = Accumulation(Training_Table_Raw, Training_Table, Grid_Fit)

    # Single-argument print calls give the same output on Python 2 and 3.
    print('Accuracy: {}'.format(accuracy_score(y_test, predictions)))
    print('Confusion Matrix: {}'.format(confusion_matrix(y_test, predictions)))
    print('Classification Report: {}'.format(
        classification_report(y_test, predictions)))
    return Top_N_Recommendder
def test_cv_pipeline(self):
    """Grid search over a hashing -> tf-idf -> lasso pipeline scores every alpha."""
    alphas = (0.001, 0.005, 0.01)
    parameters = {
        'lasso__alpha': alphas
    }
    text_pipeline = SKL_Pipeline([
        ('vect', SKL_HashingVectorizer(n_features=20)),
        ('tfidf', SKL_TfidfTransformer(use_idf=False)),
        ('lasso', SKL_Lasso(max_iter=1))
    ])
    searcher = GridSearchCV(self.sc, text_pipeline, parameters)

    reviews = [
        ('hi there', 0.0),
        ('what is up', 1.0),
        ('huh', 1.0),
        ('now is the time', 5.0),
        ('for what', 0.0),
        ('the spark was there', 5.0),
        ('and so', 3.0),
        ('were many socks', 0.0),
        ('really', 1.0),
        ('too cool', 2.0),
    ]
    frame = self.sql.createDataFrame(reviews, ["review", "rating"]).toPandas()
    fitted = searcher.fit(frame.review.values, frame.rating.values)

    # One score entry is expected per candidate alpha.
    assert len(fitted.grid_scores_) == len(parameters['lasso__alpha'])
    # TODO: assert on the individual score entries
    for _entry in fitted.grid_scores_:
        pass
def test_cv_lasso_with_mllib_featurization(self):
    """Lasso grid search on MLlib-hashed features evaluates every alpha."""
    reviews = [
        ('hi there', 0.0),
        ('what is up', 1.0),
        ('huh', 1.0),
        ('now is the time', 5.0),
        ('for what', 0.0),
        ('the spark was there', 5.0),
        ('and so', 3.0),
        ('were many socks', 0.0),
        ('really', 1.0),
        ('too cool', 2.0),
    ]
    spark_frame = self.sql.createDataFrame(reviews, ["review", "rating"])

    # Feature extraction using MLlib: tokenize, then hash into 20000 features
    featurizer = Pipeline(stages=[
        Tokenizer(inputCol="review", outputCol="words"),
        HashingTF(inputCol="words", outputCol="features", numFeatures=20000),
    ])
    featurized = featurizer.fit(spark_frame).transform(spark_frame)

    # The hashed features take the place of the "review" column.
    selected = featurized.select(featurized.features.alias("review"), "rating")
    frame = self.converter.toPandas(selected)

    parameters = {
        'lasso__alpha': (0.001, 0.005, 0.01)
    }
    searcher = GridSearchCV(
        self.sc, SKL_Pipeline([('lasso', SKL_Lasso(max_iter=1))]), parameters)
    fitted = searcher.fit(frame.review.values, frame.rating.values)

    assert len(fitted.cv_results_['params']) == len(parameters['lasso__alpha'])
def test_cv_lasso_with_mllib_featurization(self):
    """Default-settings Lasso grid search on MLlib-hashed features covers every alpha."""
    samples = [
        ('hi there', 0.0),
        ('what is up', 1.0),
        ('huh', 1.0),
        ('now is the time', 5.0),
        ('for what', 0.0),
        ('the spark was there', 5.0),
        ('and so', 3.0),
        ('were many socks', 0.0),
        ('really', 1.0),
        ('too cool', 2.0),
    ]
    raw = self.sql.createDataFrame(samples, ["review", "rating"])

    # Feature extraction using MLlib
    word_splitter = Tokenizer(inputCol="review", outputCol="words")
    term_hasher = HashingTF(inputCol="words", outputCol="features", numFeatures=20000)
    mllib_pipeline = Pipeline(stages=[word_splitter, term_hasher])
    hashed = mllib_pipeline.fit(raw).transform(raw)

    # Expose the hashed feature vectors under the "review" column name.
    pandas_frame = self.converter.toPandas(
        hashed.select(hashed.features.alias("review"), "rating"))

    alphas = (0.001, 0.005, 0.01)
    parameters = {'lasso__alpha': alphas}
    lasso_pipeline = SKL_Pipeline([
        ('lasso', SKL_Lasso())
    ])
    searcher = GridSearchCV(self.sc, lasso_pipeline, parameters)
    fitted = searcher.fit(pandas_frame.review.values, pandas_frame.rating.values)

    assert len(fitted.cv_results_['params']) == len(parameters['lasso__alpha'])
def run_gridsearch(X, y, clf, cv=10):
    """Grid-search tree hyper-parameters for ``clf`` and report the top three.

    Args:
        X: training features.
        y: training targets.
        clf: estimator to tune; it must accept the tree parameters below.
        cv: value forwarded as ``cv`` to ``GridSearchCV`` (default 10).

    Returns:
        Whatever ``report`` returns for the three best grid scores.
    """
    param_grid = {
        "criterion": ["gini", "entropy"],
        "min_samples_split": [10, 15, 20, 40],
        "max_depth": [10, 15, 30],
        "max_leaf_nodes": [35, 50, 60],
        # "min_samples_leaf" used to appear twice in this literal; a dict keeps
        # only the last value, so [30, 40, 50, 55, 100] was never searched.
        # The effective list is kept.
        "min_samples_leaf": [15, 20, 30],
    }

    grid_search = GridSearchCV(clf, param_grid=param_grid, cv=cv)
    start = time()
    grid_search.fit(X, y)

    print(("\nGridSearchCV took {:.2f} "
           "seconds for {:d} candidate "
           "parameter settings.").format(time() - start,
                                         len(grid_search.grid_scores_)))

    top_params = report(grid_search.grid_scores_, 3)
    return top_params
def test_cv_linreg(self):
    """Lasso grid search on a sparse two-column matrix evaluates every alpha."""
    pipeline = SKL_Pipeline([('lasso', SKL_Lasso(max_iter=1))])
    parameters = {'lasso__alpha': (0.001, 0.005, 0.01)}
    grid_search = GridSearchCV(self.sc, pipeline, parameters)

    # Build the row blocks as a real list: on Python 3 ``map`` returns a lazy
    # iterator, which is not a sequence of blocks for ``vstack``.
    rows = [self.list2csr([x, x + 1.0]) for x in range(0, 100)]
    X = scipy.sparse.vstack(rows)
    y = np.arange(100).reshape((100, 1))

    skl_gs = grid_search.fit(X, y)
    assert len(skl_gs.cv_results_['params']) == len(
        parameters['lasso__alpha'])
def run_gridsearch(X, y, clf, cv=10):
    """Grid-search tree hyper-parameters for ``clf`` and report the top three.

    Args:
        X: training features.
        y: training targets.
        clf: estimator to tune; it must accept the tree parameters below.
        cv: value forwarded as ``cv`` to ``GridSearchCV`` (default 10).

    Returns:
        Whatever ``report`` returns for the three best grid scores.
    """
    # "min_samples_leaf" was listed twice in the original literal; only the
    # later list ([15, 20, 30]) ever took effect, so that one is kept and the
    # shadowed [30, 40, 50, 55, 100] entry is dropped.
    param_grid = {"criterion": ["gini", "entropy"],
                  "min_samples_split": [10, 15, 20, 40],
                  "max_depth": [10, 15, 30],
                  "max_leaf_nodes": [35, 50, 60],
                  "min_samples_leaf": [15, 20, 30]}

    grid_search = GridSearchCV(clf, param_grid=param_grid, cv=cv)
    start = time()
    grid_search.fit(X, y)
    elapsed = time() - start

    print(("\nGridSearchCV took {:.2f} "
           "seconds for {:d} candidate "
           "parameter settings.").format(elapsed,
                                         len(grid_search.grid_scores_)))

    top_params = report(grid_search.grid_scores_, 3)
    return top_params
def decision_tree(X, y, data, split_iterator, dot_filename=None):
    """Tune a DecisionTreeClassifier and optionally export a shallow tree.

    The grid search covers ``max_depth``, ``min_samples_split`` and
    ``min_samples_leaf`` and its scores are printed. When ``dot_filename``
    is truthy, a separate depth-5 tree is fitted on all of ``X``/``y`` and
    exported with the column names of ``data`` minus "IsBlueWinner".
    """
    search_space = {
        "max_depth": [5, 10, 20],
        "min_samples_split": [25, 50, 100],
        "min_samples_leaf": [5, 10, 50]
    }
    searcher = sklearn.grid_search.GridSearchCV(
        sklearn.tree.DecisionTreeClassifier(max_depth=5),
        search_space,
        n_jobs=N_JOBS,
        cv=split_iterator,
        verbose=1,
    )
    searcher.fit(X, y)

    print("Decision tree tuning")
    print_tuning_scores(searcher)

    if not dot_filename:
        return

    # refit a shallow tree for visualization
    shallow_tree = sklearn.tree.DecisionTreeClassifier(max_depth=5)
    shallow_tree.fit(X, y)
    sklearn.tree.export_graphviz(
        shallow_tree,
        dot_filename,
        feature_names=data.drop("IsBlueWinner", axis=1).columns,
    )
def test_cv_linreg(self):
    """Lasso grid search on a sparse two-column matrix evaluates every alpha."""
    pipeline = SKL_Pipeline([
        ('lasso', SKL_Lasso(max_iter=1))
    ])
    parameters = {
        'lasso__alpha': (0.001, 0.005, 0.01)
    }
    grid_search = GridSearchCV(self.sc, pipeline, parameters)

    # A list, not a lazy ``map`` object: ``vstack`` expects a sequence of
    # blocks, and on Python 3 ``map`` only yields an iterator.
    blocks = [self.list2csr([x, x + 1.0]) for x in range(0, 100)]
    X = scipy.sparse.vstack(blocks)
    y = np.arange(100).reshape((100, 1))

    skl_gs = grid_search.fit(X, y)
    assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha'])
def train_model(feature_data, target_data, search_parameters):
    '''
    Run a grid search over a three-step pipeline and return the fitted search.

    Pipeline steps: standard scaling, percentile feature selection with the
    ANOVA F-test, and an ExtraTreesClassifier using bootstrap samples and the
    entropy split criterion:
    http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
    '''
    scaler = sklearn.preprocessing.StandardScaler()
    selector = sklearn.feature_selection.SelectPercentile(
        sklearn.feature_selection.f_classif)
    forest = sklearn.ensemble.ExtraTreesClassifier(
        bootstrap=True,
        criterion='entropy',
    )
    model_pipeline = sklearn.pipeline.Pipeline([
        ('scale', scaler),
        ('feature_select', selector),
        ('classify', forest),
    ])

    # Stratified folds keep the target distribution of each fold close to
    # that of the whole population.
    stratified_folds = sklearn.cross_validation.StratifiedKFold(target_data, n_folds=10)

    # The F-score treats the most common target value as the positive label.
    most_common = pandas.Series(target_data).value_counts().idxmax()
    scorer = sklearn.metrics.make_scorer(
        sklearn.metrics.fbeta_score, beta=1.0, pos_label=most_common)

    searcher = sklearn.grid_search.GridSearchCV(
        model_pipeline,
        search_parameters,
        scoring=scorer,
        cv=stratified_folds,
        verbose=0,
        n_jobs=2,
    )
    searcher.fit(feature_data, target_data)
    return searcher
def feature_training(X_data, y_data, feature_name):
    """Grid-search a linear SVC on one named feature and return the fitted search.

    The values stored under ``X_data[feature_name]`` are reshaped into a
    single-column matrix before fitting against ``y_data``.
    """
    # Retrieve the feature as an (n_samples, 1) column
    column = X_data[feature_name]
    single_feature = column.reshape(len(X_data[feature_name]), 1)

    # Configuring the parameters
    search_space = {
        'clf__gamma': (0.01, 0.02, 0.1, 0.3, 1),
        'clf__C': (0.1, 0.3, 1, 3, 10, 30),
    }
    classifier = SVC(kernel='linear', gamma=0.01, C=100, max_iter=10)
    searcher = GridSearchCV(
        Pipeline([('clf', classifier)]),
        search_space,
        n_jobs=2,
        verbose=1,
        scoring='accuracy',
    )
    return searcher.fit(single_feature, y_data)
# Grid-search the percentile of features kept in front of ``clf``, scored by
# negated log-loss on predicted probabilities, then persist the fitted search.
pcfilter = sklearn.feature_selection.SelectPercentile(sklearn.feature_selection.f_classif, percentile=100)
pipeline = sklearn.pipeline.Pipeline([('filter', pcfilter), ('clf', clf)])

parameters = {'filter__percentile': [50, 60, 40, 70, 30, 80, 20]}

# Log-loss is a loss, hence greater_is_better=False; needs_proba=True makes
# the scorer use predicted probabilities.
scorer = sklearn.metrics.make_scorer(sklearn.metrics.log_loss, greater_is_better=False, needs_proba=True)
grid_search = sklearn.grid_search.GridSearchCV(pipeline, parameters, scoring=scorer, n_jobs=1, cv=4, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time.time()
grid_search.fit(XX_train, yy_train)
print("done in %0.3fs" % (time.time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

# Saving stays best-effort, but a failure is reported instead of being
# swallowed, and KeyboardInterrupt/SystemExit are no longer caught.
try:
    joblib.dump(grid_search, '/disk/data1/s1145806/train3_model.pkl')
except Exception as exc:
    print("Could not save grid search model: %r" % (exc,))
# Refined grid search over the percentile of features kept in front of
# ``clf`` (75-100%), scored by negated log-loss; the fitted search is saved.
pcfilter = sklearn.feature_selection.SelectPercentile(sklearn.feature_selection.f_classif, percentile=100)
pipeline = sklearn.pipeline.Pipeline([('filter', pcfilter), ('clf', clf)])

parameters = {'filter__percentile': [75, 80, 85, 90, 95, 100]}

# Log-loss is a loss, hence greater_is_better=False; needs_proba=True makes
# the scorer use predicted probabilities.
scorer = sklearn.metrics.make_scorer(sklearn.metrics.log_loss, greater_is_better=False, needs_proba=True)
grid_search = sklearn.grid_search.GridSearchCV(pipeline, parameters, scoring=scorer, n_jobs=1, cv=4, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time.time()
grid_search.fit(XX_train, yy_train)
print("done in %0.3fs" % (time.time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

# Saving stays best-effort, but a failure is reported instead of being
# swallowed, and KeyboardInterrupt/SystemExit are no longer caught.
try:
    joblib.dump(grid_search, '/disk/data1/s1145806/train_model_refined.pkl')
except Exception as exc:
    print("Could not save grid search model: %r" % (exc,))
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

# Grid of AdaBoost settings; keys prefixed with "base_estimator__" are routed
# to the decision tree that AdaBoost wraps.
param_dist = {"base_estimator__max_depth": [1,2,3],
              "base_estimator__min_samples_split": [1,2],
              "base_estimator__min_samples_leaf": [1,2],
              "n_estimators": [2,3,5],
              "learning_rate":[0.4,0.6,0.8],
              "algorithm":["SAMME","SAMME.R"]
              }

# Stratified shuffle-split with 4 iterations and a fixed seed.
cv = cross_validation.StratifiedShuffleSplit(y_train,n_iter = 4,random_state = 9)
# F1 scorer that treats the label "yes" as the positive class.
f1score=make_scorer(f1_score, pos_label="yes")

# build a classifier
dt_clf=DecisionTreeClassifier()
clf = AdaBoostClassifier(dt_clf)

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_dist,cv=cv,scoring=f1score)
gs_estimator=grid_search.fit(X_train,y_train)
print "Best model parameter: " + str(gs_estimator.best_params_)

# Evaluate the fitted search on the held-out test data.
y_pred=grid_search.predict(X_test)
#print y_pred
gs_f1score=f1_score(y_test, y_pred,pos_label="yes")
print "f1 score: {:.5f}".format(gs_f1score)
# ######
#temp = np.vstack([bowDiction.compute(im,orb.detect(im)), h]) descriptor.append(tempy) for y in range (0,18): if(train_names[y] in x): label.append(y) #svm = SVC(C = 4, gamma = 0.4) svm = SVC() #svm.fit(np.array(descriptor), np.array(label)) #svm = LinearSVC(random_state=0) #svm.fit(np.array(descriptor),np.array(label)) #svc = SVC() param_grid = [ {'C': [2.1,2.3,2.5],'gamma':[0.40,0.45,0.50]}] grid_search = GridSearchCV(svm, param_grid = param_grid) grid_search.fit(np.array(descriptor), np.array(label)) svm = grid_search print ("three") joblib.dump(orb, 'model_sift.pkl',protocol=2) joblib.dump(svm, 'model_svm.pkl', protocol=2) #Save Model joblib.dump(dic, 'bow_dic.pkl', protocol=2) confusion = np.zeros((18,18)) count = 0 matrix = [] for x in train_name_path: im = cv2.imread(x,0) if im is not None: hog = cv2.HOGDescriptor() #hog.winSize = Size(16,32) img = cv2.resize(im,(64,128)) temp = bowDiction.compute(img,orb.detect(img))
print("Testing set Accuracy ", accuracy_score(y_test, y_pred.round(), normalize=True)) # XXX # TODO: Tune the hyper-parameters 'C' and 'kernel' (use rbf and linear). # Print the best params, using .best_params_, and print the best score, using .best_score_. # Get the training and test set accuracy values after hyperparameter tuning. # XXX Cs = [1, 10, 100] kernels = ['linear', 'rbf'] param_grid = {'C': Cs, 'kernel': kernels} grid_search = GridSearchCV(SVC(), param_grid=param_grid, cv=5) grid_search.fit(rescaledX, y_data) print("best params ", grid_search.best_params_) print("best score ", grid_search.best_score_) tuningpredict = grid_search.predict(X_test) print("Train Accuracy ", accuracy_score(y_train, grid_search.predict(X_train))) print("Test Accuracy ", accuracy_score(y_test, tuningpredict.round(), normalize=True)) svclassifier2 = SVC(kernel='linear', C=1) svmd2 = svclassifier2.fit(X_train, y_train) y_pred2 = svmd2.predict(X_test) print("Train Accuracy ", accuracy_score(y_train, svmd2.predict(X_train))) print("Test Accuracy ", accuracy_score(y_test, y_pred2.round(),
def getClassifier(data, target):
    """Tune a bagged ExtraTrees classifier and print cross-validation accuracy.

    A grid search over ``n_estimators`` (5..24) of a ``BaggingClassifier``
    wrapping an ``ExtraTreesClassifier`` selects the best estimator. Its
    accuracy is then printed twice: from ``cross_val_score`` and from a
    manual split into ``CVSize`` contiguous folds.

    Args:
        data: feature matrix (sliced by row and stacked with ``np.vstack``).
        target: labels (sliced and joined with ``np.append``).

    Returns:
        The best estimator. Note that it was last fitted inside the manual
        fold loop, i.e. on the data outside the final hold-out chunk.
    """
    # Classifier to use in BaggingClassifier
    classifier1 = ensemble.ExtraTreesClassifier(min_samples_split=3,
                                                n_estimators=10,
                                                max_features=4)
    # Classifier for GridSearch
    classifier = ensemble.BaggingClassifier(classifier1)

    param_grid = {'n_estimators': range(5, 25)}
    grid_search = sklearn.grid_search.GridSearchCV(
        classifier, param_grid,
        scoring=sklearn.metrics.make_scorer(accuracy_score), cv=5, n_jobs=4)
    grid_search.fit(data, target)
    clf = grid_search.best_estimator_
    print(clf)

    # Run the built-in cross-validation once so that the printed mean belongs
    # to the printed fold scores (and the work is not done twice).
    fold_scores = cross_val_score(clf, data, target, cv=5, scoring='accuracy')
    print(fold_scores)
    temp = np.mean(fold_scores)
    print("Built-in Cross-Validation: {} ".format(temp))

    # Manual cross-validation over contiguous chunks. Floor division keeps
    # the fold boundaries integral so they can be used as slice indices.
    chunk_size = len(data) // CVSize
    score = 0
    for x in range(CVSize):
        # Boundaries of the hold-out chunk for this fold
        first_step = x * chunk_size
        second_step = (x + 1) * chunk_size

        # Train on everything outside the hold-out chunk
        cross_data = np.vstack((data[:first_step], data[second_step:]))
        cross_target = np.append(target[:first_step], target[second_step:])
        clf.fit(cross_data, cross_target)

        # Accuracy on the hold-out chunk
        sample_data = data[first_step:second_step]
        sample_target = target[first_step:second_step]
        pred = clf.predict(sample_data)
        fold_accuracy = accuracy_score(sample_target, pred)
        score += fold_accuracy

    score = score / CVSize
    # NOTE: the label says RMSE but the value is the mean fold accuracy; the
    # text is left unchanged so existing output stays the same.
    print("Cross-Validation RMSE: {} ".format(score))

    # Return estimator/classifier
    return clf
svm = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, n_iter=5, random_state=42) svm.fit(X_train_dtm, Y_train) y_pred_class_svm = svm.predict(X_test_dtm) print(metrics.accuracy_score(Y_test, y_pred_class_svm),"SVM-SGD -countvectorizer") svm_t = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, n_iter=5, random_state=42) svm_t.fit(X_train_tfidf, Y_train) y_pred_svm_t = svm_t.predict(X_test_tfidf) print(metrics.accuracy_score(Y_test, y_pred_svm_t),"SVM-SGD -tfidf") #grid print("grid") from sklearn import svm, grid_search Cs = [0.001, 0.01, 0.1, 1, 10] gammas = [0.001, 0.01, 0.1, 1] param_grid = {'C': Cs, 'gamma' : gammas, 'kernel':('poly', 'rbf')} grid_search = GridSearchCV(svm.SVC(), param_grid, cv=5) grid_search.fit(X_train_dtm, Y_train) print(grid_search.best_score_) print(grid_search.best_params_) y_grid_search_svm = grid_search.predict(X_test_dtm) print(metrics.accuracy_score(Y_test,y_grid_search_svm),"grid search- SVM") ''' #X_train, X_test, y_train, y_test = train_test_split(corpus, labels, random_state=1,train_size=0.90) #X_train_tfidf, vectorizer = generate_features(X_train) #X_test_tfidf, vectorizer = generate_features(X_test)
def run(self, label_groupings, data_splits, feature_extractors, estimators, scores, cv=5, n_jobs=1, to_csv=None):
    """Grid-search every label/feature/estimator/score combination.

    For each combination a vectorizer + estimator pipeline is tuned on the
    training split of the label grouping and scored on its test split. One
    row per combination is stored in ``self.results`` and the collected rows
    are returned as a DataFrame with columns ``self.columns``.

    Args:
        label_groupings: mapping of key -> settings exposing ``title``.
        data_splits: mapping of label key -> object exposing ``X_train``,
            ``Y_train``, ``X_test`` and ``Y_test``.
        feature_extractors: mapping of key -> settings exposing ``title``,
            ``vectorizer`` and ``parameter_space``.
        estimators: mapping of key -> settings exposing ``title``,
            ``estimator`` and ``parameter_space``.
        scores: mapping of key -> settings exposing ``title`` and ``score``.
        cv: forwarded as ``cv`` to ``GridSearchCV``.
        n_jobs: forwarded as ``n_jobs`` to ``GridSearchCV``.
        to_csv: when truthy, passed to ``self.results_to_csv`` at the end.

    Returns:
        The results table (also stored as ``self.results_table``).
    """
    self.label_groupings = label_groupings
    self.data_splits = data_splits
    self.feature_extractors = feature_extractors
    self.estimators = estimators
    self.scores = scores

    # total amount of grid search settings
    total_settings = (len(label_groupings) * len(feature_extractors)
                      * len(estimators) * len(scores))
    # running index of the setting being processed
    current_setting = 0

    for l_key, l_settings in label_groupings.items():
        for f_key, f_settings in feature_extractors.items():
            for e_key, e_settings in estimators.items():
                for s_key, s_settings in scores.items():
                    current_setting += 1
                    titles = (l_settings.title, f_settings.title,
                              e_settings.title, s_settings.title)
                    logger.info("Running setting " + str(current_setting) + "/"
                                + str(total_settings) + ": "
                                + " | ".join(str(title) for title in titles))

                    pipeline = sklearn.pipeline.Pipeline([
                        ('vect', f_settings.vectorizer),
                        ('clf', e_settings.estimator)
                    ])

                    # Estimator parameters win when both spaces share a key.
                    parameters = dict(f_settings.parameter_space)
                    parameters.update(e_settings.parameter_space)

                    searcher = sklearn.grid_search.GridSearchCV(
                        pipeline, parameters, scoring=s_settings.score,
                        cv=cv, n_jobs=n_jobs)

                    split = self.data_splits[l_key]
                    searcher.fit(split.X_train, split.Y_train)

                    score_train_ = searcher.best_score_
                    score_test_ = searcher.score(split.X_test, split.Y_test)

                    # Results are keyed by the hash of the concatenated keys.
                    hash_key = hash(l_key + f_key + e_key + s_key)
                    self.results[hash_key] = [
                        l_key, l_settings.title, l_settings,
                        f_key, f_settings.title, f_settings,
                        e_key, e_settings.title, e_settings,
                        s_key, s_settings.title, s_settings,
                        score_train_, score_test_,
                        searcher.best_estimator_,
                        searcher.best_params_,
                        searcher.grid_scores_
                    ]

    # update results
    resultrows = list(self.results.values())
    self.results_table = pd.DataFrame(resultrows, columns=self.columns)

    if to_csv:
        self.results_to_csv(to_csv)
    return self.results_table
# print(X_test.shape, y_test.shape)
# print(rfe.score(X_test, y_test))

# ---------------------------------------------------------------------------
# Logistic Regression with grid search
# ---------------------------------------------------------------------------
# p = pd.read_pickle('C:\\Users\\Tawsif Sazid\\Desktop\\lala.xls')
# print(first.dtype)

# Candidate regularisation strengths, kept in their original order.
Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 2000, 0.0001, 1e6, 1e-8]
# solver = ['newton-cg', 'sag']
# gammas = [0.001, 0.01, 0.1, 1, 10, 100, 200, 1000, 2000, .0001]
#   (gamma is not needed)
max_iter = [100, 1000, 10000]
param_grid = dict(C=Cs, max_iter=max_iter)

# Multinomial logistic regression tuned with 12-fold cross-validation; the
# best setting is refitted on the full data.
mul_lr = linear_model.LogisticRegression(solver='newton-cg',
                                         multi_class='multinomial')
grid_search = GridSearchCV(mul_lr, param_grid, cv=12, refit=True)
grid_search.fit(yoo1, y)
print(grid_search.best_score_)

# import statsmodels.api as sm

# Disabled: plain fit/score of the same model on a train/test split.
'''
mul_lr = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg').fit(X_train, y_train)
print("Logistic Regression")
print(mul_lr.predict(X_test))
print(mul_lr.score(X_test,y_test))
'''

# ---------------------------------------------------------------------------
# Perform 6-fold cross validation
# ---------------------------------------------------------------------------
# NOTE(review): the disabled block below had no closing quotes in the
# visible text; it is closed here -- confirm nothing else belonged to it.
'''
# Necessary imports:
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn import metrics
'''
# Per-fold recall and F1 scores.  ``results_acc`` and ``results_precision``
# are appended to below and must already exist when this code runs.
results_recall = np.array([])
results_f1 = np.array([])

for train_indices, test_indices in kf:
    # Materialise the train/test partitions of this fold.
    features_train = [features[ii] for ii in train_indices]
    features_test = [features[ii] for ii in test_indices]
    labels_train = [labels[ii] for ii in train_indices]
    labels_test = [labels[ii] for ii in test_indices]

    # Tune the pipeline on the training part, predict the held-out part.
    grid_search = GridSearchCV(pipeline, param_grid=param_grid,
                               scoring=scoring_index, n_jobs=1)
    clf = grid_search.fit(features_train, labels_train)
    pred = clf.best_estimator_.predict(features_test)

    # Single-argument print(...) is valid on Python 2 and 3 and emits the
    # same text as the former print statements (note the double space).
    print('Best Estimator >>>  %s' % (grid_search.best_estimator_,))
    print('Best score >>>  %s' % (grid_search.best_score_,))
    print('Best scorer >>>  %s' % (grid_search.scorer_,))
    print('Best best parameters >>>  %s' % (grid_search.best_params_,))

    results_acc = np.append(
        results_acc, [accuracy_score(labels_test, pred)], axis=0)
    results_precision = np.append(
        results_precision, [precision_score(labels_test, pred)], axis=0)
    results_recall = np.append(
        results_recall, [recall_score(labels_test, pred)], axis=0)
    results_f1 = np.append(
        results_f1, [f1_score(labels_test, pred)], axis=0)

# Summary over all folds.
print('>>>>>>>>>> %s' % (clfNames,))
print("avg precision :  %s" % (np.array(results_precision).mean(),))
print("avg recall :  %s" % (np.array(results_recall).mean(),))
# p = pd.read_pickle('C:\\Users\\Tawsif Sazid\\Desktop\\lala.xls')
# print(first.dtype)

# Candidate regularisation strengths, kept in their original order.
Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 2000, 0.0001, 1e6, 1e-8]
# solver = ['newton-cg', 'sag']
# gammas = [0.001, 0.01, 0.1, 1, 10, 100, 200, 1000, 2000, .0001]
#   (gamma is not needed)
max_iter = [100, 1000, 10000]
param_grid = dict(C=Cs, max_iter=max_iter)

# Multinomial logistic regression tuned with 12-fold cross-validation; the
# best setting is refitted on the full data.
mul_lr = linear_model.LogisticRegression(solver='newton-cg',
                                         multi_class='multinomial')
grid_search = GridSearchCV(mul_lr, param_grid, cv=12, refit=True)
grid_search.fit(yoo1, y)
print(grid_search.best_score_)

# import statsmodels.api as sm

# Disabled: plain fit/score of the same model on a train/test split.
'''
mul_lr = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg').fit(X_train, y_train)
print("Logistic Regression")
print(mul_lr.predict(X_test))
print(mul_lr.score(X_test,y_test))
'''

# ---------------------------------------------------------------------------
# Perform 6-fold cross validation
# ---------------------------------------------------------------------------
# NOTE(review): the disabled block below had no closing quotes in the
# visible text; it is closed here -- confirm nothing else belonged to it.
'''
# Necessary imports:
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn import metrics
scores = cross_val_score(mul_lr, first, y, cv=10)
'''
# print grid_search.grid_scores_
# print grid_search.best_estimator_
# Best estimator was C=0.5.

# Re-centering search.
# parameters = {'C':[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}
# grid_search = grid_search.GridSearchCV(model_svm, parameters)
# grid_search.fit(X_train, Y_train)
# print grid_search.grid_scores_
# print grid_search.best_estimator_
# Best estimator was C=0.5.

# Narrowing down by order of magnitude.
parameters = {'C': [0.45, 0.46, 0.47, 0.48, 0.49, 0.50,
                    0.51, 0.52, 0.53, 0.54, 0.55]}
# NOTE(review): this rebinds ``grid_search`` from the module to the fitted
# search object, so ``grid_search.GridSearchCV`` is unavailable afterwards.
grid_search = grid_search.GridSearchCV(model_svm, parameters)
grid_search.fit(X_train, Y_train)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(grid_search.grid_scores_)
print(grid_search.best_estimator_)
# Best estimator was C=0.45. Because we already compared 0.4 to 0.5 two
# searches above, and 0.5 was selected, we induce that 0.45 is the optimal
# value without searching between 0.40 and 0.45.

# Returning model results with optimal 'C' value.
expected = Y_test
predicted = grid_search.predict(X_test)
print(classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
print(metrics.accuracy_score(expected, predicted))

# Support Vector Machine: Model fit, transform, and testing with optimized
# 'C' value
splits = cv.train_test_split(X_train_tfidf, dataset.target, test_size=0.2)
X_train, X_test, Y_train, Y_test = splits
def classification(FV_N):
    """Tune and evaluate a PCA + random-forest pipeline on ``FV_N``.

    A ``PCA -> RandomForestClassifier`` pipeline is grid-searched, the
    selected hyper-parameters are plotted, the first two PCA components of
    the samples labelled 1 and 2 are drawn, and a cross-validated confusion
    matrix is shown as a heat map.

    Relies on the module-level names ``Data_FRAMES``, ``Get_true_y`` and
    ``projectpath``.  The label vector is written to, and read back from,
    ``projectpath + 'io/Output/yr.npy'``.

    Args:
        FV_N: feature matrix; it is truncated row-wise to the number of
            labels before fitting.

    Returns:
        An empty tuple.
    """
    pipe = Pipeline([
        ('reduce_dim', decomposition.PCA()),
        ('Random_Forest', RandomForestClassifier()),
    ])

    # Hyper-parameter grid.  Every list must be non-empty, otherwise the
    # search has no candidate to evaluate; the n_components list used to be
    # reset to [] just before this point.  Each value is listed once, since
    # repeating it only re-fits the same candidate.
    params = dict(
        reduce_dim__n_components=[70, 80, 90],
        Random_Forest__n_estimators=[200],
        Random_Forest__random_state=[0],
    )
    grid_search = GridSearchCV(pipe, param_grid=params)

    # Labels are round-tripped through the .npy file on disk.
    X = FV_N
    yr = Get_true_y(Data_FRAMES)
    filename_yr = projectpath + 'io/Output/yr.npy'
    np.save(filename_yr, yr)
    yr = np.load(filename_yr)

    # Truncate features and labels to a common length, then persist the
    # aligned labels.
    X = X[:yr.shape[0]]
    yr = yr[:X.shape[0]]
    np.save(filename_yr, yr)
    yr = np.load(filename_yr)

    grid_search.fit(X, yr)
    print(grid_search.best_estimator_)

    best_steps = grid_search.best_estimator_.named_steps
    n_compo_pca = best_steps['reduce_dim'].n_components
    n_est_rdf = best_steps['Random_Forest'].n_estimators

    # Mark the selected hyper-parameters.
    plt.figure()
    plt.axvline(n_compo_pca, linestyle=':', label='n_components chosen')
    plt.legend(prop=dict(size=12))
    plt.show()

    plt.figure()
    plt.axvline(n_est_rdf, linestyle=':', label='n_estimators chosen')
    plt.legend(prop=dict(size=12))
    plt.show()

    # Refit a stand-alone PCA with the selected size for the plots below.
    pca = decomposition.PCA(n_components=n_compo_pca, svd_solver='auto')
    pca.fit(X)

    plt.figure(1, figsize=(4, 3))
    plt.clf()
    plt.axes([.2, .2, .7, .7])
    plt.plot(pca.explained_variance_ratio_.cumsum(), linewidth=1)
    plt.axis('tight')
    plt.xlabel('n_components')
    plt.ylabel('Cumulative Explained variance')

    # First two principal components, one figure per class label.
    M = pca.transform(X)
    plt.figure()
    plt.plot(M[yr == 1, 0], M[yr == 1, 1], 'or')
    plt.title('Astrocytes')
    plt.figure()
    plt.plot(M[yr == 2, 0], M[yr == 2, 1], 'ob')
    plt.title('Neurons')

    # Accuracy of the tuned pipeline on the data it was fitted on; this
    # value used to be computed and thrown away.
    print('Training accuracy: %s'
          % (metrics.accuracy_score(yr, grid_search.predict(X)),))

    # Cross-validated predictions of a forest with the selected size.
    RFC = RandomForestClassifier(n_estimators=n_est_rdf, random_state=0)
    predictedVAL = cross_val_predict(RFC, X, yr, n_jobs=-1)
    print('Cross-validated accuracy: %s'
          % (metrics.accuracy_score(yr, predictedVAL),))

    Conf_Mat = confusion_matrix(yr, predictedVAL)
    import seaborn as sns
    # Transposed so that columns are true labels and rows are predictions.
    sns.heatmap(Conf_Mat.T, square=True, annot=True, cbar=False)
    plt.xlabel('True label')
    plt.ylabel('predicted label')
    return ()
# Fix the random seed so repeated runs give the same classifier.
clf.set_params(adaboostclassifier__random_state=42)

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
import sklearn.grid_search

# Candidate values for the two AdaBoost settings being tuned.
parameters = {
    "adaboostclassifier__n_estimators": (10, 15, 25, 40, 70),
    "adaboostclassifier__learning_rate": (.1, .2, .4, .7, 1),
}

# Recall is optimised because it is the lower value for this
# dataset/classifier.
grid_search = sklearn.grid_search.GridSearchCV(
    clf, parameters, cv=6, scoring="recall")
grid_search.fit(features, labels)

# Apply the winning values; best_params_ holds exactly the two keys of
# ``parameters``.
clf.set_params(**grid_search.best_params_)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(clf, my_dataset, features_list)
# Stack the four TF-IDF blocks column-wise, then split the rows into the
# training part (first ``num_train`` rows) and the test part.
tfidf = np.hstack([tfidf_0, tfidf_1, tfidf_2, tfidf_3])
tfidf_train = tfidf[:num_train, :]
tfidf_test = tfidf[num_train:, :]

# df_all = pd.read_csv('df_all_clean.csv',sep='\t', encoding='utf-8')
df_all = pd.read_csv('df_all_clean.csv', encoding="ISO-8859-1")
# Drop the first column and the raw text columns.
df_all = df_all.drop(
    [df_all.columns[0], 'search_term', 'product_title',
     'product_description', 'product_info', 'brand', 'attr'],
    axis=1)

df_train = df_all.iloc[:num_train]
y = df_train['relevance'].values
X = np.hstack([df_train.drop(['relevance'], axis=1).values, tfidf_train])

# Keep every third sample only.
y = y[::3]
X = X[::3]
# print train.shape
# print valid.shape
# print y_train.shape
# print y_valid.shap

start_time = time.time()
clf = RandomForestRegressor(n_estimators=10000)
from sklearn.grid_search import GridSearchCV
param_grid = {"max_depth": [20, 25, 30, 35, 40],
              "min_samples_leaf": [1, 3, 7, 10]}
grid_search = GridSearchCV(clf, param_grid=param_grid)
grid_search.fit(X, y)
# print(...) with one argument works on Python 2 and 3 and matches the
# call style of the timing message below.
print(grid_search.best_params_)
print("--- Training & Testing: %s minutes ---" % round(((time.time() - start_time)/60),2))
def SVM_Ranking_Model_Extraction_And_Encoding():
    """Build the SVM-rank training file and report a grid-searched SVC.

    Steps, all driven by files in the working directory:

    1. Read ``FeatureToTrainWithoutTester.csv``, drop the time/group columns
       and pass the rest to ``transform_features``.
    2. Read ``Transform_features.csv`` and split it into features and the
       ``userName`` target.
    3. Grid-search an RBF ``SVC`` over ``gamma`` and ``C``.
    4. For every sample and every column in ``index`` compute a weight with
       ``feature_training`` / ``feature_distance``; a snapshot is written to
       ``weight_set.csv`` every 100 samples.
    5. Write the weights in ``<position>:<value>`` form, label first, to
       ``SVM_Rank_Formatted_Training_data2.dat`` and ``...2.csv``.
    6. Print accuracy, confusion matrix and classification report of the
       grid-searched model.

    Takes no arguments and returns ``None``.
    """
    # Raw training samples without the columns that are not encoded.
    df = pd.read_csv("FeatureToTrainWithoutTester.csv")
    df2 = df.copy()
    df2 = df2.drop(['Dataset Start Time', 'Dataset End Time',
                    'executionStartTime', 'Dataset Group', 'Users Group'],
                   axis=1)

    # Feature encoding.
    # presumably transform_features writes Transform_features.csv -- verify
    transform_features(df2)
    # NOTE(review): the result of this drop is discarded, so df2 is left
    # unchanged; kept as-is because df2 is not used again.
    df2.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)

    # Encoded features.
    df = pd.read_csv("Transform_features.csv")

    # The same rows serve as both training and test set.
    y = df['userName']
    X = df.drop(['userName'], axis=1)
    X_train, X_test, y_train, y_test = X, X, y, y

    # SVM configuration.
    parameters = {'clf__gamma': (0.01, 0.02, 0.1, 0.3, 1),
                  'clf__C': (0.1, 0.3, 1, 3, 10, 30),
                  }
    pipeline = Pipeline([('clf', SVC(kernel='rbf', gamma=0.01, C=100,
                                     max_iter=100, probability=True))])
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=2, verbose=1,
                               scoring='accuracy')
    grid_search.fit(X_train, y_train)
    # coef = (result.best_estimator_.get_params()['clf'].coef_)
    # coef2 = coef_sum(coef)
    # coef2

    index = ['DatasetName', 'Agency', 'Instrument', 'Physical variable',
             'var', 'Units', 'Grid Dimension',
             'Variable Name in Web Interface', 'model']

    # One extracted model per column: distance/relevance of that feature to
    # the category prediction.
    model = [feature_training(X_train, y_train, column) for column in index]

    # Weight of every training sample with respect to every column.
    n_samples = len(X_train)
    weight_set = numpy.zeros((n_samples, len(index)))
    for j in range(n_samples):
        sample = X_train[j:j + 1]
        for position, column in enumerate(index):
            weight = feature_distance(sample, column, model[position])
            weight_set[j, position] = weight
            # The column counter is reported 1-based.  Single-argument
            # print(...) emits the same text on Python 2 and 3.
            print("[INFO] Data Points:  %s Columns Iteration:  %s"
                  % (j, position + 1))
            print("[INFO] Weight :  %s" % (weight,))
        # Periodic snapshot so partial results survive an interruption.
        if j % 100 == 0:
            weight_set_file = pd.DataFrame(weight_set.copy())
            weight_set_file.to_csv("weight_set.csv")

    # Delivery: training data with label.
    Training_matrix = pd.DataFrame(weight_set.copy())
    Training_matrix['Label'] = y_train

    # SVM-rank formatting: "<1-based position>:<value>" per feature cell and
    # an integer label rendered as text.  ``.loc`` replaces the removed
    # ``.ix`` indexer; rows and feature columns are labelled 0..n-1.
    SVM_Rank_Formatted_Training_data = Training_matrix.copy()
    for j in range(n_samples):
        for i in range(len(index)):
            SVM_Rank_Formatted_Training_data.loc[j, i] = (
                str(i + 1) + ":"
                + str(SVM_Rank_Formatted_Training_data.loc[j, i]))
        SVM_Rank_Formatted_Training_data.loc[j, 'Label'] = str(
            int(SVM_Rank_Formatted_Training_data.loc[j, 'Label']))

    # Move the label column to the front.
    Rank_format_columns = SVM_Rank_Formatted_Training_data.columns.tolist()
    Rank_format_columns = Rank_format_columns[-1:] + Rank_format_columns[:-1]
    SVM_Rank_Formatted_Training_data = (
        SVM_Rank_Formatted_Training_data[Rank_format_columns])

    # Write the space-separated .dat file and a regular CSV copy.
    SVM_Rank_Formatted_Training_data.to_csv(
        "SVM_Rank_Formatted_Training_data2.dat", index=False, sep=' ',
        index_label=False, header=False)
    SVM_Rank_Formatted_Training_data.to_csv(
        "SVM_Rank_Formatted_Training_data2.csv")

    # Prediction results.
    predictions = grid_search.predict(X_test)
    print('Accuracy: %s' % (accuracy_score(y_test, predictions),))
    print('Confusion Matrix: %s' % (confusion_matrix(y_test, predictions),))
    print('Classification Report: %s'
          % (classification_report(y_test, predictions),))
'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000] }, { 'kernel': ['sigmoid'], 'C': [1, 10, 100, 1000] }, { 'kernel': ['linear'], 'C': [1, 10, 100, 1000] } ] grid_search = GridSearchCV(clf, param_grid=tuned_parameters) grid_search.fit(X_trn, Y_trn) print("GridSearchCV took") report(grid_search.cv_results_) # randomized parameter optimization dist_parameters = { "kernel": ['poly', 'rbf', 'sigmoid', 'linear'], "C": scipy.stats.expon( scale=1000 ), #scipy.stats.expon : An exponential continuous random variable. 즉 0~1사이의 값을 랜덤하게 추출 "degree": scipy.stats.expon(scale=10), # scale = 1/lamda 즉 "gamma": scipy.stats.expon(scale=.1) } n_iter_search = 20