Example #1
def search_parameters(data_file):
    with open(data_file, 'rb') as f:
        data = pickle.load(f)
        labels = data['labels']
        features = data['features']
    # Split the dataset in two equal parts
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        features, labels, test_size=0.5, random_state=0)
    scores = [
        ('error_rate', zero_one_score),
    ]

    #classifier = svm.LinearSVC()
    classifier = MultinomialNB()

    tuned_parameters = {'alpha': (0.001, 0.01, 0.1, 0.5, 1, 1.5, 2, 5, 10)}
    #tuned_parameters = {'C': (0.00001, 0.001, 0.01, 0.1, 0.5, 1, 1.5, 2, 5, 10, 20, 50, 100, 500, 1000)}
    for score_name, score_func in scores:
        print "# Tuning hyper-parameters for %s" % score_name
        print

        # cv belongs in the GridSearchCV constructor, not in fit()
        clf = GridSearchCV(classifier, tuned_parameters, score_func=score_func, cv=5)
        clf.fit(X_train, y_train)

        print "Best parameters set found on development set:"
        best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])
        for param_name in sorted(tuned_parameters.keys()):
            print "%s: %r" % (param_name, best_parameters[param_name])
def test_grid_search_precomputed_kernel():
    """Test that grid search works when the input features are given in the
    form of a precomputed kernel matrix """
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    # compute the training kernel matrix corresponding to the linear kernel
    K_train = np.dot(X_[:180], X_[:180].T)
    y_train = y_[:180]

    clf = SVC(kernel='precomputed')
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(K_train, y_train)

    assert_true(cv.best_score_ >= 0)

    # compute the test kernel matrix
    K_test = np.dot(X_[180:], X_[:180].T)
    y_test = y_[180:]

    y_pred = cv.predict(K_test)

    assert_true(np.mean(y_pred == y_test) >= 0)

    # test error is raised when the precomputed kernel is not array-like
    # or sparse
    assert_raises(ValueError, cv.fit, K_train.tolist(), y_train)
Example #3
	def do_cross_validation(self, param_grid, svmtype, score_func, inputdata_train, outputdata_train, inputdata_test, outputdata_test):
		""" Fitting of classifier used for cross validation """

		if svmtype == 'ln':
			svm_clf = LinearSVC()
		if svmtype == 'rbf':
			svm_clf = SVC()
		#clf_cv = GridSearchCV(SVC(), param_grid, score_func=score_func,  n_jobs=-1 )
		#clf_cv = GridSearchCV( LinearSVC(), param_grid, score_func=score_func,  n_jobs=-1 )
		
		clf_cv = GridSearchCV(svm_clf, param_grid, score_func=score_func,  n_jobs=-1 )

		clf_cv.fit(inputdata_train, outputdata_train)
		y_pred_cv = clf_cv.predict(inputdata_test)

		f1 = metrics.f1_score(outputdata_test, y_pred_cv, pos_label=0)
		dict_param = clf_cv.best_params_
		c = dict_param['C']	

		if svmtype == 'rbf':
			gamma1 = dict_param['gamma']
		else:
			gamma1 = 0


		return(f1, gamma1, c)
Example #4
def test_krr_regbeta():
    
    dim = 5
    n = 1000
    ntest = 1001    

    pref = np.random.random(size=dim) - 0.5

    #pref /= np.sqrt(pref.dot(pref))

    Xtrain = np.random.random((n, dim)) 
    ytrain = Xtrain.dot(pref) + np.random.normal(scale=0.05, size=n) + 10.0

    Xtest = np.random.random((ntest, dim)) 
    yref = Xtest.dot(pref) + 10.0

    krr = kRidgeRegression(kernel=Linear(), eta=1.0, regularize_beta=True)
    gs = GridSearchCV(krr, {'eta' : [1E-6, 1E-4, 1E-2, 1, 1E2, 1E4, 1E6]})
    gs.fit(Xtrain, ytrain)

    krr = gs.best_estimator_

    ytest = krr.transform(Xtest).flatten()

    print krr.score(Xtest, yref)
def learn(tuned_parameters,model):

	# produceFeature(trainfile)
	dataset = genfromtxt(open('Data/'+trainfile,'r'), delimiter=',',dtype='f8')[0:]
	target = [x[0] for x in dataset]
	train = [x[1:] for x in dataset]
	# print train[1:10]
	# print target
	# print len(train)

	# produceFeature(testfile)
	test = genfromtxt(open('Data/'+testfile,'r'),delimiter=',',dtype='f8')[0:]
	test_target = [x[1:] for x in test]


	# X, y = digits.data, digits.target
	trainnp = np.asarray(train)
	targetnp = np.asarray(target)


	# turn the data in a (samples, feature) matrix:
	X, y = trainnp, targetnp
	# X = digits.images.reshape((n_samples, -1))
	# y = digits.target

	# Split the dataset in two equal parts
	X_train, X_test, y_train, y_test = train_test_split(
	    X, y, test_size=0.5, random_state=0)



	scores = ['precision', 'recall']

	for score in scores:
	    print("# Tuning hyper-parameters for %s" % score)
	    print()

	    clf = GridSearchCV(model, tuned_parameters, cv=5,
	                       scoring='%s_weighted' % score)
	    clf.fit(X_train, y_train)

	    print("Best parameters set found on development set:")
	    print()
	    print(clf.best_params_)
	    print()
	    print("Grid scores on development set:")
	    print()
	    for params, mean_score, scores in clf.grid_scores_:
	        print("%0.3f (+/-%0.03f) for %r"
	              % (mean_score, scores.std() * 2, params))
	    print()

	    print("Detailed classification report:")
	    print()
	    print("The model is trained on the full development set.")
	    print("The scores are computed on the full evaluation set.")
	    print()
	    y_true, y_pred = y_test, clf.predict(X_test)
	    print(classification_report(y_true, y_pred))
	    print()
Example #6
def train_classifier(data, labels):
    
    nIter = 50
    alphaVals = [10**i for i in range(3,5)]
    params = { "loss": ["log"],
        "penalty": ['l1', 'l2'],
        "n_iter": [nIter],
        "alpha": alphaVals
    }
    params_log = { 
        "penalty": ['l2'] ,
        "C": [10**i for i in range(-3,-1)]
    }
    #sgd = SGDClassifier()
    sgd = LogisticRegression()
    clf = GridSearchCV(sgd, params_log)
    #data = data.tocsr()[:, 0:13]
    train, val, t_labs, val_labs = train_test_split(data,labels, train_size=.2, random_state=44)
    s = time.time()
    clf.fit(train, t_labs)
    print "Elapsed Training Time for ", len(params_log['C']), 'regularization vals: ', time.time() - s
    print clf.best_params_ 
    

    print "The Validation Score: ", clf.score(val, val_labs)
    probs =  clf.predict_proba(val)
    print "The log loss for the validation set is"
    print log_loss(val_labs, probs[:, 1])
    return clf
Example #7
def test_ovo_gridsearch():
    ovo = OneVsOneClassifier(LinearSVC(random_state=0))
    Cs = [0.1, 0.5, 0.8]
    cv = GridSearchCV(ovo, {'estimator__C': Cs})
    cv.fit(iris.data, iris.target)
    best_C = cv.best_estimator_.estimators_[0].C
    assert_true(best_C in Cs)
Example #8
File: knn.py Project: kbai/uss
def Gridsearch_impl(X,Y,clf,param,cv):

    grid_search = GridSearchCV(clf,param,verbose=10,cv=cv,n_jobs=10)
    start = time()
    grid_search.fit(X,Y)
#    print(grid_search.grid_scores_)
    best = report(grid_search.grid_scores_)
def getOptCandGamma(cv_train, cv_label):
    print "Finding optimal C and gamma for SVM with RBF Kernel"
    C_range = 10.0 ** np.arange(-2, 9)
    gamma_range = 10.0 ** np.arange(-5, 4)
    param_grid = dict(gamma=gamma_range, C=C_range)
    cv = StratifiedKFold(y=cv_label, n_folds=40)

    # Use the svm.SVC() as the cost function to evaluate parameter choices
    # NOTE: Perhaps we should run computations in parallel if needed. Does it
    # do that already within the class?
    grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv)
    grid.fit(cv_train, cv_label)

    score_dict = grid.grid_scores_
    scores = [x[1] for x in score_dict]
    scores = np.array(scores).reshape(len(C_range), len(gamma_range))
    pl.figure(figsize=(8,6))
    pl.subplots_adjust(left=0.05, right=0.95, bottom=0.15, top=0.95)
    pl.imshow(scores, interpolation='nearest', cmap=pl.cm.spectral)
    pl.xlabel('gamma')
    pl.ylabel('C')
    pl.colorbar()
    pl.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45)
    pl.yticks(np.arange(len(C_range)), C_range)
    pl.show()

    print "The best classifier is: ", grid.best_estimator_
Example #10
def run_gridsearch(X, y, clf, param_grid, cv=5):
    """Run a grid search for best Decision Tree parameters.

    Args
    ----
    X -- features
    y -- targets (classes)
    clf -- scikit-learn Decision Tree classifier
    param_grid -- [dict] parameter settings to test
    cv -- fold of cross-validation, default 5

    Returns
    -------
    top_params -- [dict] from report()
    """
    grid_search = GridSearchCV(clf,
                               param_grid=param_grid,
                               cv=cv, scoring='recall')
    start = time()
    grid_search.fit(X, y)

    print(("\nGridSearchCV took {:.2f} "
           "seconds for {:d} candidate "
           "parameter settings.").format(time() - start,
                len(grid_search.grid_scores_)))

    top_params = report(grid_search.grid_scores_, 3)
    return top_params
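A minimal usage sketch for the helper above; the dataset, classifier, and grid are illustrative assumptions, and report() is assumed to be the same helper referenced inside run_gridsearch.

from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

# Toy binary-classification data for the sketch
X_demo, y_demo = make_classification(n_samples=200, n_features=10, random_state=0)
dt_grid = {"max_depth": [2, 4, 8, None],
           "min_samples_split": [2, 10, 20]}
best_dt_params = run_gridsearch(X_demo, y_demo,
                                DecisionTreeClassifier(random_state=0),
                                dt_grid, cv=5)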
Example #11
def model_search(estimator, tuned_params, scores, X_train, y_train, X_test, y_test):  
    
    cv = ShuffleSplit(len(X_train), n_iter=3, test_size=0.30, random_state=0)

    for score in scores:
        print"# Tuning hyper-parameters for %s" % score
        print

        clf = GridSearchCV(estimator, tuned_params, cv=cv,
                           scoring='%s' % score)
        clf.fit(X_train, y_train)

        print"Best parameters set found on development set:"
        print
        print clf.best_params_
        print
        print "Grid scores on development set:"
        print
        for params, mean_score, scores in clf.grid_scores_:
            print "%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params)
        print

        print "Detailed classification report:"
        print
        print "The model is trained on the full development set."
        print "The scores are computed on the full evaluation set."
        print
        y_true, y_pred = y_test, clf.predict(X_test)
        print classification_report(y_true, y_pred)
        print
Example #12
def grid_search(dataset_loader_train, model, grid_search):
    with timer(logger.info, "Loading data"):
        X, y = dataset_loader_train()

    grid_search_kwargs = {
        'refit': False,
        }
    grid_search_kwargs.update(grid_search)

    cv = grid_search_kwargs.get('cv', None)
    if callable(cv):
        grid_search_kwargs['cv'] = apply_kwargs(cv, n=len(y), y=y)

    if not (hasattr(model, 'score') or 'scoring' in grid_search_kwargs):
        raise ValueError(
            "Your model doesn't seem to implement a 'score' method.  You may "
            "want to pass a 'scoring' argument to 'grid_search' instead."
            )

    with timer(logger.info, "Running grid search"):
        gs = GridSearchCV(model, **grid_search_kwargs)
        gs.fit(X, y)

    scores = sorted(gs.grid_scores_, key=lambda x: -x.mean_validation_score)
    logger.info("\n{}".format(pformat(scores)))
    return scores
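A hedged usage sketch for the wrapper above; the loader, estimator, and keyword dictionary are assumptions made for illustration (the dictionary is expanded into GridSearchCV keyword arguments, so it must at least carry param_grid).

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

def demo_loader():
    # Stand-in for dataset_loader_train: returns (X, y)
    iris = load_iris()
    return iris.data, iris.target

demo_scores = grid_search(demo_loader,
                          LogisticRegression(),
                          {'param_grid': {'C': [0.1, 1.0, 10.0]}, 'cv': 3})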
Example #13
def dogridsearch(X,Y,param_space,clf,cv):
    grid_search = GridSearchCV(clf, param_space, verbose=10, cv=cv, n_jobs=-1)
    start = time()
    grid_search.fit(X,Y)
    print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
          % (time() - start, len(grid_search.grid_scores_)))
    best = report(grid_search.grid_scores_)
Example #14
def test_nntools_functional_grid_search(mnist, monkeypatch):
    # Make sure that we can satisfy the grid search interface.
    from nolearn.nntools import NeuralNet

    nn = NeuralNet(
        layers=[],
        X_tensor_type=T.matrix,
        )

    param_grid = {
        'more_params': [{'hidden_num_units': 100}, {'hidden_num_units': 200}],
        'update_momentum': [0.9, 0.98],
        }
    X, y = mnist

    vars_hist = []

    def fit(self, X, y):
        vars_hist.append(vars(self).copy())
        return self

    with patch.object(NeuralNet, 'fit', autospec=True) as mock_fit:
        mock_fit.side_effect = fit
        with patch('nolearn.nntools.NeuralNet.score') as score:
            score.return_value = 0.3
            gs = GridSearchCV(nn, param_grid, cv=2, refit=False, verbose=4)
            gs.fit(X, y)

    assert [entry['update_momentum'] for entry in vars_hist] == [
        0.9, 0.9, 0.98, 0.98] * 2
    assert [entry['more_params'] for entry in vars_hist] == (
        [{'hidden_num_units': 100}] * 4 +
        [{'hidden_num_units': 200}] * 4
        )
    def getGridSearch(self):
        # Set the search parameters
        parameters = {'vect__ngram_range': [(1,1),(1,2)], # Try either words or bigrams
                    'vect__max_df': (0.5, 0.1, 0.09),
                    #'vect__max_features': (None, 5000, 10000, 50000),
                    'tfidf__use_idf': (True, False),
                    'tfidf__norm': ('l1', 'l2'),
                    'clf__penalty': ('l2', 'elasticnet', 'l1'), # Default l2
                    'clf__alpha': (0.0001, 0.0009), # Default 0.0001
                    #'clf__fit_intercept': (True, False), # Default True
                    'clf__n_iter': (5, 50, 25), # Default 1 or 5 depending, optional
                    #'clf__random_state': (0, 42), # Default None
                    'clf__epsilon': (0.01, 0.005)} # Default 0.01, depends on classifier (loss)
        # Use all cores to create a grid search
        classifierGS = GridSearchCV(self.pipeline, parameters, n_jobs=-1)
        # Fit the CS estimator for use as a classifier
        classifierGS = classifierGS.fit(self.tweets, self.labels)
        # Get the scores using the GS classifier
        bestParam, score, _ = max(classifierGS.grid_scores_, key=lambda x: x[1])
        # Print the parameter values
        for param_name in sorted(parameters.keys()):
            print("%s: %r" % (param_name,bestParam[param_name]))
        # Print the classifier score
        print("Classifier score: " + str(score) + "\n")
        # End of func return statement
        return
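The '<step>__<parameter>' keys in the grid above follow GridSearchCV's convention for addressing parameters of individual pipeline steps. Below is a small self-contained sketch of the same idea; the step names and values here are assumptions, not the pipeline used by this class.

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV  # sklearn.model_selection in newer releases

demo_pipeline = Pipeline([('vect', CountVectorizer()),
                          ('tfidf', TfidfTransformer()),
                          ('clf', SGDClassifier())])
demo_grid = {'vect__ngram_range': [(1, 1), (1, 2)],  # parameter of the 'vect' step
             'clf__alpha': (0.0001, 0.001)}          # parameter of the 'clf' step
demo_search = GridSearchCV(demo_pipeline, demo_grid, n_jobs=-1)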
Example #16
    def score_nestedCV(self, G1, model, param_grid, effect, nested):
        k_fold = cross_validation.KFold(n=self.Y.shape[0], n_folds=self.n_folds, indices=True)
        i_fold=0
        scores = sp.zeros(self.n_folds)
        params = list()

        for train, test in k_fold:
            (trainData, trainY) = self._packData(G1, train, effect)
            (testData, testY) = self._packData(G1, test, effect)

            if nested:
                clf = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs = self.n_jobs_grid,
                                   cv=self.n_folds_params, scoring=self.scoring, verbose=self.verbose)

                clf.fit(trainData, trainY.flatten())

                params.append(clf.best_params_)

                scores[i_fold] = clf.score(testData, testY.flatten(), method_scorer=False)
            else:

                model.fit(trainData, trainY.flatten())
                scores[i_fold] = SCORERS[self.scoring](model, testData, testY.flatten())
            i_fold+=1

        return scores,params
def estimateParameters(X_train, X_test, y_train, y_test):

    tuned_parameters = [{'kernel': ['rbf'], \
                         'gamma': [1e-3, 1e-4], \
                         'C': [1, 10, 100, 1000]}, \
                        {'kernel': ['linear'], \
                         'C': [1, 10, 100, 1000]}]

    scores = ['precision', 'recall']
    for score in scores:

        print("# Tuning hyper-parameters for %s\n" % score)

        clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5, scoring=score)
        clf.fit(X_train, y_train)

        print("Best parameters set found on development set:\n")
        print(clf.best_estimator_)

        print("\nGrid scores on development set:\n")
        for params, mean_score, scores in clf.grid_scores_:
            print("%.3f (+/-%.03f) for %r" % (mean_score, scores.std() / 2, params))

        print("\nDetailed classification report:")
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        y_true, y_pred = y_test, clf.predict(X_test)
        print(classification_report(y_true, y_pred))
        print()
def make_grid_search(pipeline, parameters, model_name, params):
    print model_name
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=4, verbose=3,
                               #loss_func=f1_score,
                               scoring="f1",
                               iid=False,
                               refit=True)
    #model_name = "ExtraTree_min_sample2_10trees_gridcv_desc_log"

    print("Performing grid search...")
    print("pipeline:", pipeline) # [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(features, salaries_enc)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_params_
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    best_estimator = pipeline.set_params(**best_parameters)
    params = params + " ", grid_search.grid_scores_
    dio.save_model(best_estimator, model_name, mae_cv=grid_search.best_score_, parameters=params)
    print grid_search.grid_scores_
    prediction = grid_search.predict(validation_features)
    dio.save_prediction(model_name, prediction, "valid_classes")
Example #19
def separable_demo():
    """ Generate a linearly-separable dataset D, train a linear SVM on
    D, then output the resulting decision boundary on a figure.
    """
    from sklearn.datasets import make_blobs
    X, y = make_blobs(n_samples=200, n_features=2, 
                      centers=((0,0), (4, 4)),
                      cluster_std=1.0)
    plot_data(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    svc = svm.SVC(class_weight='auto')
    param_grid = {'kernel': ['linear'],
                  'C': [1e0, 1e1, 1e2, 1e3, 1e4]}
    strat_2fold = StratifiedKFold(y_train, k=2)
    print "    Parameters to be chosen through cross validation:"
    for name, vals in param_grid.iteritems():
        if name != 'kernel':
            print "        {0}: {1}".format(name, vals)
    clf = GridSearchCV(svc, param_grid, n_jobs=1, cv=strat_2fold)
    clf.fit(X_train, y_train)
    print "== Best Parameters:", clf.best_params_
    y_pred = clf.predict(X_test)
    acc = len(np.where(y_pred == y_test)[0]) / float(len(y_pred))
    print "== Accuracy:", acc
    print classification_report(y_test, y_pred)
    plot_svm(clf.best_estimator_, X, y, X_test, y_test, 
             title="SVM Decision Boundary, Linear Kernel ({0} accuracy, C={1})".format(acc, clf.best_params_['C']))
Example #20
def classification_level_RandForest_pipeline(classifications_DF):
   X = classifications_DF.iloc[:,3:89]
   #assign the target (session length) to y and convert to int
   y_actual = classifications_DF.iloc[:,2:3].astype(float)

   #scaling the data for feature selection
   X_scaled = preprocessing.scale(X)

   X_scaled_train, X_scaled_test, y_actual_train, y_actual_test = train_test_split(X_scaled, y_actual, test_size=0.3, random_state=0)

 # Maybe some original features were good, too?
   selectKbest = SelectKBest(k=1,score_func=f_regression)

   # Build estimator from PCA and Univariate selection:
   X_features = selectKbest.fit(X_scaled_train,y_actual_train).transform(X_scaled_train)
   
   randomForestReg = RandomForestRegressor(n_estimators=1, criterion='mse')

   # Do grid search over k, n_components and SVR parameters:
   pipeline = Pipeline([('selectKbest', selectKbest),('randomForestReg',randomForestReg)])

   tuned_params = dict(selectKbest__k=[5,10,20,30,40,50,80],
                       randomForestReg__n_estimators=[1,2,4,8,16,32,64],
                       randomForestReg__min_samples_split=[2,3,5,10,20])

   grid_search = GridSearchCV(pipeline, param_grid=tuned_params,scoring='mean_squared_error',cv=3,verbose=10)
   grid_search.fit(X_scaled_train, y_actual_train['session_length'].values)
   print(grid_search.best_estimator_)
   y_true, y_pred = y_actual_test['session_length'].values,grid_search.best_estimator_.predict(X_scaled_test)
   print "Mean squared error:"+str(mean_squared_error(y_true,y_pred))
   pd.DataFrame(y_true, y_pred).to_csv("randomForestReg_pred_true.csv")
Example #21
def classification_level_SGDReg_pipeline(classifications_DF):
   X = classifications_DF.iloc[:,3:89]
   #assign the target (session length) to y and convert to int
   y_actual = classifications_DF.iloc[:,2:3].astype(float)

   #scaling the data for feature selection
   X_scaled = preprocessing.scale(X)

   X_scaled_train, X_scaled_test, y_actual_train, y_actual_test = train_test_split(X_scaled, y_actual, test_size=0.5, random_state=0)

   pca_selection = PCA(n_components=2)

   X_features = pca_selection.fit(X_scaled_train).transform(X_scaled_train)

   SGDReg = SGDRegressor(alpha=0.0001)

   # Do grid search over k, n_components and SVR parameters:
   pipeline = Pipeline([('pca', pca_selection),('SGDReg',SGDReg)])

   tuned_params = dict(pca__n_components=[5,30,40,50],
                     SGDReg__alpha=[0.1,0.01,0.001,0.0001,0.00001],
                     SGDReg__l1_ratio=[.05, .15, .5, .7, .9, .95, .99, 1],
                     SGDReg__penalty=['l2','l1','elasticnet'])

   grid_search = GridSearchCV(pipeline, param_grid=tuned_params,scoring='mean_squared_error',cv=3,verbose=10)
   grid_search.fit(X_scaled_train, y_actual_train['session_length'].values)
   print(grid_search.best_estimator_)
   y_true, y_pred = y_actual_test['session_length'].values,grid_search.best_estimator_.predict(X_scaled_test)
   print "Mean squared error:"+str(mean_squared_error(y_true,y_pred))
   pd.DataFrame(y_true, y_pred).to_csv("SGDReg_pred_true.csv")
Example #22
def grid_search(X, y):
    '''
    cross validated grid search using Ridge Regressor and Random
    Forest Regressor
    '''

    nids = df_subset.index
    titles = df_subset['title']

    pars = {'alpha': [0.8, 0.6, 0.5, 0.45, 0.4, 0.2, 0.1,
                      0.08, 0.07, 0.06, 0.05, 0.04, 0.03, 0.02]}

    gs = GridSearchCV(Ridge(), pars, cv=5)
    gs.fit(X, y)

    ridge = gs.best_estimator_
    dill.dump(ridge, open('ridge.pkl', 'wb'))

    pars = {'max_depth': [5, 8, 10, 20, 50, 100],
            'min_samples_split': [2, 3, 5, 10, 20]}

    gs = GridSearchCV(RFR(n_estimators=100, random_state=42, n_jobs=2),
                      pars, cv=5)
    gs.fit(X, y)
    rfr = gs.best_estimator_
    dill.dump(rfr, open('rfr.pkl', 'wb'))
    return ridge, rfr
Example #23
def run_random_forest(training_features, training_labels, test_features, test_labels, passed_parameters=None):

    estimator = ensemble.RandomForestRegressor(random_state=0, n_estimators=25)

    # set up parameters for the classifier
    if passed_parameters is None:
        parameters = {"max_depth": None}
    else:
        parameters = passed_parameters

    # create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0], n_iter=5, test_size=0.2, random_state=0)

    # set up tuning algorithm
    regressor = GridSearchCV(estimator=estimator, cv=cv, param_grid=parameters)

    # fit the classifier
    regressor.fit(training_features, training_labels)

    test_prediction = regressor.predict(test_features)
    test_accuracy = regressor.score(test_features, test_labels)

    time_2 = time.time()

    return test_prediction, test_accuracy
Example #24
    def run_linear_open_experiment(self, iterations=10, save=False):
        """
        Train a classifier on training data, obtain the best combination of
        parameters through a grid search cross-validation and test the
        classifier using a open-world split of the dataset. The results
        from the number of iterations are saved as pz files.

        :param iterations: number of runs (training/testing)
        :param save: save predictions and labels if True
        """
        self.true_labels = np.array([])
        self.predictions = np.array([])
        for i in xrange(iterations):
            self.randomize_dataset_open_world()
            clf = GridSearchCV(svm.LinearSVC(), {'C': np.logspace(-3, 3, 7)})
            clf.fit(self.X_train, self.Y_train)
            out = clf.best_estimator_.decision_function(self.X_test)
            classes = clf.best_estimator_.classes_
            for scores in out:
                m = np.max(scores)
                if (abs(m/scores[:][:]) < 0.5).any():
                    self.predictions = np.append(self.predictions, 99)
                else:
                    p = classes[np.where(scores==m)]
                    self.predictions = np.append(self.predictions, p)
            self.true_labels = np.append(self.true_labels, self.Y_test)

        if save:
            pz.save(self.predictions, "mca_predictions_open.pz")
            pz.save(self.true_labels, "mca_true_labels_open.pz")
def fit_predict_model(city_data):
    """Find and tune the optimal model. Make a prediction on housing data."""

    # Get the features and labels from the Boston housing data
    X, y = city_data.data, city_data.target

    # Setup a Decision Tree Regressor
    regressor = DecisionTreeRegressor()

    parameters = {'max_depth':(1,2,3,4,5,6,7,8,9,10)}

    ###################################
    ### Step 4. YOUR CODE GOES HERE ###
    ###################################

    # 1. Find the best performance metric
    # should be the same as your performance_metric procedure
    # http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
    dtr_scorer = make_scorer(mean_squared_error, greater_is_better=False)

    # 2. Use grid search to fine-tune the Decision Tree Regressor and find the best model
    # http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html#sklearn.grid_search.GridSearchCV
    reg = GridSearchCV(regressor, parameters, scoring=dtr_scorer, cv=6)

    # Fit the learner to the training data
    print "Final Model: "
    print reg.fit(X, y)
    print "Best estimator choosen by GridSearchCV: ", reg.best_estimator_
    
    # Use the model to predict the output of a particular sample
    x = [11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13]
    y = reg.predict(x)
    print "House: " + str(x)
    print "Prediction: " + str(y)
def grid_search_model(clf_factory, X, Y,save_file="read/best_param.txt"):
    u"""最適なパラメータを調べる
        Args:
            clf_factory:機械学習モデル
            X:特徴量
            Y:ラベル
        Returns:
            clf:最も良かったモデル
    """
    stopwords=load_stopwords_old()
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, random_state=0)
    param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
                      vect__min_df=[1, 2],
                      vect__stop_words=[None, stopwords],
                      vect__smooth_idf=[False, True],
                      vect__use_idf=[False, True],
                      vect__sublinear_tf=[False, True],
                      vect__binary=[False, True],
                      )
    grid_search = GridSearchCV(clf_factory(),
                              param_grid=param_grid,
                              cv=cv,
                              verbose=10)
    grid_search.fit(X, Y)
    clf = grid_search.best_estimator_
    write_to_text(grid_search.best_params_,save_file)

    return clf
Example #27
    def run_linear_experiment(self, rocs_filename, iterations=10):
        """
        Run a classification experiment over several iterations.
        In each iteration the data is randomized, a linear SVM classifier
        is trained and evaluated using cross-validation over the
        cost parameter in the range np.logspace(-3, 3, 7). The best
        classifier is used for testing, and a ROC curve is computed
        and saved both as a property and locally.

        :param rocs_filename: the file to save all rocs computed
        :param iterations: number of runs (training/testing)
        """
        for i in xrange(iterations):
            print "[*] Iteration {0}".format(i)
            print "[*] Randomizing dataset..."
            self.randomize_dataset()
            clf = GridSearchCV(svm.LinearSVC(), {'C': np.logspace(-3, 3, 7)})
            print "[*] Training..."
            clf.fit(self.X_train, self.Y_train)
            out = clf.best_estimator_.decision_function(self.X_test)
            print "[*] Testing..."
            roc = eval.compute_roc(np.float32(out.flatten()),
                                   np.float32(self.Y_test))
            self.rocs.append(roc)
            print "[*] ROC saved."
        pz.save(self.rocs, rocs_filename)
Example #28
def run_support_vector_regressor(
    training_features, training_labels, test_features, test_labels, passed_parameters=None
):

    estimator = svm.SVR()

    # set up parameters for the classifier
    if passed_parameters is None:
        parameters = {"kernel": ["linear"]}
    else:
        parameters = passed_parameters

    # create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0], n_iter=5, test_size=0.2, random_state=0)

    # set up tuning algorithm
    regressor = GridSearchCV(estimator=estimator, cv=cv, param_grid=parameters)

    # fit the classifier
    regressor.fit(training_features, training_labels)

    test_prediction = regressor.predict(test_features)
    test_accuracy = regressor.score(test_features, test_labels)

    time_2 = time.time()

    return test_prediction, test_accuracy
Example #29
def test_krr_regP():
    
    dim = 5
    n = 1000
    ntest = 1001    

    pref = np.random.random(size=dim) - 0.5

    #pref /= np.sqrt(pref.dot(pref))

    Xtrain = np.random.random((n, dim)) + 1.0
    ytrain = Xtrain.dot(pref) + np.random.normal(scale=0.05, size=n) + 10.0

    Xtest = np.random.random((ntest, dim)) + 1.0
    yref = Xtest.dot(pref) + 10.0

    krr = kRidgeRegression(kernel=Linear(), eta=1.0)

    gs = GridSearchCV(krr, {'eta' : [0, 1E-16, 1E-14, 1E-12, 1E-10, 1E-8, 1E-6, 1E-4, 1E-2, 1]})

    gs.fit(Xtrain, ytrain)

    krr = gs.best_estimator_

    ytest = krr.transform(Xtest).flatten()
    print krr.beta.shape
    print krr.Ku.shape

    print krr.score(Xtest, yref)
Example #30
def MyGridSearch(X, y):
    kfold = cross_validation.KFold(len(X), 5)
    for train, test in kfold:
        #X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.5, random_state=0)
        #parameters = {'kernel': ('linear', 'rbf'), 'C': [1.5, 10]}
        #parameters = {'kernel': ['rbf'], 'gamma': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9], 'epsilon': [0.1],
        #              'C': [1, 5, 10, 50, 100, 500, 1000, 5000, 10000]}
        #parameters = {'kernel': ['poly'], 'gamma': [1e-2, 1e-3, 1e-4], 'epsilon': [0.1], 'degree': [3],
        #              'C': [50, 100, 500, 1000]}
        parameters = {'kernel': ['rbf'], 'gamma': [1e-5], 'epsilon': [0.2],
                      'C': [100000]}
        #parameters = [{'C': sp.stats.expon(scale=100), 'gamma': sp.stats.expon(scale=.1),
        #               'kernel': ['rbf'], 'class_weight': ['auto', None]}]
        model = svm.SVR()

        grid = GridSearchCV(model, parameters)
        #grid = RandomizedSearchCV(model, parameters)
        grid.fit(X[train], y[train])
        #print grid
        predictions = grid.predict(X[test])
        print grid.best_score_
        if grid.best_score_ > 0.98:
            return grid
        #print grid.best_estimator_.coef_
    return grid
Example #31
#use PCA to lower the dimensionality
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
pca = PCA(n_components=150, whiten=True, random_state=42, svd_solver = 'randomized')
svc = SVC(kernel='rbf', class_weight='balanced')
model = make_pipeline(pca, svc)

#split the data
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(faces.data, faces.target, random_state=42)

#grid search - CV to explore parameters C (margin of hardness) and gamma (size of radial basis function kernel)
from sklearn.grid_search import GridSearchCV
param_grid = {'svc__C': [1, 5, 10, 50], 'svc__gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model, param_grid)

#run the search and time it
%time grid.fit(Xtrain, ytrain)
print(grid.best_params_)

#predict
model = grid.best_estimator_
yfit = model.predict(Xtest)

fig, ax = plt.subplots(4, 6)
for i, axi in enumerate(ax.flat):
    axi.imshow(Xtest[i].reshape(62, 47), cmap='bone')
    axi.set(xticks=[], yticks=[])
    axi.set_ylabel(faces.target_names[yfit[i]].split()[-1],
    color='black' if yfit[i] == ytest[i] else 'red')
Example #32
def cv_optimize(clf, parameters, Xtrain, ytrain, n_folds=5):
    gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds)
    gs.fit(Xtrain, ytrain)
    print("BEST PARAMS", gs.best_params_)
    best = gs.best_estimator_
    return best
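A minimal usage sketch for cv_optimize; the dataset, classifier, and grid are illustrative assumptions.

from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

iris_demo = load_iris()
best_knn = cv_optimize(KNeighborsClassifier(),
                       {'n_neighbors': [1, 3, 5, 7]},
                       iris_demo.data, iris_demo.target,
                       n_folds=5)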
Example #33
# gsearch4 = GridSearchCV(estimator = estimator, param_grid = param_test4,n_jobs=1,iid=False, cv=5)
# gsearch4.fit(df_train[predictors],df_train[targetname])
# print(gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_)

# subsample = 0.8
param_test5 = {'subsample': [0.8, 0.85, 0.9, 0.95, 0.1]}
estimator = GradientBoostingClassifier(n_estimators=50,
                                       learning_rate=0.1,
                                       min_samples_split=1,
                                       max_depth=5,
                                       min_samples_leaf=50,
                                       random_state=10,
                                       max_features=4)
gsearch5 = GridSearchCV(estimator=estimator,
                        param_grid=param_test5,
                        n_jobs=1,
                        iid=False,
                        cv=5)
gsearch5.fit(df_train[predictors], df_train[targetname])
print(gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_)

# proportionally decrease the learning rate and increase the number of trees - it looks like the original classic settings are still best

##gbm_tuned_0 = GradientBoostingClassifier(learning_rate=0.1, n_estimators=50,max_depth=5, min_samples_split=1,min_samples_leaf=50, subsample=0.8, random_state=10, max_features=4)
##modelfit(gbm_tuned_0, df_train, predictors, targetname, performCV=True, printFeatureImportance=True, cv_folds=5)

##gbm_tuned_1 = GradientBoostingClassifier(learning_rate=0.05, n_estimators=100,max_depth=5, min_samples_split=1,min_samples_leaf=50, subsample=0.8, random_state=10, max_features=4)
##modelfit(gbm_tuned_1, df_train, predictors, targetname, performCV=True, printFeatureImportance=True, cv_folds=5)

##gbm_tuned_2 = GradientBoostingClassifier(learning_rate=0.2, n_estimators=25,max_depth=5, min_samples_split=1,min_samples_leaf=50, subsample=0.8, random_state=10, max_features=4)
##modelfit(gbm_tuned_2, df_train, predictors, targetname, performCV=True, printFeatureImportance=True, cv_folds=5)
Example #34
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print "done in %0.3fs" % (time() - t0)

###############################################################################
# Train a SVM classification model

print "Fitting the classifier to the training set"
t0 = time()
param_grid = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
# for sklearn version 0.16 or prior, the class_weight parameter value is 'auto'
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf = clf.fit(X_train_pca, y_train)
print "done in %0.3fs" % (time() - t0)
print "Best estimator found by grid search:"
print clf.best_estimator_

###############################################################################
# Quantitative evaluation of the model quality on the test set

print "Predicting the people names on the testing set"
t0 = time()
y_pred = clf.predict(X_test_pca)
print "done in %0.3fs" % (time() - t0)

print classification_report(y_test, y_pred, target_names=target_names)
print confusion_matrix(y_test, y_pred, labels=range(n_classes))
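The class_weight comment above refers to a rename in the scikit-learn API ('auto' became 'balanced' in 0.17). A hedged sketch of picking the right spelling at runtime; the version check is an assumption about how one might handle it, not part of the original example.

from distutils.version import LooseVersion
import sklearn

legacy = LooseVersion(sklearn.__version__) < LooseVersion('0.17')
clf_demo = GridSearchCV(SVC(kernel='rbf',
                            class_weight='auto' if legacy else 'balanced'),
                        param_grid)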
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV

iris = sb.load_dataset('iris')
iris = pd.DataFrame(iris)
sb.pairplot(iris)
plt.show()
sb.jointplot(x='sepal_width',
             y='sepal_length',
             data=iris,
             kind='kde',
             color='red')
plt.show()
X = iris.drop('species', axis=1)
y = iris['species']
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=101)
plot1 = SVC()
plot1.fit(X_train, y_train)
pred1 = plot1.predict(X_test)
print(classification_report(y_test, pred1))
dic = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [1000, 100, 10, 1, 0.1]}
plot2 = GridSearchCV(SVC(), dic, verbose=3)
plot2.fit(X_train, y_train)
pred2 = plot2.predict(X_test)
print(classification_report(y_test, pred2))
Example #36
    'kernel': ['poly'],
    'degree': [3]
}, {
    'C': [1, 10, 100, 1000],
    'gamma': [0.001, 0.0001],
    'kernel': ['poly'],
    'degree': [4]
}]

scores = ['precision']  # you can alter this by adding, for example, 'recall'

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(C=1), tuned_parameters)
    clf.fit(x_train, t_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_estimator_)
    print()
    print("Grid scores on development set:")
    print()
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %s" %
              (mean_score, scores.std() / 2, params))
    print()

print(time.time() - start_time)
Example #37
run_gs = False

if run_gs:
    parameter_grid = {
        'n_estimators': [100, 200, 300, 400, 500],
        'max_depth': range(1, 9, 2),
        'learning_rate': [0.0001, 0.001, 0.01, 0.1],
        'subsample': [0.5, 0.6, 0.7, 0.8, 1.0],
        'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 1.0],
        'colsample_bylevel': [0.5, 0.6, 0.7, 0.8, 1.0],
    }
    xgb = XGBClassifier()
    cross_validation = StratifiedKFold(targets, n_folds=5)

    grid_search = GridSearchCV(xgb,
                               scoring='accuracy',
                               param_grid=parameter_grid,
                               cv=cross_validation)

    grid_search.fit(train, targets)
    model = grid_search
    parameters = grid_search.best_params_

    print('Best score: {}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))
else:
    parameters = {'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.1,
                  'subsample': 0.6, 'colsample_bytree': 1.0, 'colsample_bylevel': 0.5}

    seed = 7
    test_size = 0.33
    X_train, X_test, y_train, y_test = train_test_split(train, targets, test_size=test_size, random_state=seed)
import seaborn as sns
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV

# ## The Data
# [Iris flower data set](http://en.wikipedia.org/wiki/Iris_flower_data_set).
# The data set consists of 50 samples from each of three species of Iris (Iris setosa, Iris virginica and Iris versicolor), so 150 total samples. Four features were measured from each sample: the length and the width of the sepals and petals, in centimeters.

iris = sns.load_dataset('iris')

## # Model Selection
X = iris.drop('species', axis=1)
y = iris['species']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
svc_model = SVC()
svc_model.fit(X_train, y_train)

# ## Model Evaluation
pred = svc_model.predict(X_test)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

#Using Gridsearch for better SVM parameters
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001]}
grid = GridSearchCV(SVC(), param_grid, verbose=2)

grid.fit(X_train, y_train)

grid_predictions = grid.predict(X_test)
print(confusion_matrix(y_test, grid_predictions))
print(classification_report(y_test, grid_predictions))
test_classifier(LogisticRegression(), imputed_data, features_list, folds=1000)

#Note that for decision trees and random forests, the results are always different
#This is because different partitions of decision boundaries are used

#answer question from Understanding the dataset and question
#Try GridSearchCV and answer why parameter tuning is important
#addresses why validation is important, talk about precision and recall
from sklearn.svm import SVC

#Tuning SVM
print "\n Support vector machines"
parameters = {'C': [0.001, 0.01, 0.1, 1, 10], 'gamma': [0.001, 0.01, 0.1, 1]}
sv = SVC(kernel='rbf')
#cv = StratifiedShuffleSplit(test_size = 0.2, random_state = 42)
clf_svc = GridSearchCV(sv, parameters, scoring="f1")
fit_classify_scaled(clf_svc, list_all_features)
#clf_svc.fit(features_train, labels_train)
sv_params = clf_svc.best_params_
print sv_params

#Going through the StratifiedShuffleSplit and then GridSearchCV did not give better results
#clf2 = GridSearchCV(sv, parameters, scoring = "f1")
#test_classifier(clf2, imputed_data, features_list, folds = 1000)

#Tuning Decision trees
from sklearn.tree import DecisionTreeClassifier
print "\n Tuning Decision Trees"
parameters = {
    'max_depth': [None, 10, 5, 2],
    'min_samples_split': [2, 10, 5],
clf = clf.fit(features_train, labels_train)
from tester import test_classifier
test_classifier(clf, data_dict, features_list)

# ### Tuning with the GridSearchCV function

# In[24]:

from sklearn.grid_search import GridSearchCV
t0 = time()
param_grid = {
    'min_samples_split': [2, 3, 4, 5, 6, 7, 8],
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8],
    'max_features': range(3, 7)
}
clf = GridSearchCV(DecisionTreeClassifier(), param_grid, scoring='f1')
clf = clf.fit(features_train, labels_train)
print "done in %0.3fs" % (time() - t0)
print "Best estimator found by grid search:"
clf = clf.best_estimator_
from tester import test_classifier
test_classifier(clf, data_dict, features_list)

# ### Using GridSearchCV for parameter tuning: after several runs the selected parameters are class_weight=None, criterion='gini', max_depth=7, max_features=3, max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort=False, random_state=None, splitter='best'. The final DT algorithm is configured with the parameters found by GridSearchCV.
#

# In[25]:

clf = DecisionTreeClassifier(class_weight=None,
                             criterion='gini',
                             max_depth=7,
Example #41
'''batch_size = [j for j in range(2,30,2)]
epochs = [i for i in range(1,10,1)]
param_grid = dict(batch_size=batch_size, nb_epoch=epochs)'''
neurons = [i for i in range(1, 151, 10)]
param_grid = dict(neurons=neurons)
'''
optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
param_grid = dict(optimizer=optimizer)'''
'''
init_mode = ['uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform']
param_grid = dict(init_mode=init_mode)'''
'''activation = ['softmax', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear']
param_grid = dict(activation=activation)'''
'''weight_constraint = [1, 2, 3, 4, 5]
dropout_rate = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
param_grid = dict(dropout_rate=dropout_rate, weight_constraint=weight_constraint)'''
grid = GridSearchCV(estimator=model,
                    param_grid=param_grid,
                    n_jobs=-1,
                    scoring='accuracy',
                    cv=10)
grid_result = grid.fit(data, mark)
# summarize results
print("Best: %f using %s" %
      (grid_result.best_score_, grid_result.best_params_))
for params, mean_score, scores in grid_result.grid_scores_:
    print("%f (%f) with: %r" % (scores.mean(), scores.std(), params))
'''
model.fit(x_train, y_train, epochs=50, batch_size=10,verbose=0)
loss_and_metrics = model.evaluate(x_test, y_test,verbose=0)
print ('/n','----------',loss_and_metrics[1])'''
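The grid searches above assume `model` is a scikit-learn-compatible wrapper around a Keras network. A hedged sketch of how such a wrapper is commonly built, using the old keras.wrappers.scikit_learn API; the architecture, input size, and build function are assumptions, not the original author's network.

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier

def build_demo_net(neurons=10):
    # Keyword arguments of the build function (e.g. neurons) become grid-searchable parameters.
    net = Sequential()
    net.add(Dense(neurons, input_dim=8, activation='relu'))
    net.add(Dense(1, activation='sigmoid'))
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net

demo_model = KerasClassifier(build_fn=build_demo_net, verbose=0)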
classesNameCol = 'ClassStr'
ratutils.populateClumpsWithClassTraining(outputClumps, classesDict, tmpPath,
                                         classesIntCol, classesNameCol)
rsgislib.classification.classratutils.balanceSampleTrainingRandom(
    outputClumps, classesIntCol, 'classesIntColBal', 50,
    5000)  # balance the training data
classesIntCol = 'classesIntColBal'

##############################################################################################
# use grid search to define the classifier
variables = [
    'VVMin', 'VHMin', 'VVdivVHMin', 'VVMax', 'VHMax', 'VVdivVHMax', 'VVAvg',
    'VHAvg', 'VVdivVHAvg', 'VVStd', 'VHStd', 'VVdivVHStd'
]
classParameters = {'n_estimators': [10, 100, 500], 'max_features': [2, 3, 4]}
gsearch = GridSearchCV(ExtraTreesClassifier(bootstrap=True), classParameters)
classifier = classratutils.findClassifierParameters(outputClumps,
                                                    classesIntCol,
                                                    variables,
                                                    preProcessor=None,
                                                    gridSearch=gsearch)

# define the output colours
classColours = dict()
classColours['Other'] = [212, 125, 83]
classColours['Water'] = [157, 212, 255]
classColours['VegWater'] = [191, 255, 0]
classColours['Unclassified'] = [0, 0, 0]

##############################################################################################
# run the classification
Example #43
    plt.ylabel('Feature Importance')
    plt.show()


clf = GradientBoostingClassifier(random_state=15)
print(hackathon_GBC_model(clf, train_data, features))

estimators = [x for x in range(10, 131, 10)]
first_tune = {'n_estimators': estimators}
first_search = GridSearchCV(estimator=GradientBoostingClassifier(
    learning_rate=0.05,
    min_samples_split=700,
    min_samples_leaf=70,
    max_depth=8,
    max_features='sqrt',
    subsample=0.8,
    random_state=15,
),
                            param_grid=first_tune,
                            scoring='roc_auc',
                            n_jobs=6,
                            iid=False,
                            cv=5)

first_search.fit(train_data[features], train_data["Class"])

print(first_search.grid_scores_, first_search.best_params_,
      first_search.best_score_)

min_split = [x for x in range(300, 1101, 100)]
depth = [x for x in range(5, 15, 1)]
second_tune = {'max_depth': depth, 'min_samples_split': min_split}
Example #44
def svmModel(filenameL, filenameU, output):
	tweets = []
	for line in open(filenameL, "r").readlines():
		tweet = json.loads(line)
		tweets.append([tweet[0], tweet[1].lower().strip()])
 
    # Extract the vocabulary of keywords
	vocab = dict()
	for class_label, text in tweets:
		for term in text.split():
			term = term.lower()
			if len(term) > 2 and term not in stopwords:
				if vocab.has_key(term):
					vocab[term] = vocab[term] + 1
				else:
					vocab[term] = 1
 
    # Remove terms whose frequencies are less than 15
	vocab = {term: freq for term, freq in vocab.items() if freq > 15}
    # Generate an id starting from 0 for each term in vocab
	vocab = {term: idx for idx, (term, freq) in enumerate(vocab.items())}
	print vocab
 
    # Generate X and y
	X = []
	y = []
	for class_label, text in tweets:
		x = [0] * len(vocab)
		terms = [term for term in text.split() if len(term) > 2]
		for term in terms:
			if vocab.has_key(term):
				x[vocab[term]] += 1
		y.append(class_label)
		X.append(x)
 
    # 10 folder cross validation to estimate the best w and b
	svc = svm.SVC(kernel='linear')
	Cs = range(1, 20)
	clf = GridSearchCV(estimator=svc, param_grid=dict(C=Cs), cv = 10)
	clf.fit(X, y)
	
# predict the class labels of new tweets
	print clf.predict(X)
	tweets = []
	for line in open(filenameU).readlines():
		tweets.append(line)
 
# Generate X for testing tweets
	X = []
	for text in tweets:
		x = [0] * len(vocab)
		terms = [term for term in text.split() if len(term) > 2]
		for term in terms:
			if vocab.has_key(term):
				x[vocab[term]] += 1
		X.append(x)
	y = clf.predict(X)
	
    #write all positive tweets to the output file
	f = open(output, "a")
	for idx in range(0, len(tweets)):
		if(y[idx] == 1):
			print 'Sentiment Class (1 means positive; 0 means negative): ', y[idx]
			print 'TEXT: ', idx, tweets[idx]
			labeledTweet = [y[idx], json.loads(tweets[idx])]
			f.write(str(labeledTweet) + "\n")
    if best_err is None or best_err > val_err:
        best_l2 = l2
        best_err = val_err
print 'best ' + str(best_l2) + ' and ' + str(best_err)
plt.plot(l2_penalty[2:12], total_val[2:12], 'k-')
plt.xlabel('$L_2$ penalty')
plt.ylabel('K-fold cross validation error')
plt.xscale('log')
plt.yscale('log')

model_python = RidgeCV(l2_penalty)
model_python.fit(data15[my_features], data15['price'])
'''
rss=[]
for alpha in alphas:
    model=Ridge(alpha)
    model.fit(training[my_features],training['price'])
    pred=model.predict(testing)
    val_err=sum((pred-training['price'])**2)
    rss.append(val_err)

plt.plot(alphas,rss,'k-')
plt.xlabel('$\L2_penalty$')
plt.ylabel('K-fold cross validation error')
plt.xscale('log')
plt.yscale('log')'''

tuned_parameters = {'alpha': l2_penalty}
model2 = GridSearchCV(Ridge(), tuned_parameters, cv=10)
model2.fit(data15[my_features], data15['price'])
def my_svm():
    tweets = []
    for line in open('data.txt').readlines()[:500]:
        items = line.split(',')
        tweets.append([int(items[0]), items[1].lower().strip()])

    # Extract the vocabulary of keywords
    vocab = dict()
    for class_label, text in tweets:
        for term in text.split():
            term = term.lower()
            if len(term) > 2 and term not in stopwords:
                if vocab.has_key(term):
                    vocab[term] = vocab[term] + 1
                else:
                    vocab[term] = 1

    # Remove terms whose frequencies are less than a threshold (e.g., 10)
    vocab = {term: freq for term, freq in vocab.items() if freq > 10}
    # Generate an id (starting from 0) for each term in vocab
    vocab = {term: idx for idx, (term, freq) in enumerate(vocab.items())}
    print "******Features*******"
    print vocab

    # Generate X and y
    X = []
    y = []
    for class_label, text in tweets:
        x = [0] * len(vocab)
        terms = [term for term in text.split()]
        for term in terms:
            if vocab.has_key(term):
                x[vocab[term]] += 1
        y.append(class_label)
        X.append(x)


    print "The total number of training tweets: {} ({} positives, {}: negatives)".format(len(y), sum(y), len(y) - sum(y))


    # 10 folder cross validation to estimate the best w and b
    svc = svm.SVC(kernel='linear')
    Cs = range(1, 20)
    clf = GridSearchCV(estimator=svc, param_grid=dict(C=Cs), cv = 10)
    clf.fit(X, y)

    print "The estimated w: "
    print clf.best_estimator_.coef_

    print "The estimated b: "
    print clf.best_estimator_.intercept_

    print "The estimated C after the grid search for 10 fold cross validation: "
    print clf.best_params_

    print "Accuracy "
    print str(clf.best_score_)

    # predict the class labels of new tweets
    tweets = []
    for line in open('tJumanji.txt').readlines():
        tweets.append(line)

    # Generate X for testing tweets
    test_X = []
    for text in tweets:
        x = [0] * len(vocab)
        terms = [term for term in text.split() if len(term) > 2]
        for term in terms:
            if vocab.has_key(term):
                x[vocab[term]] += 1
        test_X.append(x)
    test_y = clf.predict(test_X)

    c = 0
    fl = open("p_Jumanji.txt","w")
    fl_n = open("n_Jumanji.txt","w")
    for text in tweets:
        if(test_y[c]==1):
            fl.write(text)
        else:
            fl_n.write(text)
        c+=1


    print "The total number of testing tweets: {} ({} are predicted as positives, {} are predicted as negatives)".format(len(test_y), sum(test_y), len(test_y) - sum(test_y))
y = y.flatten()
# x.shape=(150,4)   y.shape=(150,)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    random_state=1,
                                                    train_size=0.6)

pipeline = Pipeline([('clf', SVC(kernel='rbf', gamma=0.01, C=100))])
parameters = {
    'clf__gamma': (0.01, 0.03, 0.1, 0.3, 1, 1.3),
    'clf__C': (0.1, 0.3, 1, 3, 10, 30, 40),
}

grid_search = GridSearchCV(pipeline,
                           parameters,
                           n_jobs=-1,
                           verbose=1,
                           scoring='accuracy',
                           refit=True)
grid_search.fit(x_train, y_train)

print('Best score: %0.3f' % grid_search.best_score_)
print('Best parameter set:')
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))

predictions = grid_search.predict(x_test)
print("-------------------")
print(classification_report(y_test, predictions))

print(accuracy_score(y_test, predictions))
Example #48
n_train = len(y_train)
n_test = len(y_sts_val)

param_grid = LARGE_PARAM_GRID


def score_stub(true, pred):
    return correlation(true, postprocess(test_input_val, pred))


# grid cv on test held
cv_split = [(range(n_train), range(n_train, n_train + n_test))]
grid = GridSearchCV(SVR(),
                    param_grid,
                    cv=cv_split,
                    verbose=1,
                    n_jobs=N_JOBS,
                    scoring=make_scorer(correlation))
grid.fit(vstack([X_train, X_sts13_val]), hstack([y_train, y_sts_val]))

held_cv_score = grid.best_score_
held_cv_params = grid.best_params_

print held_cv_score
print held_cv_params

regressor = SVR(**held_cv_params)
regressor.fit(X_train, y_train)

y_test = regressor.predict(X_sts13_held)
# y_test = postprocess(test_input_held,  y_test)
Example #49
    def predict(self, X, y=None):
        try:
            getattr(self, "treshold_")
        except AttributeError:
            raise RuntimeError(
                "You must train classifer before predicting data!")

        return ([self._meaning(x) for x in X])

    def score(self, X, y=None):
        # counts number of values bigger than mean
        return (sum(self.predict(X)))


from sklearn.grid_search import GridSearchCV
from sklearn.utils.estimator_checks import check_estimator
check_estimator(MeanClassifier)  # passes
trainJZ = [i for i in range(0, 150, 5)]
testJZ = [i + 3 for i in range(-5, 5, 5)]
tuned_params = {"intValue": [-10, -1, 0, 1, 10]}

gs = GridSearchCV(MeanClassifier(), tuned_params)

# for some reason I have to pass y with same shape
# otherwise gridsearch throws an error. Not sure why.
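# (Reason: GridSearchCV splits X and y into cross-validation folds together, so both must contain the same number of samples.)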

y = [1 for i in range(len(trainJZ))]
gs.fit(trainJZ, y)

print gs.best_params_  # {'intValue': -10} # and that is what we expect :)
Example #50
def run():
    # Read dataset
    author_ids = []
    labels = []
    for line in open('learning/dataset/similarity_training_ids', 'r'):
        vals = map(lambda x: int(x), line.strip().split('\t'))
        author_ids.append((vals[0], vals[1]))
        labels.append(vals[2])

    all_papers_model = WordCount()
    all_papers_model.load('models/everything.pkl')
    contributors_model = WordCount()
    contributors_model.load('models/contributors.pkl')
    sc = SimilarityClassifier(languageModel=all_papers_model,
                              contributorsModel=contributors_model)

    author_data_cache = dict()

    def getAuthorData(sc, id):
        try:
            return author_data_cache[id]
        except KeyError:
            x = sc.getDataById(id)
            author_data_cache[id] = x
            return x

    recompute = False
    if recompute:
        print("Computing features")
        features = []
        for (idA, idB) in author_ids:
            f = sc.computeFeatures(getAuthorData(sc, idA),
                                   getAuthorData(sc, idB))
            features.append(f)
        print("Writing features back")
        outf = open('learning/dataset/author-features', 'w')
        for i in range(len(labels)):
            print('\t'.join(map(lambda x: str(x), features[i])), file=outf)

        outf.close()
    else:
        inf = open('learning/dataset/author-features', 'r')
        features = []
        for line in inf:
            f = map(lambda x: float(x), line.strip().split('\t'))
            features.append(f)
        inf.close()

    if True:

        def custom_scorer(estimator, X, y):
            conf = estimator.confusion(X, y)
            print(conf)
            return -conf[1][0]

        param_grid = [{
            'positiveSampleWeight': [0.001, 0.005, 0.01, 0.02, 0.05]
        }]
        cv = GridSearchCV(sc,
                          param_grid,
                          cv=5,
                          scoring=custom_scorer,
                          iid=False)
        cv.fit(features, labels)
        print("Grid scores:")
        print(cv.grid_scores_)

        sc = cv.best_estimator_

        def paper_url(pk):
            print('http://beta.ens.dissem.in/paper/' +
                  str(Author.objects.get(pk=pk).paper_id))

        print("Curious papers")
        pubSc = sc.simFeatures[1]
        for i in range(len(labels)):
            prediction = sc.classifier.predict(features[i])[0]
            if labels[i] == 0 and prediction == 1:
                print("#####")
                paper_url(author_ids[i][0])
                paper_url(author_ids[i][1])
                #print("Explanation")
                #pubSc.compute(Author.objects.get(pk=author_ids[i][0]),
                #        Author.objects.get(pk=author_ids[i][1]), explain=True)

        print(sc.confusion(features, labels))
        #sc.plotClassification(features, labels)
        sc.save('models/similarity.pkl')
Example #51
0
def run_k_nearest_neighbors(training_features,
                            training_labels,
                            test_features,
                            test_labels,
                            passed_parameters=None):
    """
    Classifies the data using sklearn's k nearest neighbors classifier

    Parameters
    ----------
        training_features, training_labels: data and labels used to train the classifier
        test_features, test_labels: data and labels used to evaluate the classifier
        passed_parameters: (optional) parameter grid to tune; defaults to n_neighbors 1-10, both weightings and p in {1, 2}
    
    Returns
    -------
        prediction: predicted labels of the test data
        accuracy: percent of test data labels accurately predicted
    """

    time_1 = time.time()

    estimator = neighbors.KNeighborsClassifier()

    #set up parameters for the classifier
    if (passed_parameters is None):
        parameters = {
            'n_neighbors': range(1, 11),
            'weights': ['uniform', 'distance'],
            'p': [1, 2]
        }
    else:
        parameters = passed_parameters

    #create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0],
                      n_iter=5,
                      test_size=0.2,
                      random_state=0)

    #plot the validation curves
    for param in parameters:
        if (is_number(parameters[param][0])):
            title = 'Validation Curves \n(kNN)'
            save_name = "Validation Curves - kNN - %s.png" % param
            plot_validation_curve(estimator, training_features,
                                  training_labels, title, param,
                                  parameters[param], cv)
            pylab.savefig(os.path.join(results_location, save_name))

    #set up tuning algorithm
    classifier = GridSearchCV(estimator=estimator,
                              cv=cv,
                              param_grid=parameters)

    #fit the classifier
    classifier.fit(training_features, training_labels)

    test_prediction = classifier.predict(test_features)
    test_accuracy = classifier.score(test_features, test_labels)

    time_2 = time.time()

    #show the best result
    estimator = neighbors.KNeighborsClassifier(
        n_neighbors=classifier.best_estimator_.n_neighbors,
        weights=classifier.best_estimator_.weights,
        algorithm=classifier.best_estimator_.algorithm,
        leaf_size=classifier.best_estimator_.leaf_size,
        p=classifier.best_estimator_.p,
        metric=classifier.best_estimator_.metric)

    #plot the learning curve
    title = 'Learning Curves \n(k-NN, k-neighbors=%i weights=%s algorithm=%s leaf size=%i p=%i )' % (
        classifier.best_estimator_.n_neighbors,
        classifier.best_estimator_.weights,
        classifier.best_estimator_.algorithm,
        classifier.best_estimator_.leaf_size, classifier.best_estimator_.p)
    plot_learning_curve(estimator,
                        title,
                        training_features,
                        training_labels,
                        cv=cv)
    pylab.savefig(os.path.join(results_location, 'Learning Curves - kNN.png'))
    #plt.show()

    time_3 = time.time()

    #output time stats
    #time 1 -> time 2 is optimization time
    #time 2 -> time 3 is run for just one case
    print("kNN Time Stats")
    print("Optimization Time -> %f" % (time_2 - time_1))
    print("Single Run Time -> %f" % (time_3 - time_2))

    #output classification report and confusion matrix
    print('\n\n----------------------------')
    print('Classification Report')
    print('----------------------------\n')
    print(classification_report(y_true=test_labels, y_pred=test_prediction))

    print('\n\n----------------------------')
    print('Confusion Matrix')
    print('----------------------------\n')
    print(confusion_matrix(y_true=test_labels, y_pred=test_prediction))

    return test_prediction, test_accuracy
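
# A possible call site for run_k_nearest_neighbors (a sketch only -- the data
# arrays and the custom grid below are hypothetical):
#
#     knn_params = {'n_neighbors': range(1, 6), 'weights': ['uniform', 'distance']}
#     prediction, accuracy = run_k_nearest_neighbors(train_X, train_y, test_X, test_y,
#                                                    passed_parameters=knn_params)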
Example #52
0
def initialize(filename,
               labels_train,
               typetoread,
               toexclude=None,
               n_estimators=None,
               estimators_to_test=None,
               class_weight=None):
    """
    Takes in features and labels pertaining to a tag and fits and returns a
    TfidfVectorizer, SelectPercentile, and RandomForestClassifier
    :param filename: The base file location where information about the dataset
     can be found.
    :param labels_train: The labels to use when classifying.
    :param typetoread: The features list to use ("Use" or "Description")
    :param toexclude: A list of indices of the features list to exclude from
    classification. Useful to exclude values known to be positive or
    negative without classifier use. If not given, assumes all features are
    valid.
    :param n_estimators: The number of trees to use in the Random Forest
    Classifier as per the sklearn documentation. If not given, GridSearchCV
    will select between 50, 150, and 250.
    :param estimators_to_test: A list of different numbers of estimators to
    test using GridSearch CV as per the sklearn documentation. If not given,
    GridSearchCV will select between 50, 150, and 250.
    :param class_weight: The weightings to use for the various classes as
    per the sklearn documentation. If not given, all classes have equal weight
    :return forest: A fitted RandomForestVectorizer.
    :return vectorizer: A fitted TfidfVectorizer.
    :return selector: A fitted Selector at 10%.
    """

    features_train = pickle.load(
        open(
            os.path.abspath("../DataFiles/" + filename + "features" +
                            typetoread), "rb"))
    labels_train = pd.Series(labels_train)
    if toexclude:
        features_train = pd.Series(
            np.delete(np.array(features_train), toexclude, axis=0))
    print("Creating Vectorizer")
    vectorizer = TfidfVectorizer(stop_words="english",
                                 max_df=.5,
                                 ngram_range=(1, 3))
    print("Fitting Vectorizer")
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_train = None
    print("Creating Selector")
    selector = SelectKBest(k=18000)
    print("Fitting Selector")
    selector.fit(features_train_transformed, labels_train)
    print("Transforming data")
    features_train_transformed_selected = selector.transform(
        features_train_transformed)
    features_train_transformed = None
    features_train_transformed_selected = features_train_transformed_selected.toarray()
    print("Creating Forest")
    if not n_estimators:
        forest = RandomForestClassifier(min_samples_leaf=2,
                                        class_weight=class_weight)
        if not estimators_to_test:
            parameters = {
                "n_estimators": [50, 150, 250],
            }
        else:
            parameters = {
                "n_estimators": estimators_to_test,
            }
        forest = GridSearchCV(forest, parameters)
    else:
        forest = RandomForestClassifier(n_estimators=n_estimators,
                                        min_samples_leaf=2,
                                        class_weight=class_weight)
    print("Fitting Forest")
    forest.fit(features_train_transformed_selected, labels_train)
    return forest, vectorizer, selector
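
# A possible call site for initialize() (a sketch only -- the file name, label list
# and feature text below are hypothetical):
#
#     forest, vectorizer, selector = initialize("issues2016", labels_train, "Description",
#                                               estimators_to_test=[100, 200])
#     X_new = selector.transform(vectorizer.transform(["free-text description to classify"]))
#     print(forest.predict(X_new.toarray()))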
Example #53
0
def run_support_vector_machines(training_features,
                                training_labels,
                                test_features,
                                test_labels,
                                passed_parameters=None):
    """
    Classifies the data using sklearn's support vector machine classifier

    Parameters
    ----------
        training_features, training_labels: data and labels used to train the classifier
        test_features, test_labels: data and labels used to evaluate the classifier
        passed_parameters: (optional) parameter grid to tune; may include 'kernel' values such as 'linear', 'poly', 'rbf', 'sigmoid' or 'precomputed'
    
    Returns
    -------
        prediction: predicted labels of the test data
        accuracy: percent of test data labels accurately predicted
    """

    time_1 = time.time()

    estimator = svm.SVC()

    #set up parameters that will be used by all kernels
    if (passed_parameters is None):
        parameters = {'C': [1e0, 5e0, 1e1, 5e1]}
    else:
        parameters = passed_parameters

    #create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0],
                      n_iter=5,
                      test_size=0.2,
                      random_state=0)

    #plot the validation curves
    for param in parameters:
        if (is_number(parameters[param][0])):
            title = 'Validation Curves'
            save_name = "Validation Curves - SVC - %s.png" % param
            plot_validation_curve(estimator, training_features,
                                  training_labels, title, param,
                                  parameters[param], cv)
            pylab.savefig(os.path.join(results_location, save_name))

    #set up tuning algorithm
    classifier = GridSearchCV(estimator=estimator,
                              cv=cv,
                              param_grid=parameters)

    #fit the classifier
    classifier.fit(training_features, training_labels)

    test_prediction = classifier.predict(test_features)
    test_accuracy = classifier.score(test_features, test_labels)

    time_2 = time.time()

    #show the best result
    estimator = svm.SVC(kernel=classifier.best_estimator_.kernel,
                        C=classifier.best_estimator_.C,
                        gamma=classifier.best_estimator_.gamma,
                        degree=classifier.best_estimator_.degree)

    #plot the learning curve
    title = 'Learning Curves (SVM, kernel=%s degree=%i gamma=%f C=%i )' % (
        classifier.best_estimator_.kernel, classifier.best_estimator_.degree,
        classifier.best_estimator_.gamma, classifier.best_estimator_.C)
    plot_learning_curve(estimator,
                        title,
                        training_features,
                        training_labels,
                        cv=cv)
    save_file_name = 'Learning Curves - SVM.png'
    pylab.savefig(os.path.join(results_location, save_file_name))
    #plt.show()

    time_3 = time.time()

    if (classifier.best_estimator_.kernel == 'linear'):
        coefficients = classifier.best_estimator_.coef_
        print('\n\n-----------------------')
        print(' Coefficients')
        print(coefficients)

    #output time stats
    #time 1 -> time 2 is optimization time
    #time 2 -> time 3 is run for just one case
    print("SVM Time Stats")
    print("Optimization Time -> %f" % (time_2 - time_1))
    print("Single Run Time -> %f" % (time_3 - time_2))

    #output classification report and confusion matrix
    print('\n\n----------------------------')
    print('Classification Report')
    print('----------------------------\n')
    print(classification_report(y_true=test_labels, y_pred=test_prediction))

    print('\n\n----------------------------')
    print('Confusion Matrix')
    print('----------------------------\n')
    print(confusion_matrix(y_true=test_labels, y_pred=test_prediction))

    return test_prediction, test_accuracy
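
# The default grid above tunes only C with SVC's default RBF kernel.  To search
# over kernels as the docstring suggests, a caller could pass a grid such as this
# (a sketch; the value ranges are illustrative):
#
#     svm_params = {'kernel': ['linear', 'rbf'],
#                   'C': [1e0, 5e0, 1e1, 5e1],
#                   'gamma': [1e-3, 1e-2, 1e-1]}
#     run_support_vector_machines(train_X, train_y, test_X, test_y,
#                                 passed_parameters=svm_params)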
Example #54
0
def run_boosting(training_features,
                 training_labels,
                 test_features,
                 test_labels,
                 passed_parameters=None):
    """
    Classifies the data using sklearn's AdaBoost classifier
    Does not natively support pruning so max_depth is being used for the decision tree

    Parameters
    ----------
        training_features, training_labels: data and labels used to train the classifier
        test_features, test_labels: data and labels used to evaluate the classifier
        passed_parameters: (optional) parameter grid to tune; base_estimator__max_depth limits the depth of the underlying decision tree (simulates pruning)
    
    Returns
    -------
        prediction: predicted labels of the test data
        accuracy: percent of test data labels accurately predicted
    """
    time_1 = time.time()

    #set up underlying decision tree classifier
    base_classifier = tree.DecisionTreeClassifier()

    #set up the boosting method
    estimator = ensemble.AdaBoostClassifier(base_estimator=base_classifier)

    #set up parameters for the classifier
    parameters = {
        'base_estimator__max_depth': range(1, 5),
        'n_estimators': range(10, 500, 50),
        'learning_rate': [.25, .5, .75, 1.0]
    }
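    # The 'base_estimator__' prefix follows sklearn's double-underscore convention for
    # nested estimators: max_depth is tuned on the AdaBoost ensemble's underlying
    # decision tree, while n_estimators and learning_rate are tuned on the booster itself.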

    #create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0],
                      n_iter=5,
                      test_size=0.2,
                      random_state=0)

    #plot the validation curves
    for param in parameters:
        if (is_number(parameters[param][0])):
            title = 'Validation Curves \n(AdaBoost)'
            save_name = "Validation Curves - AdaBoost - %s.png" % param
            plot_validation_curve(estimator, training_features,
                                  training_labels, title, param,
                                  parameters[param], cv)
            pylab.savefig(os.path.join(results_location, save_name))

    #set up parameters for the classifier
    if (passed_parameters is None):
        parameters = {
            'base_estimator__max_depth': range(1, 3),
            'n_estimators': range(5, 51, 5),
            'learning_rate': [1.0]
        }
    else:
        parameters = passed_parameters

    #set up tuning algorithm
    classifier = GridSearchCV(estimator=estimator,
                              cv=cv,
                              param_grid=parameters)

    #fit the classifier
    classifier.fit(training_features, training_labels)

    #get the prediction and accuracy of the test set
    test_prediction = classifier.predict(test_features)
    test_accuracy = classifier.score(test_features, test_labels)

    time_2 = time.time()

    #graph the best result
    base_classifier = tree.DecisionTreeClassifier(
        max_depth=classifier.best_estimator_.base_estimator_.max_depth)
    estimator = ensemble.AdaBoostClassifier(
        base_estimator=base_classifier,
        n_estimators=classifier.best_estimator_.n_estimators,
        learning_rate=classifier.best_estimator_.learning_rate)

    #plot the learning curve
    title = 'Learning Curves (AdaBoost - Decision Tree)\n max_depth=%i estimators=%i learning_rate=%f' % (
        classifier.best_estimator_.base_estimator_.max_depth,
        classifier.best_estimator_.n_estimators,
        classifier.best_estimator_.learning_rate)
    plot_learning_curve(estimator,
                        title,
                        training_features,
                        training_labels,
                        cv=cv)
    pylab.savefig(
        os.path.join(results_location,
                     'Learning Curves - AdaBoost - Decision Tree.png'))

    time_3 = time.time()

    #fit the best estimator
    estimator.fit(training_features, training_labels)

    #plot the learning curve by number of estimators
    plot_adaclassifier(estimator, classifier.best_estimator_.n_estimators,
                       training_features, test_features, training_labels,
                       test_labels)
    pylab.savefig(
        os.path.join(results_location,
                     'Estimator Curves - AdaBoost - Decision Tree.png'))

    #output time stats
    #time 1 -> time 2 is optimization time
    #time 2 -> time 3 is run for just one case
    print("Decision Tree Time Stats")
    print("Optimization Time -> %f" % (time_2 - time_1))
    print("Single Run Time -> %f" % (time_3 - time_2))

    #output classification report and confusion matrix
    print('\n\n----------------------------')
    print('Classification Report')
    print('----------------------------\n')
    print(classification_report(y_true=test_labels, y_pred=test_prediction))

    print('\n\n----------------------------')
    print('Confusion Matrix')
    print('----------------------------\n')
    print(confusion_matrix(y_true=test_labels, y_pred=test_prediction))

    return test_prediction, test_accuracy
Example #55
0
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)  # split the dataset
    pipe_scv = Pipeline([('scl', StandardScaler()),
                         ('clf', SVC(random_state=1))])

    param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
    param_grid = [{
        'clf__C': param_range,
        'clf__kernel': ['linear']
    }, {
        'clf__C': param_range,
        'clf__gamma': param_range,
        'clf__kernel': ['rbf']
    }]
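
    # Parameter names for a Pipeline follow the '<step name>__<parameter>' pattern,
    # hence 'clf__C', 'clf__gamma' and 'clf__kernel' for the step registered as 'clf'
    # above.  The full list of valid names can be inspected with:
    # print(sorted(pipe_scv.get_params().keys()))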

    gs = GridSearchCV(estimator=pipe_scv,
                      param_grid=param_grid,
                      scoring='accuracy',
                      cv=10,
                      n_jobs=-1)

    gs = gs.fit(X_train, y_train)
    print(gs.best_score_)
    print(gs.best_params_)

    # retrieve the best model and evaluate it on the test set
    clf = gs.best_estimator_
    clf.fit(X_train, y_train)
    test_accuracy = clf.score(X_test, y_test)
    print(test_accuracy)
Example #56
0
def run_decision_tree(training_features,
                      training_labels,
                      test_features,
                      test_labels,
                      passed_parameters=None,
                      headings=None):
    """
    Classifies the data using sklearn's decision tree 
    Does not natively support pruning so max_depth is being used

    Parameters
    ----------
        training_features, training_labels: data and labels used to train the classifier
        test_features, test_labels: data and labels used to evaluate the classifier
        passed_parameters: (optional) parameter grid to tune; max_depth simulates pruning
        headings: (optional) feature names used when exporting the tree visualization
    
    Returns
    -------
        prediction: predicted labels of the test data
        accuracy: percent of test data labels accurately predicted
    """

    time_1 = time.time()

    estimator = tree.DecisionTreeClassifier()

    #set up parameters for the classifier
    if (passed_parameters is None):
        parameters = {'max_depth': [None]}
    else:
        parameters = passed_parameters

    #create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0],
                      n_iter=5,
                      test_size=0.2,
                      random_state=0)

    #plot the validation curves
    for param in parameters:
        if (is_number(parameters[param][0])):
            title = 'Validation Curves \n(Decision Tree)'
            save_name = "Validation Curves - Decision Tree - %s.png" % param
            plot_validation_curve(estimator, training_features,
                                  training_labels, title, param,
                                  parameters[param], cv)
            pylab.savefig(os.path.join(results_location, save_name))

    #set up tuning algorithm
    classifier = GridSearchCV(estimator=estimator,
                              cv=cv,
                              param_grid=parameters)

    #fit the classifier
    classifier.fit(training_features, training_labels)

    test_prediction = classifier.predict(test_features)
    test_accuracy = classifier.score(test_features, test_labels)

    time_2 = time.time()

    #show the best result
    estimator = tree.DecisionTreeClassifier(
        max_depth=classifier.best_estimator_.max_depth,
        criterion=classifier.best_estimator_.criterion)
    estimator.fit(training_features, training_labels)

    #plot the learning curve
    title = 'Learning Curves \n(Decision Tree, max depth=%i)' % classifier.best_estimator_.max_depth
    plot_learning_curve(estimator,
                        title,
                        training_features,
                        training_labels,
                        cv=cv)
    pylab.savefig(
        os.path.join(results_location, 'Learning Curves - Decision Tree.png'))
    #plt.show()

    #save the visualization of the decision tree only use the top 5 levels for now
    tree_data = StringIO()
    tree.export_graphviz(estimator,
                         out_file=tree_data,
                         max_depth=5,
                         feature_names=headings)
    graph = pydot.graph_from_dot_data(tree_data.getvalue())
    graph.write_pdf(os.path.join(results_location, "Decision Tree Model.pdf"))

    time_3 = time.time()

    #output time stats
    #time 1 -> time 2 is optimization time
    #time 2 -> time 3 is run for just one case
    print("Decision Tree Time Stats")
    print("Optimization Time -> %f" % (time_2 - time_1))
    print("Single Run Time -> %f" % (time_3 - time_2))

    #output classification report and confusion matrix
    print('\n\n----------------------------')
    print('Classification Report')
    print('----------------------------\n')
    print(classification_report(y_true=test_labels, y_pred=test_prediction))

    print('\n\n----------------------------')
    print('Confusion Matrix')
    print('----------------------------\n')
    print(confusion_matrix(y_true=test_labels, y_pred=test_prediction))

    return test_prediction, test_accuracy
Example #57
0
    t_train = df[i].str.split('/', expand=True, n=1).astype(float)
    X_train = np.c_[((t_train.values)[:, 0] +
                     ((t_train.values)[:, 1] - 1988) * 12) -
                    ((d_train.values)[:, 0] +
                     ((d_train.values)[:, 1] - 1988) * 12), X_train]
    X_test = np.c_[((t_test.values)[:, 0] +
                    ((t_test.values)[:, 1] - 1988) * 12) -
                   ((d_test.values)[:, 0] +
                    ((d_test.values)[:, 1] - 1988) * 12), X_test]

imputer = Imputer()
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

clf = GradientBoostingClassifier()
param_grid = dict(n_estimators=[800],
                  max_depth=[8],
                  max_features=[0.3],
                  learning_rate=[0.1],
                  min_samples_split=[600],
                  min_samples_leaf=[40],
                  subsample=[1.],
                  random_state=[1])
grid = GridSearchCV(clf, param_grid=param_grid, cv=5, scoring='roc_auc')
grid.fit(X_train, y_train)
print("Best score %f" % grid.best_score_)
#score = cross_validation.cross_val_score(clf, X_train, y_train, cv=3, scoring='roc_auc')

Y_predict = grid.predict_proba(X_test)
np.savetxt('y_predDaniel.txt', Y_predict, fmt='%s')
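
# predict_proba returns one column per class; if only the positive-class probability
# is needed (e.g. for a ROC-AUC style submission), Y_predict[:, 1] would typically be
# saved instead of both columns.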
Example #58
0
def run_neural_net(training_features, training_labels, test_features,
                   test_labels):
    """
    Classifies the data using pybrain's neural net

    Parameters
    ----------
        training_features, training_labels: data and labels used to train the classifier
        test_features, test_labels: data and labels used to evaluate the classifier
    
    Returns
    -------
        prediction: predicted labels of the test data
        accuracy: percent of test data labels accurately predicted
    """

    time_1 = time.time()

    #set the number of classes in the data
    number_of_outputs = training_labels.astype(int).max() + 1
    number_of_inputs = training_features.shape[1]

    #determine optimal hidden nodes based on Huang et al. (2003)
    first_layer_nodes = int(
        math.sqrt((number_of_outputs + 2) * number_of_inputs) +
        2 * math.sqrt(number_of_inputs / (number_of_outputs + 2)))
    second_layer_nodes = int(number_of_outputs *
                             math.sqrt(number_of_inputs /
                                       (number_of_outputs + 2)))
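
    # i.e. for n inputs and m output classes the heuristic above is
    #     N_hidden1 = sqrt((m + 2) * n) + 2 * sqrt(n / (m + 2))
    #     N_hidden2 = m * sqrt(n / (m + 2))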

    #set up the layers
    input_layer = mlp_nn.Layer("Linear", units=number_of_inputs)
    hidden_layer1 = mlp_nn.Layer("Sigmoid", units=first_layer_nodes)
    hidden_layer2 = mlp_nn.Layer("Sigmoid", units=second_layer_nodes)
    output_layer = mlp_nn.Layer("Softmax", units=number_of_outputs)
    layers = [input_layer, hidden_layer1, hidden_layer2, output_layer]

    #set up the classifier
    neural_net = mlp_nn.Classifier(layers=layers, learning_rate=0.02, n_iter=5)

    #set up tuning parameters
    parameters = {"learning_rate": [0.02], "n_iter": [1, 5, 10, 25, 50]}

    #create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0],
                      n_iter=5,
                      test_size=0.2,
                      random_state=0)

    #set up tuning algorithm
    classifier = GridSearchCV(estimator=neural_net,
                              cv=cv,
                              param_grid=parameters)

    classifier.fit(training_features, training_labels)

    test_prediction = classifier.predict(test_features)
    test_accuracy = classifier.score(test_features, test_labels)

    time_2 = time.time()

    graph_title = "Learning Curves \n(Neural Net, learning rate=%f)" % classifier.best_estimator_.learning_rate
    plot_learning_curve_iter(classifier, graph_title)
    pylab.savefig(
        os.path.join(results_location, 'Validator Curves - Neural Net.png'))

    time_3 = time.time()

    #output time stats
    #time 1 -> time 2 is optimization time
    #time 2 -> time 3 is run for just one case
    print("Neural Net Time Stats")
    print("Optimization Time -> %f" % (time_2 - time_1))
    print("Single Run Time -> %f" % (time_3 - time_2))

    #output classification report and confusion matrix
    print('\n\n----------------------------')
    print('Classification Report')
    print('----------------------------\n')
    print(classification_report(y_true=test_labels, y_pred=test_prediction))

    print('\n\n----------------------------')
    print('Confusion Matrix')
    print('----------------------------\n')
    print(confusion_matrix(y_true=test_labels, y_pred=test_prediction))

    return test_prediction, test_accuracy
Example #59
0
# At this point, we have two options:

# 1. use more training data, to overcome low model complexity
# 2. use a more complex (lower bias) model to start with, to get more out of the existing data

params = {
    'tfidf__use_idf': (True, False),
    'bow__analyzer': (split_into_lemmas, split_into_tokens),
}

grid = GridSearchCV(
    pipeline,  # pipeline from above
    params,  # parameters to tune via cross validation
    refit=True,  # fit using all available data at the end, on the best found param combination
    n_jobs=-1,  # number of cores to use for parallelization; -1 for "all cores"
    scoring='accuracy',  # what score are we optimizing?
    cv=StratifiedKFold(label_train,
                       n_folds=5),  # what type of cross validation to use
)
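
# Because refit=True, the fitted grid object can be used directly for prediction:
# predict / predict_proba below delegate to the best estimator, refitted on all of
# msg_train.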

#% time
nb_detector = grid.fit(msg_train, label_train)
print(nb_detector.grid_scores_)

print(nb_detector.predict_proba(["Hi mom, how are you?"])[0])
print(nb_detector.predict_proba(["WINNER! Credit for free!"])[0])

print(nb_detector.predict(["Hi mom, how are you?"])[0])
print(nb_detector.predict(["WINNER! Credit for free!"])[0])
Example #60
0
    data_cls = np.asarray(data_cls)
    data_pln = np.asarray(data_pln)

    X = np.vstack([data_cls, data_pln])
    y = np.concatenate([np.zeros(len(data_cls)), np.ones(len(data_pln))])

    cv = StratifiedKFold(y, n_folds=6, shuffle=True)

    cv_params = {
        "learning_rate": np.arange(0.1, 1.1, 0.1),
        'n_estimators': np.arange(1, 80, 2)
    }

    grid = GridSearchCV(AdaBoostClassifier(),
                        cv_params,
                        scoring='accuracy',
                        cv=cv,
                        n_jobs=1,
                        verbose=1)
    grid.fit(X, y)
    ada_cv = grid.best_estimator_

    scores = cross_val_score(ada_cv, X, y, cv=cv)
    scores_all[k, :] = scores

    # save the classifier
    joblib.dump(
        ada_cv,
        source_folder + "graph_data/sk_models/eigen_ada_pln_%s.plk" % band)

np.save(source_folder + "graph_data/eigen_scores_all_ada_pln.npy", scores_all)