Example #1
def getClfScore(classifier, features, labels, cv):
    '''Evaluate the performance of each estimator.

    param:
        classifier : dict mapping classifier name to estimator
        features   : data to fit
        labels     : target labels
        cv         : cross-validation iterator
    return:
        test_score : dict mapping classifier name to mean precision/recall
    '''
    
    test_score = {}

    for clfname in sorted(classifier):
        clf_score = {}
        clf = classifier[clfname]
        precision = cross_val_score(clf, features, labels, scoring='precision', cv=cv)
        recall    = cross_val_score(clf, features, labels, scoring='recall', cv=cv)
        
        clf_score['precision'] = np.mean(precision)
        clf_score['recall']    = np.mean(recall)
        
        test_score[clfname] = clf_score
    return test_score
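A minimal usage sketch for getClfScore; the classifier dict, synthetic data, and StratifiedKFold settings below are illustrative assumptions, not part of the original example:

# Hypothetical driver for getClfScore; data and classifiers are assumptions.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

features, labels = make_classification(n_samples=200, n_features=10, random_state=0)
classifiers = {'naive_bayes': GaussianNB(),
               'decision_tree': DecisionTreeClassifier(random_state=0)}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

test_score = getClfScore(classifiers, features, labels, cv)
for name, score in sorted(test_score.items()):
    print('%s: precision=%.3f, recall=%.3f'
          % (name, score['precision'], score['recall']))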
Example #2
    def cv(self, parameters, scoring="roc_auc"):
        """ Evaluate score by cross validation. """

        X = self.data.values.astype(np.float)
        y = self.label.values

        print cross_val_score(self.estimator, X, y, scoring=scoring, cv=3)
Example #3
def test_cross_val_score_fit_params():
    clf = MockClassifier()
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))

    DUMMY_INT = 42
    DUMMY_STR = "42"
    DUMMY_OBJ = object()

    def assert_fit_params(clf):
        # Check that non-array values are passed through unchanged to the
        # classifier's fit arguments

        assert_equal(clf.dummy_int, DUMMY_INT)
        assert_equal(clf.dummy_str, DUMMY_STR)
        assert_equal(clf.dummy_obj, DUMMY_OBJ)

    fit_params = {
        "sample_weight": np.ones(n_samples),
        "class_prior": np.ones(n_classes) / n_classes,
        "sparse_sample_weight": W_sparse,
        "sparse_param": P_sparse,
        "dummy_int": DUMMY_INT,
        "dummy_str": DUMMY_STR,
        "dummy_obj": DUMMY_OBJ,
        "callback": assert_fit_params,
    }
    cval.cross_val_score(clf, X, y, fit_params=fit_params)
Example #5
def test_cross_val_score_with_score_func_regression():
    X, y = make_regression(n_samples=30, n_features=20, n_informative=5,
                           random_state=0)
    reg = Ridge()

    # Default score of the Ridge regression estimator
    scores = cval.cross_val_score(reg, X, y, cv=5)
    assert_array_almost_equal(scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)

    # R2 score (aka. coefficient of determination) - should be the
    # same as the default estimator score
    r2_scores = cval.cross_val_score(reg, X, y, scoring="r2", cv=5)
    assert_array_almost_equal(r2_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)

    # Mean squared error; this is a loss function, so "scores" are negative
    mse_scores = cval.cross_val_score(reg, X, y, cv=5,
                                      scoring="mean_squared_error")
    expected_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99])
    assert_array_almost_equal(mse_scores, expected_mse, 2)

    # Explained variance
    with warnings.catch_warnings(record=True):
        ev_scores = cval.cross_val_score(reg, X, y, cv=5,
                                         score_func=explained_variance_score)
    assert_array_almost_equal(ev_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)
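The sign convention in the MSE block above can be checked directly; a hedged sketch against current scikit-learn, where the scorer is spelled 'neg_mean_squared_error' (the deprecated alias 'mean_squared_error' used in the test behaves the same way):

# Loss-style scorers come back negated so that greater is always better.
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

X, y = make_regression(n_samples=30, n_features=20, n_informative=5,
                       random_state=0)
mse_scores = cross_val_score(Ridge(), X, y, cv=5,
                             scoring='neg_mean_squared_error')
assert (mse_scores <= 0).all()  # losses are negated
print(mse_scores)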
Example #6
    def fit_from_prep(self, infile):
        H, y, w = self._da.load_from_file(infile)
        self._vq = VQ(w, hist=w.shape[0])
        self._cl.fit(H, y)
        if self._verbose:
            print cross_validation.cross_val_score(
                self._cl, H, y, cv=3).mean()
Example #7
def classifierExperiments(dataset):
	svc = svm.SVC(kernel='linear')

	scores = cross_validation.cross_val_score(svc, dataset.data, dataset.targets, cv=10)

	print "Linear kernel"
	for score in scores:
		print "Score: {0}".format(score)

	print "Mean {0:.2f} +/- {1:.2f}".format(scores.mean(), scores.std()/2)

	svc = svm.SVC(kernel='rbf')

	scores = cross_validation.cross_val_score(svc, dataset.data, dataset.targets, cv=10)

##	print "\nRBF kernel"
##	for score in scores:
##		print "Score: {0}".format(score)
##
##	print "Mean {0:.2f} +/- {1:.2f}".format(scores.mean(), scores.std()/2)

	print "\nDecision trees!"
	clf = tree.DecisionTreeClassifier()
	scores = cross_validation.cross_val_score(clf, dataset.data, dataset.targets, cv=10)

	for score in scores:
		print "Score: {0}".format(score)

	print "Mean {0:.2f} +/- {1:.2f}".format(scores.mean(), scores.std()/2)
Example #8
def test_cross_val_score():
    clf = MockClassifier()
    for a in range(-10, 10):
        clf.a = a
        # Smoke test
        scores = cval.cross_val_score(clf, X, y)
        assert_array_equal(scores, clf.score(X, y))

        # test with multioutput y
        scores = cval.cross_val_score(clf, X_sparse, X)
        assert_array_equal(scores, clf.score(X_sparse, X))

        scores = cval.cross_val_score(clf, X_sparse, y)
        assert_array_equal(scores, clf.score(X_sparse, y))

        # test with multioutput y
        scores = cval.cross_val_score(clf, X_sparse, X)
        assert_array_equal(scores, clf.score(X_sparse, X))

    # test with X as list
    clf = MockListClassifier()
    scores = cval.cross_val_score(clf, X.tolist(), y)

    assert_raises(ValueError, cval.cross_val_score, clf, X, y,
                  scoring="sklearn")
Example #9
def crossValidation():
    data2010, labels2010 = read_tac('2010')

    #classifiers
    gnb = naive_bayes.GaussianNB()
    Svm = svm.SVC(kernel = "linear")
    logReg = linear_model.LogisticRegression()

    GNBscores = cross_validation.cross_val_score(gnb, data2010, labels2010, cv=2)
    SVMscores = cross_validation.cross_val_score(Svm, data2010, labels2010, cv=2)
    logRegscores = cross_validation.cross_val_score(logReg, data2010, labels2010, cv=2)

    print "Results:"
    print "Gaussian Naive Bayes: " 
    print str(GNBscores.mean())
    print "Support Vector Machine: " 
    print str(SVMscores.mean())
    print "Logistic Regression: " 
    print str(logRegscores.mean())

    fh.write("Results:" + "\n")
    fh.write("Gaussian Naive Bayes: "  + "\n")
    fh.write(str(GNBscores.mean()) + "\n")
    fh.write("Support Vector Machine: "  + "\n")
    fh.write(str(SVMscores.mean()) + "\n")
    fh.write("Logistic Regression: "  + "\n")
    fh.write(str(logRegscores.mean()) + "\n")
    fh.write("-------------------------------------------------\n")
    fh.write("\n\n")
Example #10
def get_score(clf, aX, aY, bX, bY, cX, cY, dX, dY):
	'''
	Get the scores for these datasets:
	- Stratified sample
	- All our data
	- group B data
	- group C data
	- group D data

	It was useful to test our best model among other people's data
	'''
	selX, selY = pick_random_values_stratified(aX, aY)
	scores1=cross_validation.cross_val_score(clf, selX, selY, cv=5)
	print "Strat %f" % (scores1.mean())
	scores2=cross_validation.cross_val_score(clf, aX, aY, cv=5)
	print "All %f" % (scores2.mean())
	clf=clf.fit(aX, aY)
	print "Group B",
	if bX is not None and bY is not None:
		print clf.score(bX, bY)
	else:
		print "NaN"
	print "Group C",
	if cX is not None and cY is not None:
		print clf.score(cX, cY)
	else:
		print "NaN"
	print "Group D",
	if dX is not None and dY is not None:
		print clf.score(dX, dY)
	else:
		print "NaN"
Example #11
def importData(datadirectory):
	#categories = ['n','u', 'y']
	categories = ['n', 'y']

	data = load_files(datadirectory,categories=categories, shuffle=True, random_state=42, encoding='latin-1') 
	X_train, X_test, y_train, y_test = cross_validation.train_test_split(data.data, data.target, test_size = 0.4, random_state=0)
	print X_train 
	# count_vect = CountVectorizer()
	# X_train_vec = count_vect.fit_transform(X_train)
	# X_test_vec = count_vect.fit_transform(X_test)
	# clf = svm.SVC(kernel='linear', C=1).fit(X_train_vec, y_train)
	# clf.score(X_test_vec, y_test) 

	text_clf = Pipeline([('vect', TfidfVectorizer()), ('clf', MultinomialNB())])
	#print text_clf.named_steps['clf']
	print str(sum(cross_val_score(text_clf, data.data,data.target ))/3.0) + ' Tfidf NB'
	#array([ 0.62376238,  0.57      ,  0.6122449 ])
	text_clf = Pipeline([('vect', CountVectorizer()),('clf', MultinomialNB()),]) 
	print str(sum(cross_val_score(text_clf, data.data,data.target ))/3.0) + ' CountVec NB'
	#array([ 0.56435644,  0.5       ,  0.57142857])
	clf = Pipeline([('vect', CountVectorizer()), ('svm', LinearSVC())])                        
	print str(sum(cross_val_score(clf, data.data,data.target ))/3.0) + ' CountVec SVM'
	#array([ 0.55445545,  0.48      ,  0.54081633])
	clf = Pipeline([('vect', TfidfVectorizer()), ('svm', LinearSVC())])                    
	print str(sum(cross_val_score(clf, data.data,data.target ))/3.0) + ' Tfidf SVM'
	#array([ 0.62376238,  0.57      ,  0.6122449 ])
	clf_sgdc = Pipeline([('vect', CountVectorizer()),('clf', linear_model.SGDClassifier()),])
	print str(sum(cross_val_score(clf_sgdc, data.data,data.target ))/3.0) + ' SGDC' 
Example #12
def svmByPackageMachineLearning(xList, yList):
	
	'''
	Example:
	
	iris = datasets.load_iris()
	clf = svm.SVC(kernel='linear', C=1)
	score = cross_validation.cross_val_score(clf, iris.data, iris.target, cv=10)
	print(score)
	print("Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))
	'''
	
	#SVM with kernel
	clf_rbf = svm.SVC(decision_function_shape='ovo', C=10.0, kernel='rbf', gamma=5)
	clf_sig = svm.SVC(decision_function_shape='ovo', C=10.0, kernel='sigmoid', gamma=5, coef0=100.0)
	#clf_pol = svm.SVC(decision_function_shape='ovo', C=10.0, kernel='poly', gamma=5, coef0=100.0, degree=4)
	clf_lin = svm.SVC(decision_function_shape='ovo', C=10.0, kernel='linear')

	
	#cross validation 
	score_rbf = cross_validation.cross_val_score(clf_rbf, xList, yList, cv=10)
	score_sig = cross_validation.cross_val_score(clf_sig, xList, yList, cv=10)
	#score_pol = cross_validation.cross_val_score(clf_pol, xList, yList, cv=10)
	score_lin = cross_validation.cross_val_score(clf_lin, xList, yList, cv=10)
	
	
	print("rbf: %0.2f (+/- %0.2f)" % (score_rbf.mean(), score_rbf.std() * 2))
	print("sig: %0.2f (+/- %0.2f)" % (score_sig.mean(), score_sig.std() * 2))
	#print("pol: %0.2f (+/- %0.2f)" % (score_pol.mean(), score_pol.std() * 2))
	print("lin: %0.2f (+/- %0.2f)" % (score_lin.mean(), score_lin.std() * 2))
Example #13
def test_cross_val_score_filter_feature_selection_threshold():

    threshold = 1.0
    scikit_data,scikit_target = dfm.get_expression_scikit_data_target(expression_file, ic50_file,normalized=True,trimmed=True,threshold=None)
    model = classify.construct_svc_model(kernel='linear')
    non_thresholded_test_1 = cv.cross_val_score_filter_feature_selection(model,cv.trim_X_threshold,threshold,scikit_data,scikit_target,cv=5)

    m = classify.construct_svc_model(kernel='linear')
    s_data,s_target = dfm.get_expression_scikit_data_target(expression_file, ic50_file,normalized=True,trimmed=True,threshold=threshold)
    non_thresholded_test_2 = cross_val_score(m,s_data,s_target,cv=5)

    threshold = .05
    scikit_data,scikit_target = dfm.get_expression_scikit_data_target(expression_file, ic50_file,normalized=True,trimmed=True,threshold=None)
    model = classify.construct_svc_model(kernel='linear')
    thresholded_test_1 = cv.cross_val_score_filter_feature_selection(model,cv.trim_X_threshold,threshold,scikit_data,scikit_target,cv=5)

    m = classify.construct_svc_model(kernel='linear')
    s_data,s_target = dfm.get_expression_scikit_data_target(expression_file, ic50_file,normalized=True,trimmed=True,threshold=threshold)
    thresholded_test_2 = cross_val_score(m,s_data,s_target,cv=5)


    #The non-thresholded tests should be the same because if we are not thresholding, it doesn't matter where we perform thresholding
    assert(math.fabs(non_thresholded_test_1.mean() - non_thresholded_test_2.mean()) < .001)

    #The first thresholded test should have lower accuracy because thresholding is done within the cross-validation,
    #which reduces overfitting to the folds and, as a consequence, the reported cross-validation accuracy.
    assert(thresholded_test_1.mean() - thresholded_test_2.mean() < 0)
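The leakage argument in the comments above generalizes beyond the project's cv.cross_val_score_filter_feature_selection helper; a hedged sketch with stock scikit-learn pieces (SelectKBest standing in for the threshold filter, synthetic data as an assumption) shows the same effect:

# Feature selection inside vs. outside cross-validation; sketch only.
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

X, y = make_classification(n_samples=100, n_features=1000, n_informative=5,
                           random_state=0)

# Correct: selection is refit on the training portion of every fold.
inside = Pipeline([('select', SelectKBest(f_classif, k=20)),
                   ('svc', SVC(kernel='linear'))])
inside_scores = cross_val_score(inside, X, y, cv=5)

# Leaky: selection sees all samples, including each fold's test data.
X_leaky = SelectKBest(f_classif, k=20).fit_transform(X, y)
outside_scores = cross_val_score(SVC(kernel='linear'), X_leaky, y, cv=5)

print('inside folds : %.3f' % inside_scores.mean())   # lower, honest estimate
print('outside folds: %.3f' % outside_scores.mean())  # optimistically biased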
Example #14
def training(matrix, Y, SVM):
    """Train and cross-validate a classifier.

    matrix: the training data
    Y: the labels as an array
    SVM: boolean; if True use svm.SVC, otherwise AdaBoostClassifier

    return: dict of cross-validation precision/recall scores
    """

    if SVM:
        classifier = svm.SVC()
    else:
        classifier = AdaBoostClassifier(n_estimators=300)

    precision_micro_scorer = metrics.make_scorer(custom_precision_micro_score)
    precision_macro_scorer = metrics.make_scorer(custom_precision_macro_score)
    recall_micro_scorer = metrics.make_scorer(custom_recall_micro_score)
    recall_macro_scorer = metrics.make_scorer(custom_recall_macro_score)

    precision_micro = cross_val_score(classifier, matrix, Y, cv=10, scoring=precision_micro_scorer)
    precision_macro = cross_val_score(classifier, matrix, Y, cv=10, scoring=precision_macro_scorer)
    recall_micro = cross_val_score(classifier, matrix, Y, cv=10, scoring=recall_micro_scorer)
    recall_macro = cross_val_score(classifier, matrix, Y, cv=10, scoring=recall_macro_scorer)

    return {"micro": (precision_micro, recall_micro), "macro": (precision_macro, recall_macro)}
Example #15
def dofitSVMstd(X_train, Y_train, X_test):
    shape = X_train.shape
    b = []
    for j in range(shape[0]):
        a1 = [np.std(X_train[j, :, i]) for i in range(shape[2])]
        a2 = [getEntropy(list(X_train[j, :, i].astype(int))) for i in range(shape[2])]
        a1.sort(reverse=True)
        a2.sort()
        b.append(a1[0:16] + a2[0:16])

    x1 = np.array(b)
    clf = RandomForestClassifier()
    dummy = clf.fit(x1, Y_train)
    scores = cross_validation.cross_val_score(clf, x1, Y_train)
    p1 = clf.predict(x1)
    shape = X_test.shape
    b = []
    for j in range(shape[0]):
        a1 = [np.std(X_test[j, :, i]) for i in range(shape[2])]
        a2 = [getEntropy(list(X_test[j, :, i].astype(int))) for i in range(shape[2])]
        a1.sort(reverse=True)
        a2.sort()
        b.append(a1[0:16] + a2[0:16])

    x2 = np.array(b)
    y2 = clf.predict(x2)
    xx = np.concatenate((x1, x2))
    yy = np.concatenate((Y_train, y2))
    dummy = clf.fit(xx, yy)
    p2 = clf.predict(x2)
    scores = cross_validation.cross_val_score(clf, x1, Y_train)
    # sum(clf.predict(x2))
    return [scores, np.concatenate((p1, p2))]
Example #16
def test_cross_val_score_allow_nans():
    # Check that cross_val_score allows input data with NaNs
    X = np.arange(200, dtype=np.float64).reshape(10, -1)
    X[2, :] = np.nan
    y = np.repeat([0, 1], X.shape[0] // 2)
    p = Pipeline([("imputer", Imputer(strategy="mean", missing_values="NaN")), ("classifier", MockClassifier())])
    cval.cross_val_score(p, X, y, cv=5)
Example #17
def analytics():
    trainer_data = get_thing_from_file("training_dataset.txt")
    tester_data = get_thing_from_file("test_dataset.txt")

    bayes_clf = get_thing_from_file("bayes_model.txt")
    svm_clf = get_thing_from_file("svm_model.txt")

    # we load the fitted models from file so we don't need these lines
    # bayes_clf.fit(trainer_data.data, trainer_data.target)
    # svm_clf.fit(trainer_data.data, trainer_data.target)

    test = tester_data.data

    predicted_bayes = bayes_clf.predict(test)
    predicted_svm = svm_clf.predict(test)

    print "** ACCURACIES **"
    print numpy.mean(predicted_bayes == tester_data.target)
    print numpy.mean(predicted_svm == tester_data.target)

    print "** K-FOLD VALIDATION ACCURACY"

    bayes_scores = cross_validation.cross_val_score(bayes_clf,
                                    tester_data.data, tester_data.target, cv=10)

    svm_scores = cross_validation.cross_val_score(svm_clf, tester_data.data,
                                              tester_data.target, cv=10)

    print max(bayes_scores)
    print max(svm_scores)
    print "**"
Example #18
def svm_classify(std_features, surf_features, labels):
    score_std = cross_validation.cross_val_score(svm.SVC(), std_features, labels, cv=5)
    print('Accuracy (5 fold x-val) with svm [std features]: %s%%' % (0.1* round(1000*score_std.mean())))
    
    # score the SVM with SURF features
    print('predicting...')
    scoreSURFlr = cross_validation.cross_val_score(
            svm.SVC(), surf_features, 
            labels, cv=5).mean()
    print('Accuracy (5 fold x-val) with svm [SURF features]: %s%%' % (0.1* round(1000*scoreSURFlr)))
    
    # score the SVM on the combined features
    print('Scoring the SVM using combined features...')
    allfeatures = np.hstack([surf_features, std_features])
    score_combined = cross_validation.cross_val_score(svm.SVC(), allfeatures, labels, cv=5).mean()
    print('Accuracy (5 fold x-val) with svm [All features]: %s%%' % (0.1* round(1000*score_combined)))
    
    # plotting
    #style.use('ggplot')
    fig = plt.figure()
    fig.suptitle('SVM', fontsize=20)
    plt.plot([0,1,2],100*np.array([score_std.mean(), scoreSURFlr, score_combined]), 'k-', lw=8)
    plt.plot([0,1,2],100*np.array(
        [score_std.mean(), scoreSURFlr, score_combined]), 
        'o', mec='#cccccc', mew=12, mfc='white')
    plt.xlim(-.5,2.5)
    plt.ylim(score_std.mean()*90., score_combined*110)
    plt.xticks([0,1,2], ["Standard", "SURF", "Combined"])
    plt.ylabel('Accuracy (%)')
    plt.savefig('img_classifying_graph_svm.png')
Example #19
def crossValidateRegression(X, y, C, epsilon):
    svr = svm.SVR(kernel='linear', C=C, epsilon=epsilon)
    crossValidator = cross_validation.KFold(NUM_COUNTRYS, n_folds = 10)
    cross_validation.cross_val_score(svr, X, y, cv=crossValidator, scoring='mean_squared_error')
    svr.fit(X, y)
    predictedTargetInfo = svr.predict(X)
    return predictedTargetInfo
Example #20
def rf_cross_val(x,y):
    X_train, X_test, y_train, y_test = train_test_split(x,y,test_size = 0.33, random_state = 42)

    random_forest_grid = {'n_estimators': [100],
                                    'n_jobs': [-1]}

    rf_gridsearch = GridSearchCV(RandomForestRegressor(),
                                 random_forest_grid,
                                 n_jobs=-1,
                                 verbose=True,
                                 cv=3)

    rf_gridsearch.fit(X_train, y_train)

    print "best parameters:", rf_gridsearch.best_params_

    best_rf_model = rf_gridsearch.best_estimator_

    y_pred = best_rf_model.predict(X_test)

    print "Accuracy with best rf:", cross_val_score(best_rf_model, X_test, y_test).mean()

    rf = RandomForestRegressor(n_estimators=10, n_jobs = -1)

    print "Accuracy with default param rf:", cross_val_score(rf, X_test, y_test).mean()
    return best_rf_model
Example #21
	def crossValidation(self, classification):
		print "##########  Cross Validating and Testing ##########\n\n\n"
		print "##########  Results. ##########\n\n\n"
		scores = cross_validation.cross_val_score(classification, self.X_train, self.y_train, scoring='f1', cv=5, n_jobs=1)
		print "F1-score: %.2f (+/- %.2f)" % (scores.mean(), scores.std()/2)
		scores = cross_validation.cross_val_score(classification, self.X_train, self.y_train, scoring='accuracy', cv=5, n_jobs=1)
		print "Accuracy: %.2f (+/- %.2f)" % (scores.mean(), scores.std()/2)
Example #22
    def lda_run(self, k_folds = 5):
        self.r_forest_lda = RandomForestClassifier(n_estimators=2000,n_jobs=5, max_depth=None, min_samples_split=1, random_state =0)
        self.lda_scores = cross_validation.cross_val_score(self.r_forest_lda, self.lda_iss_features, self.labels, cv=k_folds,n_jobs=5)
        print("Cross validation Random Forest performance LDA: Accuracy: %0.2f (std %0.2f)" % (self.lda_scores.mean()*100, self.lda_scores.std()*100))
        self.r_forest_lda.fit(self.lda_iss_features,self.labels)
        print self.r_forest_lda.score(self.lda_iss_validation_features, self.validation_labels)*100, 'LDA test-set performance \n'

        '''
        C_range = np.logspace(-2, 10, 13)
        gamma_range = np.logspace(-9, 3, 13)
        param_grid = dict(gamma=gamma_range, C=C_range)
        cv = StratifiedShuffleSplit(self.labels, n_iter=5, test_size=0.2, random_state=42)
        grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
        grid.fit(self.lda_iss_features, self.labels)
        print("The best parameters are %s with a score of %0.2f"% (grid.best_params_, grid.best_score_))
        '''

        self.svc_lda = SVC(kernel='rbf',C = 1,gamma = 'auto')
        self.svc_lda_scores = cross_validation.cross_val_score(self.svc_lda, self.lda_iss_features, self.labels, cv=k_folds,n_jobs=5)
        print("Cross validation SVM performance LDA: Accuracy: %0.2f (std %0.2f)" % (self.svc_lda_scores.mean()*100, self.svc_lda_scores.std()*100))
        self.svc_lda.fit(self.lda_iss_features,self.labels)
        print self.svc_lda.score(self.lda_iss_validation_features, self.validation_labels)*100, 'LDA test-set performance \n'

        y_true = self.validation_labels
        y_pred = self.svc_lda.predict(self.lda_iss_validation_features)
        target_names = ['S1','S2','S3','S4']
        t = classification_report(y_true, y_pred, target_names=target_names)
        print 'Support vector report lda'
        print t
Example #23
def validate_model(model, features, labels):
    accuracy = cross_val_score(model, features, labels, scoring='accuracy', cv=4).mean()
    precision = cross_val_score(model, features, labels, scoring='precision', cv=4).mean()
    recall = cross_val_score(model, features, labels, scoring='recall', cv=4).mean()
    f1 = cross_val_score(model, features, labels, scoring='f1', cv=4).mean()
    print "\n(METRICS) Accuracy: {:.3f}   Precision: {:.3f}   Recall: {:.3f}   F1-Score: {:.3f}".\
        format(accuracy,precision, recall, f1)
Example #24
def run_model(data):
    """Do some label bucketing, print model output."""
    features = data.ix[:, :-1]

    # more categories <--> less accuracy
    # labels = data.ix[:, -1].map(lambda k: 1 if k > 10 else 0)
    labels = data.ix[:, -1].map(lambda k: int(k / 5))     # bucketing trick
    print 'num classes = {}\n'.format(len(set(labels)))

    # weak (base) classifier
    print 'fitting weak classifier...'
    weak_clf = DecisionTreeClassifier(max_depth=MAX_DEPTH)

    weak_cv_results = cross_val_score(weak_clf, features, labels,
        cv=N_FOLDS)
    print 'weak_cv_results = {}'.format(weak_cv_results)
    print 'avg accuracy = {}\n'.format(weak_cv_results.mean())
    
    # strong (ensemble) classifier
    print 'fitting strong classifier...'
    strong_clf = RandomForestClassifier(
        max_depth=MAX_DEPTH,
        n_estimators=N_TREES,
        n_jobs=N_JOBS)

    strong_cv_results = cross_val_score(strong_clf, features, labels,
        cv=N_FOLDS)
    print 'strong_cv_results = {}'.format(strong_cv_results)
    print 'avg accuracy = {}'.format(strong_cv_results.mean())
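The module-level constants referenced above (MAX_DEPTH, N_FOLDS, N_TREES, N_JOBS) are defined outside this excerpt; plausible illustrative values might be:

# Assumed values for the constants used above; the originals are not shown.
MAX_DEPTH = 10   # depth cap shared by the weak tree and the forest
N_FOLDS = 5      # cross-validation folds for cross_val_score
N_TREES = 100    # ensemble size of the random forest
N_JOBS = -1      # train trees on all available cores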
Example #25
def coeff_of_determination(classifier, X, y, K=10):
    # Cross-validation estimates of the coefficient of determination;
    # n_jobs=1 keeps this single-threaded (pass n_jobs=-1 to use all CPUs)
    R21 = cross_val_score(classifier, X, y=y, n_jobs=1).mean()
    R2 = cross_val_score(classifier, X, y=y, cv=KFold(y.size, K), n_jobs=1).mean()
    print "The %d-Folds est coeff. of determ. R2 = %s" % (K, R2)
    print "basic cross val ", R21
Example #26
def _run_classifier(X, Y, parent, child, max_depth):
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.333, random_state=0)
    clf = tree.DecisionTreeClassifier(min_samples_split=parent, min_samples_leaf=child, max_depth=max_depth)
    clf = clf.fit(X_train, y_train)

    print 'model score on train data:'
    print clf.score(X_train, y_train)
    print 'ten fold cross-validation results on train data:'
    scores = cross_val_score(clf, X_train, y_train, cv=10)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    print 'model score on test data'
    print clf.score(X_test, y_test)
    print 'ten fold cross-validation results on test data:'
    scores = cross_val_score(clf, X_test, y_test, cv=10)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    print 'Gini Importance'
    print clf.feature_importances_

    print 'Classification Report'
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))

    print 'Confusion Matrix'
    print(confusion_matrix(y_true, y_pred))
    cm = confusion_matrix(y_true, y_pred)
    print _calc_error_rate_conf_int(cm)
    return _calc_error_rate_conf_int(cm) + [len(y_test)]
Example #27
def run_conventional_linkage(x, y, n_comps, linker_model, verbose=0, k_folds=3):
  print "---->Cross validating"
  cvs = cross_val_score(linker_model, x, y, cv=k_folds, scoring='r2', verbose=verbose)
  mse = cross_val_score(linker_model, x, y, cv=k_folds, scoring='mean_squared_error', verbose=verbose)
  print '---->R2: ', np.mean(cvs)
  print '---->MSE: ', np.mean(mse)
  return np.mean(cvs), np.std(cvs), np.mean(mse), np.std(mse)
Example #28
def experiment_zero(data,company):
	print '___Experiment Zero___'
	# Experiment Parameters
	finance_datatype = 0    # finance_datatype: Integer 2 = Stock price change, 1 = Percentage stock price change, 0 = Only direction
	finance_n = 2           # finance_n: Integer >=0 Number of days of finance data to include
	sentiment_datatype = 1	# sentiment_datatype: Boolean 1 = all sentiment features, 0 = Total
	sentiment_n = 1 		# sentiment_n: Integer >=0 Number of days of sentiment data to include
	day = 0                 # day: Boolean 1 = Include day of the week, 0 = do not
	target = 0				# target: Boolean 1 = Amount, 0 = Direction
	volume = 0 				# volume: boolean 1 = Yes, 0 = No
	if (finance_n + sentiment_n + day + volume) == 0:
		print 'Insufficient parameters set'
		return 

	# Data Processing
	feature_vector_meaning(company, finance_datatype, finance_n, sentiment_datatype, sentiment_n, day, target, volume)
	matrix = create_feature_matrix(company, data, finance_datatype, finance_n, sentiment_datatype, sentiment_n, day, target, volume)
	end = len(matrix[0])
	train_x = matrix[:,0:end-1]
	train_y = matrix[:,end-1]

	# Classifier training
	scaler = preprocessing.StandardScaler().fit(train_x)
	train_x = scaler.transform(train_x)

	clf = direction_classifier(train_x,train_y)
	cv = cross_validation.ShuffleSplit(len(train_x), n_iter=5, test_size=0.2, random_state=0)
	print ' _ _ _Evaluation_ _ _'
	if target == 0:
		scores = cross_validation.cross_val_score(clf, train_x, train_y, cv=cv, scoring='accuracy')
		print("  Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
	elif target == 1:
		scores = cross_validation.cross_val_score(clf, train_x, train_y, cv=cv, scoring='mean_squared_error')
		print("  MSE: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))		
	print '====================='
Example #29
    def randomforest_info(self, max_trees = 1000, step = 40, k_folds = 5):
        print 'Characterising R_forest. Looping through trees: ',
        self.treedata = np.zeros((max_trees/step, 10))
        for i,n_trees in enumerate(np.arange(0, max_trees,step)):
            if n_trees == 0:
                n_trees = 1
            print n_trees,
            r_forest = RandomForestClassifier(n_estimators=n_trees, n_jobs=5, max_depth=None, min_samples_split=1, random_state=0)
            scores = cross_validation.cross_val_score(r_forest, self.iss_features, self.labels, cv=k_folds,n_jobs=5)
            r_forest_full = RandomForestClassifier(n_estimators=n_trees, n_jobs=5, max_depth=None, min_samples_split=1, random_state=0)
            r_forest_full.fit(self.iss_features,self.labels)
            self.treedata[i,0] = n_trees
            self.treedata[i,1] = scores.mean()
            self.treedata[i,2] = scores.std()
            # now add the test dataset - score
            self.treedata[i,3] = r_forest_full.score(self.iss_validation_features, self.validation_labels)

            r_forest_lda = RandomForestClassifier(n_estimators=n_trees, n_jobs=5, max_depth=None, min_samples_split=1, random_state=0)
            r_forest_lda_full = RandomForestClassifier(n_estimators=n_trees, n_jobs=5, max_depth=None, min_samples_split=1, random_state=0)
            r_forest_lda_full.fit(self.lda_iss_features,self.labels)
            lda_scores = cross_validation.cross_val_score(r_forest_lda, self.lda_iss_features, self.labels, cv=k_folds,n_jobs=5)
            self.treedata[i,4] = lda_scores.mean()
            self.treedata[i,5] = lda_scores.std()
            self.treedata[i,6] = r_forest_lda_full.score(self.lda_iss_validation_features, self.validation_labels)
            print self.treedata[i,6]

            r_forest_pca = RandomForestClassifier(n_estimators=n_trees, n_jobs=5, max_depth=None, min_samples_split=1, random_state=0)
            r_forest_pca_full = RandomForestClassifier(n_estimators=n_trees, n_jobs=5, max_depth=None, min_samples_split=1, random_state=0)
            r_forest_pca_full.fit(self.pca_iss_features,self.labels)
            pca_scores = cross_validation.cross_val_score(r_forest_pca, self.pca_iss_features, self.labels, cv=k_folds,n_jobs=5)
            self.treedata[i,7] = pca_scores.mean()
            self.treedata[i,8] = pca_scores.std()
            self.treedata[i,9] = r_forest_pca_full.score(self.pca_iss_validation_features, self.validation_labels)
Example #30
def test_cross_val_score_fit_params():
    clf = MockClassifier()
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))
    fit_params = {'sample_weight': np.ones(n_samples),
                  'class_prior': np.ones(n_classes) / n_classes}
    cval.cross_val_score(clf, X, y, fit_params=fit_params)
Example #31
# We have d dimensions, d=64
# We have z classes, z=6, [digit0, digit1, digit2, digit7, digit8, digit9]
lbl = preprocessing.LabelEncoder()
y_train = lbl.fit_transform(y[np.where((y == 0) | (y == 1) | (y == 2)
                                       | (y == 7) | (y == 8) | (y == 9))])
X_train = X[np.where((y == 0) | (y == 1) | (y == 2) | (y == 7) | (y == 8)
                     | (y == 9))]

# We have Weight matrix, W, d x z
model = linear_model.LogisticRegression(random_state=1)
model.fit(X_train, y_train)
W = model.coef_.T

print cross_validation.cross_val_score(model,
                                       X_train,
                                       y_train,
                                       scoring=make_scorer(accuracy_score))

# We have a attributes, a=4 [pca_d1, pca_d2, lle_d1, lle_d2]
# We have Signature matrix, S a x z
pca = decomposition.PCA(n_components=2)
lle = manifold.LocallyLinearEmbedding(n_components=2, random_state=1)
X_pca = pca.fit_transform(X_train)
X_lle = lle.fit_transform(X_train)

for i, ys in enumerate(np.unique(y_train)):
    if i == 0:
        S = np.r_[np.mean(X_pca[y_train == ys], axis=0),
                  np.mean(X_lle[y_train == ys], axis=0)]
    else:
        S = np.c_[S, np.r_[np.mean(X_pca[y_train == ys], axis=0),
Example #32
	# 			TN += 1
	# 	elif (y_actual[i] != y_hat[i]):
	# 		if (y_actual[i] == 1):
	# 			FN += 1
	# 		else:
	# 			FP += 1
	# 	else:
	# 		print "Actual", y_actual[i], y_hat[i]
	return (TP/len(y_hat), FP/len(y_hat), TN/len(y_hat), FN/len(y_hat))

for i in xrange(itiration):
	# clf = Random(strategy='uniform')
	clf = LR() # 0.60 +/- 0.24
	np.random.shuffle(dataSet)
	# print "Dataset", dataSet
	scores = cross_validation.cross_val_score(clf, dataSet[:, 2:], dataSet[:, 1], cv=10)
	print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
	predict = cross_validation.cross_val_predict(clf, dataSet[:, 2:], dataSet[:, 1], cv=10)
	print zip(predict,dataSet[:, 1], dataSet[:, 0])
	dTP, dFP, dTN, dFN = perf_measure(dataSet[:, 1], predict)
	TP += dTP
	FP += dFP
	TN += dTN
	FN += dFN

	
	av_mean += scores.mean()
	

print "Average accuracy is", av_mean/itiration
print "Rate", TP/itiration, FP/itiration, TN/itiration, FN/itiration
Example #33
    'variance': pca.explained_variance_,
    'principal component': pca_df.columns.tolist()
})

# adding one to principal components (since there is no 0th component)
variance_df['principal component'] = variance_df['principal component'] + 1
variance_df.plot(x='principal component', y='variance')
#  looks like variance stops getting explained after first two components

pca_df_small = pca_df.ix[:, 0:1]

# getting a cross val score of transformed data
rf = ensemble.RandomForestClassifier(n_estimators=500)
roc_scores_rf_pca = cross_val_score(rf,
                                    pca_df_small,
                                    response_series,
                                    cv=10,
                                    scoring='roc_auc')

print roc_scores_rf_pca.mean()
# ~0.74 ROC AUC

roc_scores_rf = cross_val_score(rf,
                                explanatory_df,
                                response_series,
                                cv=10,
                                scoring='roc_auc')
print roc_scores_rf.mean()
# PCA created significant information loss in this case

############################
Example #34

numpy.random.seed(seed)
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp',
                   KerasClassifier(build_fn=create_baseline,
                                   nb_epoch=300,
                                   batch_size=16,
                                   verbose=0)))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(y=encoded_Y,
                        n_folds=10,
                        shuffle=True,
                        random_state=seed)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" %
      (results.mean() * 100, results.std() * 100))

# Accuracy: 82.68% (3.90%)


# dropout in the input layer with weight constraint
def create_model1():
    # create model
    model = Sequential()
    model.add(Dropout(0.2, input_shape=(60, )))
    model.add(
        Dense(60, init='normal', activation='relu', W_constraint=maxnorm(3)))
    model.add(
        Dense(30, init='normal', activation='relu', W_constraint=maxnorm(3)))
Example #35
def analysis():
    # Delete existing data in table
    db.session.query(Results).delete()
    db.session.commit()

    # ABTs for each model
    # normal is False below because it removes a category group
    df_lr = abt(normal=False).copy()
    df_log = abt(normal=False, convert=True).copy()
    df_gnb = abt(normal=False, convert=True).copy()
    df_knn = abt(normal=False, convert=True).copy()
    df_svm = abt(normal=False, convert=True).copy()
    ####################################################
    ##### Model 1: Simple Linear Regression ############
    ####################################################
    lr = LinearRegression()  # model object

    X = lin_exp_var(df_lr)  # Explanatory Variable
    y = df_lr["min_occ_reg"]  # Dependent Variable
    # Fit model, predict and cross eval
    clf = lr.fit(X, y)
    lr_model = pickle.dumps(clf)

    # Create dictionary of accuracy measures
    pred_lr = clf.predict(X)
    cross_val_scores = cross_val_score(lr, X.astype(int), y.astype(int), cv=5)
    lr_dict = {
        "Mean Squared Error":
        metrics.mean_squared_error(y, pred_lr),
        "Root Mean Squared Error":
        np.sqrt(metrics.mean_squared_error(y, pred_lr)),
        "R squared":
        lr.score(X, y),
        "Cross Val. Accuracy (+/- %0.2f)" % (cross_val_scores.std()):
        cross_val_scores.mean()
    }
    # Commit to database
    LinearModel = Results(model_type="Simple Linear Regression",
                          model=lr_model,
                          accuracy=str(lr_dict))
    db.session.add(LinearModel)
    db.session.commit()

    ####################################################
    ##### Model 2: Multinomial Logistic Regression #####
    ####################################################
    log = LogisticRegression()

    X = log_exp_var(df_log)  # Explanatory Variable:
    y = df_log["occupancy"]  # Dependent Variable
    clf = log.fit(X, y)
    log_model = pickle.dumps(clf)  # variable containing model object

    # Create dictionary of accuracy measures
    pred_log = clf.predict(X)
    scores = cross_val_score(log, X, y, cv=5)
    log_dict = {
        "Accuracy Classification Score": metrics.accuracy_score(y, pred_log),
        "Precision Score": metrics.precision_score(y,
                                                   pred_log,
                                                   average="macro"),
        "Recall Score": metrics.recall_score(y, pred_log, average="macro"),
        "F-score": metrics.f1_score(y, pred_log, average="macro"),
        "Cross Val. Accuracy (+/- %0.2f)" % (scores.std() * 2): scores.mean()
    }
    # Commit to database
    Logistic_Regression = Results(model_type="Multinomial Logistic Regression",
                                  model=log_model,
                                  accuracy=str(log_dict))
    db.session.add(Logistic_Regression)
    db.session.commit()

    ####################################################
    ##### Model 3: Gaussian Naive Bayes ################
    ####################################################
    gnb = GaussianNB()

    X = gnb_exp_var(df_gnb)
    y = df_gnb["occupancy"]  # Dependent Variable
    # Fit model, predict and cross eval
    clf = gnb.fit(X, y)  # Fit the model
    gnb_model = pickle.dumps(clf)

    # Create dictionary of accuracy measures
    pred_gnb = clf.predict(X)
    scores = cross_val_score(gnb, X, y, cv=5)
    gnb_dict = {
        "Accuracy Classification Score": metrics.accuracy_score(y, pred_gnb),
        "Precision Score": metrics.precision_score(y,
                                                   pred_gnb,
                                                   average="macro"),
        "Recall Score": metrics.recall_score(y, pred_gnb, average="macro"),
        "F-score": metrics.f1_score(y, pred_gnb, average="macro"),
        "Cross Val. Accuracy (+/- %0.2f)" % (scores.std() * 2): scores.mean()
    }
    # Commit to database
    Gaussian_NB = Results(model_type="Gaussian Naive Bayes",
                          model=gnb_model,
                          accuracy=str(gnb_dict))
    db.session.add(Gaussian_NB)
    db.session.commit()

    ####################################################
    ##### Model 4: k-Nearest Neighbor ##################
    ####################################################
    X = knn_exp_var(df_knn)  # Explanatory Variable:
    y = df_knn["occupancy"]  # Dependent Variable
    # Find best number of neighbours based on train and test scores
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.4,
                                                        random_state=42)
    scores = []
    for n in range(2, 15):
        test = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)
        scores.append(round(test.score(X_test, y_test), 2))
    neighbours = scores.index(max(scores)) + 2  # index 0 is k=2
    knn = KNeighborsClassifier(n_neighbors=neighbours)

    # Fit model, predict and cross eval
    clf = knn.fit(X, y)
    knn_model = pickle.dumps(clf)  # variable containing model object

    # Create dictionary of accuracy measures
    pred_knn = clf.predict(X)
    scores = cross_val_score(knn, X, y, cv=5)
    knn_dict = {
        "Accuracy Classification Score": metrics.accuracy_score(y, pred_knn),
        "Precision Score": metrics.precision_score(y,
                                                   pred_knn,
                                                   average="macro"),
        "Recall Score": metrics.recall_score(y, pred_knn, average="macro"),
        "F-score": metrics.f1_score(y, pred_knn, average="macro"),
        "Cross Val. Accuracy (+/- %0.2f)" % (scores.std() * 2): scores.mean()
    }
    # Commit to database
    KNeighbors_Classifier = Results(model_type="k-Nearest Neighbor",
                                    model=knn_model,
                                    accuracy=str(knn_dict))
    db.session.add(KNeighbors_Classifier)
    db.session.commit()

    ####################################################
    ##### Model 5: Support Vector Machines #############
    ####################################################
    X = svm_exp_var(df_svm)  # Explanatory Variable
    y = df_svm["occupancy"]  # Dependent Variable
    # Find best number of for gamma based on train and test scores
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.4,
                                                        random_state=42)
    scores = []
    for n in range(0, 20):
        test = SVC(gamma=n).fit(X_train, y_train)
        scores.append(round(test.score(X_test, y_test), 2))
    gamma = scores.index(max(scores))
    svc = SVC(gamma=gamma)
    # Fit model, predict and cross eval
    clf = svc.fit(X, y)  # Fit the model
    svc_model = pickle.dumps(clf)  # variable containing model object

    # Create dictionary of accuracy measures
    pred_svc = clf.predict(X)
    scores = cross_val_score(svc, X, y, cv=5)
    svc_dict = {
        "Accuracy Classification Score": metrics.accuracy_score(y, pred_svc),
        "Precision Score": metrics.precision_score(y,
                                                   pred_svc,
                                                   average="macro"),
        "Recall Score": metrics.recall_score(y, pred_svc, average="macro"),
        "F-score": metrics.f1_score(y, pred_svc, average="macro"),
        "Cross Val. Accuracy (+/- %0.2f)" % (scores.std() * 2): scores.mean()
    }
    # Commit to database
    SVC_results = Results(model_type="Support Vector Machines",
                          model=svc_model,
                          accuracy=str(svc_dict))
    db.session.add(SVC_results)
    db.session.commit()
Example #36
df.corr()

# Let's go and get our hands dirty
df = pd.read_csv('winequality-red.csv', sep=';')
X = df[list(df.columns)[:-1]]
y = df['quality']

# train_test_split function to randomly partition the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y)
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_predictions = regressor.predict(X_test)
print 'R-squared:', regressor.score(X_test, y_test)

# let's get the cross-validation score
scores = cross_val_score(regressor, X, y, cv=10)
print scores.mean(), scores

# plot with true quality and predicted quality
plt.scatter(y_predictions, y_test)
plt.xlabel('Predicted quality')
plt.ylabel('True quality')
plt.title('Predicted Quality vs. True Quality')
plt.show()

# fitting models using gradient descent
# SGDRegressor is an implementation of SGD that can be used even for
# regression problems with many features. It can be used
# to optimize different cost functions to fit different linear models; by
# default, it optimizes the residual sum of squares
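A hedged sketch of the SGDRegressor idea the comment describes, cross-validated the same way as the LinearRegression above (written for current scikit-learn; the StandardScaler step is our addition, since plain SGD converges poorly on unscaled features):

# Sketch only: reuses X and y from the script above.
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

sgd = make_pipeline(StandardScaler(), SGDRegressor(random_state=0))
sgd_scores = cross_val_score(sgd, X, y, cv=10)
print(sgd_scores.mean(), sgd_scores)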
Example #37
labels = epochs.events[:, -1] - 2

# cross validation
cv = KFold(len(labels), 10, shuffle=True, random_state=42)
# get epochs
epochs_data_train = 1e6 * epochs.get_data()

# compute covariance matrices
cov_data_train = Covariances().transform(epochs_data_train)

###############################################################################
# Classification with Minimum distance to mean
mdm = MDM(metric=dict(mean='riemann', distance='riemann'))

# Use scikit-learn Pipeline with cross_val_score function
scores = cross_val_score(mdm, cov_data_train, labels, cv=cv, n_jobs=1)

# Printing the results
class_balance = np.mean(labels == labels[0])
class_balance = max(class_balance, 1. - class_balance)
print("MDM Classification accuracy: %f / Chance level: %f" %
      (np.mean(scores), class_balance))

###############################################################################
# Classification with Tangent Space Logistic Regression
clf = TSclassifier()
# Use scikit-learn Pipeline with cross_val_score function
scores = cross_val_score(clf, cov_data_train, labels, cv=cv, n_jobs=1)

# Printing the results
class_balance = np.mean(labels == labels[0])
Example #38
# Using GridSearchCV to find the best values for C and gamma
C_range = 10.0**np.arange(-4, 4)
gamma_range = 10.0**np.arange(-10, 1)
param_grid = dict(gamma=gamma_range, C=C_range)
skf = cv.StratifiedKFold(y=y_train, n_folds=3)
grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=skf)
grid.fit(X_train, y_train)

# Print out parameters
crossclf = svm.SVC(probability=True, **grid.best_params_)
y_pred = crossclf.fit(X_train, y_train).predict(X_test)
print crossclf
print 'y_train: ', y_train
print 'y_pred: ', y_pred
print "Best parameter", grid.best_params_  # {'C': 10.0, 'gamma': 0.001}
print "Cross-Validation score", cv.cross_val_score(crossclf, X_train,
                                                   y_train).mean()
print "Independent accuracy score", accuracy_score(y_test, y_pred)
print "Independent precision score", precision_score(y_test, y_pred)
print "Independent recall score", recall_score(y_test, y_pred)
print "Independent f1 score", f1_score(y_test, y_pred)

## 7.0 Plot ROC curve
# Compute roc and auc
print("Step 7")
probas_ = crossclf.predict_proba(X_test)
print probas_
y_test[y_test == 2] = 0
fpr, tpr, thresholds = roc_curve(y_test, probas_[:, 1])
print fpr, tpr
roc_auc = auc(fpr, tpr)
print "Area under the curve", roc_auc
Example #39
        name_text_map[line[0]] = line[4]
        name_type_map[line[0]] = line[2]
        line = f.readline().split(",")

success = 0
count = 0
with open("result/resultSVM" + str(int(datetime.now().timestamp())) + ".txt",
          "w",
          encoding="utf-8") as f:
    for n in name_text_map:
        predicted = idx_type_map[clf.predict(w2v.vectorize(
            name_text_map[n]))[0]]
        print(n + " : " + name_type_map[n] + " -> " + predicted)
        f.write(n + " : " + name_type_map[n] + " -> " + predicted + "\n")
        count = count + 1
        if predicted == name_type_map[n]:
            success = success + 1

    print(str(success) + "/" + str(count))
    f.write(str(success) + "/" + str(count) + "\n")

# let's try testing with leave-1-out
scores = cross_validation.cross_val_score(
    clf,
    datas,
    labels,
    cv=5,
)
print("scores: " + str(scores))
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
Example #40
def main():
    # load Titanic dataset
    titanic = load_data("titanic_train.csv", header=1, predict_col=0)
    X = titanic.X
    Xnames = titanic.Xnames
    y = titanic.y
    yname = titanic.yname
    n, d = X.shape  # n = number of examples, d =  number of features

    #========================================
    # part a: plot histograms of each feature
    '''
    print('Plotting...')
    for i in range(d) :
        plot_histogram(X[:,i], y, Xname=Xnames[i], yname=yname)
    '''

    #========================================
    # train Majority Vote classifier on data
    print('Classifying using Majority Vote...')
    clf = MajorityVoteClassifier()  # create MajorityVote classifier, which includes all model parameters
    clf.fit(X, y)  # fit training data using the classifier
    y_pred = clf.predict(X)  # take the classifier and run it on the training data
    train_error = 1 - metrics.accuracy_score(y, y_pred, normalize=True)
    print('\t-- training error: %.3f' % train_error)

    ### ========== TODO : START ========== ###
    # part b: evaluate training error of Random classifier
    print('Classifying using Random...')
    clfRand = RandomClassifier()
    clfRand.fit(X, y)
    y_predRand = clfRand.predict(X)
    train_errorRand = 1 - metrics.accuracy_score(y, y_predRand, normalize=True)
    print('\t-- training error for random: %.3f' % train_errorRand)

    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part c: evaluate training error of Decision Tree classifier
    # use criterion of "entropy" for Information gain
    print('Classifying using Decision Tree...')
    clf_tree = DecisionTreeClassifier(criterion="entropy")
    clf_tree = clf_tree.fit(X, y)
    y_pred_tree = clf_tree.predict(X)
    train_error_tree = 1 - metrics.accuracy_score(
        y, y_pred_tree, normalize=True)
    print('\t-- training error for decision tree: %.3f' % train_error_tree)
    ### ========== TODO : END ========== ###

    # note: uncomment the following lines to output the Decision Tree graph

    # save the classifier -- requires GraphViz and pydot
    '''
    from pydot import graph_from_dot_data
    from io import StringIO
    from sklearn import tree
    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data,
                         feature_names=Xnames)
    graph = graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("dtree.pdf") 
    '''

    ### ========== TODO : START ========== ###
    # part d: evaluate training error of k-Nearest Neighbors classifier
    # use k = 3, 5, 7 for n_neighbors
    print('Classifying using k-Nearest Neighbors...')
    clf_neigh3 = KNeighborsClassifier(n_neighbors=3)
    clf_neigh3.fit(X, y)
    y_neigh3 = clf_neigh3.predict(X)
    train_error_neigh3 = 1 - metrics.accuracy_score(
        y, y_neigh3, normalize=True)
    print('\t-- training error for 3 nearest neighbors: %.3f' %
          train_error_neigh3)

    clf_neigh5 = KNeighborsClassifier(n_neighbors=5)
    clf_neigh5.fit(X, y)
    y_neigh5 = clf_neigh5.predict(X)
    train_error_neigh5 = 1 - metrics.accuracy_score(
        y, y_neigh5, normalize=True)
    print('\t-- training error for 5 nearest neighbors: %.3f' %
          train_error_neigh5)

    clf_neigh7 = KNeighborsClassifier(n_neighbors=7)
    clf_neigh7.fit(X, y)
    y_neigh7 = clf_neigh7.predict(X)
    train_error_neigh7 = 1 - metrics.accuracy_score(
        y, y_neigh7, normalize=True)
    print('\t-- training error for 7 nearest neighbors: %.3f' %
          train_error_neigh7)
    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part e: use cross-validation to compute average training and test error of classifiers
    print('Investigating various classifiers...')
    maj_clf = MajorityVoteClassifier()
    rand_clf = RandomClassifier()
    tree_clf = DecisionTreeClassifier(criterion="entropy")
    knn_clf = KNeighborsClassifier(n_neighbors=5)

    maj_train_err, maj_test_err = error(maj_clf, X, y)
    print('\t-- Average training error for majority: %.3f' % maj_train_err)
    print('\t-- Average test error for majority: %.3f' % maj_test_err)
    rand_train_err, rand_test_err = error(rand_clf, X, y)
    print('\t-- Average training error for random: %.3f' % rand_train_err)
    print('\t-- Average test error for random: %.3f' % rand_test_err)
    tree_train_err, tree_test_err = error(tree_clf, X, y)
    print('\t-- Average training error for decision tree: %.3f' %
          tree_train_err)
    print('\t-- Average test error for decision tree: %.3f' % tree_test_err)
    knn_train_err, knn_test_err = error(knn_clf, X, y)
    print('\t-- Average training error for 5 nearest neighbors: %.3f' %
          knn_train_err)
    print('\t-- Average test error for 5 nearest neighbors: %.3f' %
          knn_test_err)
    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part f: use 10-fold cross-validation to find the best value of k for k-Nearest Neighbors classifier
    print('Finding the best k for KNeighbors classifier...')
    plot_error = []
    plot_neigh = []
    for i in range(1, 50, 2):
        clf_knn = KNeighborsClassifier(n_neighbors=i)
        err = 1 - np.mean(cross_val_score(clf_knn, X, y, cv=10))
        plot_neigh.append(i)
        plot_error.append(err)
    # plt.plot(plot_neigh, plot_error, marker='o')
    # plt.ylabel('validation error')
    # plt.xlabel('# of neighbors')
    # plt.savefig("crossVal.pdf")
    ### ========== TODO : END ========== ###
    '''
    ### ========== TODO : START ========== ###
    # part g: investigate decision tree classifier with various depths
    print('Investigating depths...')
    plot_train_err = []
    plot_test_err = []
    plot_tree = []
    for i in range(1, 21, 1):
        train_err, test_err = error(DecisionTreeClassifier(criterion="entropy", max_depth=i), X, y)
        print(i, " ", test_err)
        plot_tree.append(i)
        plot_train_err.append(train_err)
        plot_test_err.append(test_err)

    red_patch = mpl.patches.Patch(color='red', label='training error')
    green_patch = mpl.patches.Patch(color='blue', label='test error')
    plt.plot(plot_tree, plot_train_err, 'r', plot_tree,  plot_test_err, marker='.')
    plt.ylabel('average error')
    plt.xlabel('max depth of tree')
    plt.legend(handles=[red_patch, green_patch])
    plt.savefig("decisionTreeVal.pdf")
    
    ### ========== TODO : END ========== ###
    '''

    ### ========== TODO : START ========== ###
    # part h: investigate Decision Tree and k-Nearest Neighbors classifier with various training set sizes
    print('Investigating training set sizes...')
    plt_knn_trerr = []
    plt_knn_tserr = []
    plt_tree_trerr = []
    plt_tree_tserr = []
    plt_amt_training = []
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.1,
                                                        random_state=1234)
    for i in [9, 8, 7, 6, 5, 4, 3, 2, 1, 0]:
        sum1 = 0
        sum2 = 0
        sum3 = 0
        sum4 = 0
        for j in range(100):
            h_knn_clf = KNeighborsClassifier(n_neighbors=7)
            h_tree_clf = DecisionTreeClassifier(criterion="entropy",
                                                max_depth=6)
            X_tr, X_ts, y_tr, y_ts = train_test_split(X_train,
                                                      y_train,
                                                      test_size=(i * 0.1))
            h_knn_clf.fit(X_tr, y_tr)
            y_tr_pred_knn = h_knn_clf.predict(X_tr)
            y_ts_pred_knn = h_knn_clf.predict(X_test)
            h_tree_clf.fit(X_tr, y_tr)
            y_tr_pred_tree = h_tree_clf.predict(X_tr)
            y_ts_pred_tree = h_tree_clf.predict(X_test)
            sum1 += (
                1 -
                metrics.accuracy_score(y_test, y_ts_pred_knn, normalize=True))
            sum2 += (
                1 -
                metrics.accuracy_score(y_tr, y_tr_pred_knn, normalize=True))
            sum3 += (
                1 -
                metrics.accuracy_score(y_tr, y_tr_pred_tree, normalize=True))
            sum4 += (
                1 -
                metrics.accuracy_score(y_test, y_ts_pred_tree, normalize=True))
        plt_knn_tserr.append(sum1 / 100)
        plt_knn_trerr.append(sum2 / 100)
        plt_tree_trerr.append(sum3 / 100)
        plt_tree_tserr.append(sum4 / 100)
        plt_amt_training.append(1 - (i / 10))

    red_line = mpl.lines.Line2D(plt_amt_training,
                                plt_knn_trerr,
                                color='red',
                                label='KNN training Error',
                                marker='.')
    blue_line = mpl.lines.Line2D(plt_amt_training,
                                 plt_knn_tserr,
                                 color='blue',
                                 label='KNN test Error',
                                 marker='.')
    green_line = mpl.lines.Line2D(plt_amt_training,
                                  plt_tree_trerr,
                                  color='green',
                                  label='Decision Tree training Error',
                                  marker='1')
    purple_line = mpl.lines.Line2D(plt_amt_training,
                                   plt_tree_tserr,
                                   color='purple',
                                   label='Decision Tree test Error',
                                   marker='1')

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.add_line(red_line)
    ax.add_line(blue_line)
    ax.add_line(green_line)
    ax.add_line(purple_line)
    ax.set_xlim(0, 1.1)
    ax.set_ylim(0, 0.3)
    plt.ylabel('error')
    plt.xlabel('fraction of the 90% training split used to train')
    plt.legend(handles=[red_line, blue_line, green_line, purple_line])
    fig.savefig("h.pdf")

    ### ========== TODO : END ========== ###

    print('Done')
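
Note: the manual resampling loop above can also be written with scikit-learn's learning_curve helper, which repeats the fit/score cycle over a grid of training-set sizes. A minimal sketch under the modern sklearn.model_selection API, assuming the same X and y as above:

import numpy as np
from sklearn.model_selection import learning_curve
from sklearn.tree import DecisionTreeClassifier

# average test error for 10%..90% of the data, 5-fold CV at each size
sizes, train_scores, test_scores = learning_curve(
    DecisionTreeClassifier(criterion="entropy", max_depth=6),
    X, y,
    train_sizes=np.linspace(0.1, 0.9, 9),
    cv=5)
print(list(zip(sizes, 1 - test_scores.mean(axis=1))))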
Beispiel #41
0
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import cross_validation

samples = []
labels = []

with open('../data/susy_10000_dense.csv.train') as csvdata:
    for line in csvdata:
        vector = line.split(',')
        labels.append(int(vector[-1]))
        sample = [float(feature) for feature in vector[:-1]]
        samples.append(sample)

testsamples = []
testlabels = []
with open('../data/susy_10000_dense.csv.test') as csvdata:
    for line in csvdata:
        vector = line.split(',')
        testlabels.append(int(vector[-1]))
        sample = [float(feature) for feature in vector[:-1]]
        testsamples.append(sample)

model = LogisticRegression()
model.fit(samples, labels)
print accuracy_score(model.predict(testsamples), testlabels)

print cross_validation.cross_val_score(model, samples, labels)
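
Note: the sklearn.cross_validation module used above was removed in scikit-learn 0.20. A sketch of the same evaluation against the modern API (same data files assumed; cv=3 is spelled out because the old default of 3 folds is no longer the default):

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

model = LogisticRegression()
model.fit(samples, labels)
print(accuracy_score(testlabels, model.predict(testsamples)))
print(cross_val_score(model, samples, labels, cv=3))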
Beispiel #42
0
lr=linear_model.LogisticRegression(random_state=13)
clf_grid_search=grid_search.GridSearchCV(lr,params,verbose=0,scoring='roc_auc')
clf_grid_search.fit(model.X,model.Y)

best_params=clf_grid_search.best_params_

print('Best parameters found by grid search:',best_params)

#%% Cross-validation: get the average scores
lr=linear_model.LogisticRegression(random_state=13)
best_params={'C': 0.1, 'penalty': 'l2', 'class_weight': 'balanced'}
lr.set_params(**best_params)
ks_scoring=metrics.make_scorer(ModelEvaluate.cal_ks,needs_proba=True,return_split=False,decimals=4)

print('5-fold cross-validation AUC:',cross_validation.cross_val_score(lr,model.X,model.Y,scoring='roc_auc',cv=5))
print('5-fold cross-validation KS:',cross_validation.cross_val_score(lr,model.X,model.Y,scoring=ks_scoring,cv=5))

#%% Final model evaluation
lr.fit(model.X,model.Y)

score=lr.predict_proba(model.X)[:,0]
score=pd.Series(score)
score.index=model.Y.index

y_pred=lr.predict(model.X)
y_pred=pd.Series(y_pred)
y_pred.index=model.Y.index

result_ks=ModelEvaluate.plot_ks_cdf(model.Y,score,decimals=4,close=False)
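
Note: ModelEvaluate.cal_ks above is project-specific code. As a generic stand-in (an assumption, not the author's implementation), a KS scorer can be sketched with scipy's two-sample Kolmogorov-Smirnov statistic, i.e. the maximum gap between the score distributions of the two classes:

import numpy as np
from scipy.stats import ks_2samp
from sklearn.metrics import make_scorer

def ks_stat(y_true, y_proba):
    # KS = max distance between the positive- and negative-class score CDFs
    y_true = np.asarray(y_true)
    if y_proba.ndim == 2:  # older sklearn passes the full probability matrix
        y_proba = y_proba[:, 1]
    return ks_2samp(y_proba[y_true == 1], y_proba[y_true == 0]).statistic

ks_scoring = make_scorer(ks_stat, needs_proba=True)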
Beispiel #43
0
print('Included features are: %s' % features)
sys.stdout.flush()
rfr = RandomForestRegressor(n_estimators=n_estimators,
                            n_jobs=n_jobs,
                            min_samples_leaf=msl,
                            min_samples_split=mss,
                            verbose=1)

do_sklearn_cv = False
if do_sklearn_cv:
    X = train[features].values
    y = train['elo']
    msg("CROSS VALIDATING")
    cvs = cross_val_score(rfr,
                          X,
                          y,
                          cv=n_cv_groups,
                          n_jobs=n_jobs,
                          scoring='mean_absolute_error')
    print(cvs, np.mean(cvs))
    sys.stdout.flush()

do_semimanual_cv = False
if do_semimanual_cv:
    msg("fold")
    kf = KFold(train.shape[0], n_folds=n_cv_groups, shuffle=True)
    ins = []
    outs = []
    for train_index, test_index in kf:
        msg("fit")
        foo = rfr.fit(train.iloc[train_index][features],
                      train.iloc[train_index]['elo'])
Beispiel #44
0
from pandas import read_csv
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
# load data
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
# create feature union
features = []
features.append(('pca', PCA(n_components=3)))
features.append(('select_best', SelectKBest(k=6)))
feature_union = FeatureUnion(features)
# create pipeline
estimators = []
estimators.append(('feature_union', feature_union))
estimators.append(('logistic', LogisticRegression()))
model = Pipeline(estimators)
# evaluate pipeline
num_folds = 10
num_instances = len(X)
seed = 7
kfold = KFold(n=num_instances, n_folds=num_folds, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())
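
Note: as a quick sanity check on the union (the Pima set has 768 rows), FeatureUnion concatenates its transformers' outputs column-wise, so the combined matrix should hold the 3 PCA components plus the 6 selected columns:

combined = feature_union.fit_transform(X, Y)
print(combined.shape)  # expected (768, 9)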
Beispiel #45
0
#replace the null data with -99999
df_features.replace(np.nan,-99999,inplace=True)

df_occ_slot = pd.DataFrame()
df_occ_slot["occupancy"] = pd.Series(occupancy_slot_list)
         
#define features(X) and labels(y)
X = np.array(df_features)
Y = np.array(df_occ_slot["occupancy"])

coef_importance = clf.feature_importances_
norm_coef_importance = [100*float(i)/sum(coef_importance) for i in coef_importance]
print "norm_coef_importance:"
print norm_coef_importance

y = clf.predict(X) 
cnf_matrix = confusion_matrix(Y, y)
# plot unnormalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=["unoccupied", "Occupied"], title='Unnormalized confusion matrix (Use house#02 Random Forest Model to predict house#03)')
# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=["unoccupied", "Occupied"], normalize=True, title='Normalized confusion matrix (Use house#02 Random Forest Model to predict house#03)')

scores = cross_validation.cross_val_score(clf,X,Y,cv=10)
f1_rf = f1_score(Y, y, average='micro')
print("scores by RandomForestClassifier: ")
print("score4.mean: " + str(np.mean(scores)) + " score4.var: " + str(np.var(scores)))
print("f1 score for RandomForest:")
print f1_rf
Beispiel #46
0
import pandas
from sklearn import cross_validation
from sklearn.linear_model import ElasticNet
url = "https://goo.gl/sXleFv"
names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
    'PTRATIO', 'B', 'LSTAT', 'MEDV'
]
dataframe = pandas.read_csv(url, delim_whitespace=True, names=names)
array = dataframe.values
X = array[:, 0:13]
Y = array[:, 13]
num_folds = 10
num_instances = len(X)
seed = 7
kfold = cross_validation.KFold(n=num_instances,
                               n_folds=num_folds,
                               random_state=seed)
model = ElasticNet()
scoring = 'mean_squared_error'
results = cross_validation.cross_val_score(model,
                                           X,
                                           Y,
                                           cv=kfold,
                                           scoring=scoring)
print(results.mean())
Beispiel #47
0
# -*- coding: utf-8 -*-
def log_regr(dataframe):
    df = dataframe.copy()
    # Print the first 5 rows of df
    print(df.head())
    y = df.radiant_win

    # Drop the columns that are absent from the test set
    delete_cols = [
        'radiant_win', 'duration', 'tower_status_radiant', 'tower_status_dire',
        'barracks_status_radiant', 'barracks_status_dire'
    ]
    df.drop(delete_cols, inplace=True, axis=1)

    # Check whether the data has missing values
    rows = len(df)
    data_with_skip = df.count()[df.count() != rows]
    print('Rows with empty values:')
    print(data_with_skip)

    # Fill in the missing values
    df = df.fillna(0)
    X = df

    # Delete categorical columns
    del_list = [
        'lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero',
        'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'
    ]
    df.drop(del_list, inplace=True, axis=1)
    X = df

    # Scaler
    scaler = StandardScaler()
    scaler.fit(X)
    X = scaler.transform(X)

    kf = KFold(y.size, n_folds=5, shuffle=True)
    c = 0.01
    clf = LogisticRegression(penalty='l2', C=c)
    clf.fit(X, y)
    start_time = datetime.datetime.now()
    scores = cross_validation.cross_val_score(clf,
                                              X,
                                              y,
                                              cv=kf,
                                              scoring='roc_auc')
    print 'Time elapsed:', datetime.datetime.now() - start_time
    mean = scores.mean()
    print(c, mean)

    # Find unique heroes
    data = pd.read_csv('data/data/features.csv', index_col='match_id')
    heroes_cols_list = [
        'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero',
        'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'
    ]
    uniq_heroes_list = set()
    for row in heroes_cols_list:
        for id in data[row].unique():
            uniq_heroes_list.add(id)
    print(uniq_heroes_list)
    print('Number of uniq heroes:{}'.format(len(uniq_heroes_list)))

    # Coding information about heroes
    # N is the number of distinct heroes in the sample
    N = 113  # count heroes in heroes.csv
    X_pick = np.zeros((df.shape[0], N))

    for i, match_id in enumerate(data.index):
        for p in xrange(5):
            X_pick[i, data.ix[match_id, 'r%d_hero' % (p + 1)] - 1] = 1
            X_pick[i, data.ix[match_id, 'd%d_hero' % (p + 1)] - 1] = -1

    X_2 = np.concatenate([X, X_pick], axis=1)

    # Scaler
    scaler = StandardScaler()
    scaler.fit(X_2)
    X_2 = scaler.transform(X_2)

    kf = KFold(y.size, n_folds=5, shuffle=True)
    c = 0.01
    clf = LogisticRegression(penalty='l2', C=c)
    clf.fit(X_2, y)
    start_time = datetime.datetime.now()
    scores = cross_validation.cross_val_score(clf,
                                              X_2,
                                              y,
                                              cv=kf,
                                              scoring='roc_auc')
    print 'Time elapsed:', datetime.datetime.now() - start_time
    mean = scores.mean()
    print(c, mean)
Beispiel #48
0
SC_X = StandardScaler()
X_Train = SC_X.fit_transform(X_Train)
X_Test = SC_X.fit_transform(X_Test)

# Fitting SVC
SVC_Model = SVC(kernel='rbf', random_state=0)
SVC_Model.fit(X_Train, Y_Train)

# Predicting the result sets
Y_Pred = SVC_Model.predict(X_Test)

# Confusion Matrix
cm = confusion_matrix(Y_Test, Y_Pred)

# Applying K-Fold Cross Validation
accuracies = cross_val_score(estimator=SVC_Model, X=X_Train, y=Y_Train, cv=10)
accuracies.mean()
accuracies.std()

# Applying grid search to find best parameters

parameters = [{
    'C': [1, 10, 100, 1000],
    'kernel': ['linear']
}, {
    'C': [1, 10, 100, 1000],
    'kernel': ['rbf'],
    'gamma': [0.5, 0.1, 0.01]
}]
GridSearch = GridSearchCV(param_grid=parameters,
                          estimator=SVC_Model,
Beispiel #49
0
__author__ = 'pratapdangeti'

import numpy as np
from sklearn.datasets import load_boston
from sklearn.linear_model import SGDRegressor
from sklearn.cross_validation import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
data = load_boston()
print(data)
x_train, x_test, y_train, y_test = train_test_split(data.data, data.target)

x_scaler = StandardScaler()
y_scaler = StandardScaler()
x_train = x_scaler.fit_transform(x_train)
# scalers expect 2-D input; reshape the targets, then flatten back to 1-D
y_train = y_scaler.fit_transform(y_train.reshape(-1, 1)).ravel()
x_test = x_scaler.transform(x_test)
y_test = y_scaler.transform(y_test.reshape(-1, 1)).ravel()

regressor = SGDRegressor(loss='squared_loss')
scores = cross_val_score(regressor, x_train, y_train, cv=5)
print('Cross Validation r-squared scores:', scores)
print('Average cross validation r-squared score', np.mean(scores))
regressor.fit(x_train, y_train)  # SGDRegressor has no fit_transform
print('Test set r-squared score', regressor.score(x_test, y_test))
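
Note: a minimal modern sketch of the same experiment (assuming scikit-learn >= 0.18 and < 1.2, where load_boston was removed). A Pipeline keeps the scaling inside each CV fold and avoids scaling y by hand, which R-squared does not need anyway:

from sklearn.datasets import load_boston  # removed in scikit-learn 1.2
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

data = load_boston()
x_train, x_test, y_train, y_test = train_test_split(data.data, data.target,
                                                    random_state=0)
pipe = make_pipeline(StandardScaler(), SGDRegressor())
print('Cross Validation r-squared scores:',
      cross_val_score(pipe, x_train, y_train, cv=5))
pipe.fit(x_train, y_train)
print('Test set r-squared score', pipe.score(x_test, y_test))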
Beispiel #50
0
# Random Forest Regressor
print "testing Regressor"
#print "size of training set: "+str(len(train_Y_R_decile))
for j in [10]:
    for i in [12]:
        #   clf = AdaBoostRegressor(n_estimators=j)
        clf = RandomForestRegressor(n_estimators=j,
                                    max_depth=i,
                                    min_samples_split=10,
                                    random_state=0)
        print "\tmax_depth= " + str(i)
        print "\tn_estimators= " + str(j)
        scores = cross_val_score(
            clf,
            X,
            Y_R_decile,
            scoring=lambda clf, X, Y: mean_squared_error(Y, clf.predict(X)))
        print "Random Forest Regressio mean cross validation score: " + str(
            scores.mean())
        print "\n\n"

        #        clf=GradientBoostingRegressor(n_estimators=100,learning_rate=i)
        clf.fit(train_X, train_Y_R_decile)
        #clf.fit(train_X,train_Y_R)
        Y_R_decile_predicted = list(clf.predict(test_X))
        MSE = mean_squared_error(Y_R_decile_predicted, test_Y_R_decile)
        print "\tMSE=" + str(MSE)
        heat_matrix = regression_acc_heat_map(Y_R_decile_predicted,
                                              test_Y_R_decile, num_buckets)
Beispiel #51
0
import numpy as np
from sklearn import datasets
from sklearn.cross_validation import KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cross_validation import cross_val_score
from sklearn.preprocessing import scale

data = datasets.load_boston()

x = scale(data['data'])
y = data['target']

kf = KFold(len(x), n_folds=5, shuffle=True, random_state=42)

res = {}

for i in np.linspace(1.0, 10.0, num=200):
    scores = cross_val_score(KNeighborsRegressor(n_neighbors=5,
                                                 weights='distance',
                                                 p=i,
                                                 metric='minkowski'),
                             x,
                             y,
                             scoring='mean_squared_error',
                             cv=kf)
    # 'mean_squared_error' scores are negated (greater is better),
    # so keep them as floats rather than formatted strings
    res[format(i, '.2f')] = scores.mean()

# the best p maximizes the negated-MSE score
val = max(res.items(), key=(lambda kv: kv[1]))
print(val)
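
Note: a sketch of the same search against the modern API (assuming the same scaled x and y as above; scikit-learn >= 0.18 spells the negated metric 'neg_mean_squared_error'):

import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neighbors import KNeighborsRegressor

kf = KFold(n_splits=5, shuffle=True, random_state=42)
best_p, best_score = None, -np.inf
for p in np.linspace(1.0, 10.0, num=200):
    scores = cross_val_score(KNeighborsRegressor(n_neighbors=5,
                                                 weights='distance',
                                                 p=p,
                                                 metric='minkowski'),
                             x, y, scoring='neg_mean_squared_error', cv=kf)
    if scores.mean() > best_score:
        best_p, best_score = p, scores.mean()
print(best_p, best_score)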
Beispiel #52
0
                                                    y,
                                                    test_size=.25,
                                                    random_state=1)

# Take a look at the shape
print('Taking a look at training and testing data shape')
print(x_train.shape, y_train.shape)
print('\n')

# Decision Tree Classifier
print('Decision Tree Classifier')
clf = DecisionTreeClassifier(random_state=1)

# Run 5 fold cross validation
print('Run 5 fold cross validation')
cvs = cross_val_score(clf, x, y, cv=5)
print(cvs)

# Show cross validation score mean and std
print('Show cross validation score mean and std')
print("Accuracy: %0.4f (+/- %0.4f)" % (cvs.mean(), cvs.std() * 2))

# Fit the model with data
clf.fit(x_train, y_train)

# Training accuracy
acc_decision_tree = round(clf.score(x_train, y_train), 4)
print("Accuracy: %0.4f" % (acc_decision_tree))

# Predict y given validation set
print('Predict y given validation set')
Beispiel #53
0
def run_rf(data, labels):
    clf = RandomForestClassifier(n_estimators=10)
    scores = cross_val_score(clf, data, labels)
    print(scores.mean())
Beispiel #54
0
# Now we switch to scikit learn
# We set the inverse regularizer, C, to infinity to make sure we're doing MLE
#http://stackoverflow.com/questions/24924755/logit-estimator-in-statsmodels-and-sklearn

model = LogisticRegression(fit_intercept=False, C=1e9)
y = np.ravel(y)
model = model.fit(X, y)
coef_patsy = np.ravel(model.coef_)
pd.DataFrame(list(zip(X.columns, [round(c, 3) for c in coef_patsy])))

#
# evaluate the model using 10-fold cross-validation
scores = cross_val_score(LogisticRegression(fit_intercept=False, C=1e9),
                         X,
                         y,
                         scoring='accuracy',
                         cv=10)
print(scores)
#[ 0.805  0.61   0.732  0.725  0.675  0.7    0.7    0.692  0.744  0.667]

print('average CV accuracy = {0:.2f}'.format(scores.mean()))  #0.70
print('baseline accuracy = {0:.2f}'.format(1 - y.mean()))  # 0.68

# Split data into train and test sets.
# We first shuffle the order of the rows (although this is done inside the
# train_test_split function, so is not strictly necessary).
np.random.seed(42)  # ensure reproducibility
#X = pd.DataFrame(np.random.randn(5,2))
#y = np.random.rand(5)
N = len(X)
Beispiel #55
0
            X_sample=samples_features_and_labels[sample]["X"]
            print "\t" +sample+": "+str(len(X_sample))+" points"
            tr_size= int(float(len(X_sample))*0.8)
            te_size= len(X_sample)-tr_size
            Y_C_sample_tr=Y_C_sample[0:tr_size]
            Y_C_sample_te=Y_C_sample[tr_size:tr_size+te_size]
            X_sample_tr=X_sample[0:tr_size]
            X_sample_te=X_sample[tr_size:tr_size+te_size]
            clf.fit(X_sample_tr,Y_C_sample_tr)
            pred=clf.predict(X_sample_te)
            accuracy=(100*(float(sum(Y_C_sample_te==pred)))/len(Y_C_sample_te))
            mean_acc.append(accuracy*(float(len(X_sample)/float(num_points))))
            print "\t"+sample+" test accuracy: "+str(accuracy)
        print "average sample accuracy:"+str(sum(mean_acc))
        '''
        scores = cross_val_score(clf, X, Y_C)
        print "RandomForest mean cross validation score: " + str(scores.mean())

#if only two dimensions are being used, we can plot two-dimensionally
if False:
    # Plot decision bondary based on first two dimensions.
    def extrema(lol, index, reverse):  # list of lists
        return sorted(lol, key=operator.itemgetter(index),
                      reverse=reverse)[0][index]

    sorted_feature_importances = sorted(feature_importances, reverse=True)
    x_idx = np.where(
        feature_importances == sorted_feature_importances[0])[0][0]
    y_idx = np.where(
        feature_importances == sorted_feature_importances[1])[0][0]
Beispiel #56
0
    best_poly=0
    
    for j in range(0, len(poly)):    
        for times in range(0,3):
        
            results=np.zeros(len(exploreC))
            for i in range(0,len(exploreC)):

                svm_poly_model=svm.SVC(exploreC[i], kernel='poly', 
                                         degree=poly[j],shrinking=True, 
                                         probability=False , cache_size=2000, 
                                         verbose=False, max_iter=-1, 
                                         decision_function_shape='ovr' , random_state=0)
            
                scores_cv=cross_validation.cross_val_score(svm_poly_model, X, 
                                                        target_train, 
                                                        scoring='f1_weighted', cv=5, 
                                                        n_jobs=-1)
                
                results[i]=float(scores_cv.mean())
                print "run for C="+str(exploreC[i])+", poly="+str(poly[j])+" is "+str(results[i])
            print results
            best_val=np.max(results)
            print "Best result of iteration was "+str(best_val)    
            results=results.tolist()
            index_val = results.index(best_val)
            if best_val>best_err:
                best_err=best_val
                best_c=exploreC[index_val]
                best_poly=poly[j]    
            print "CV averages for values "+str(exploreC)+" are:"+str(results)
            print "Best C is"+str( exploreC[index_val])
Beispiel #57
0
    #,'Parch'
    #,'Mother'
]]
deck_train_df = all_deck_train_df.loc[all_deck_train_df['DeckId'] != 0].copy()
deck_null_df = all_deck_train_df.loc[all_deck_train_df['DeckId'] == 0].copy()
deck_target_df = deck_train_df['DeckId'].copy()
deck_train_df.drop(['DeckId'], axis=1, inplace=True)
deck_null_df.drop(['DeckId'], axis=1, inplace=True)

# Random Forest Classifier
print 'Training Deck model...'
deck_train_model = RandomForestClassifier(n_estimators=100)

# Cross validation
scores = cross_validation.cross_val_score(deck_train_model,
                                          deck_train_df,
                                          deck_target_df,
                                          cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Predict
print 'Predicting Deck...'
deck_train_model = deck_train_model.fit(deck_train_df, deck_target_df)
deck_train_result = deck_train_model.predict(deck_null_df)
main_all_df.loc[main_all_df['DeckId'] == 0, 'DeckId'] = deck_train_result
print 'Done.'

plt.figure(figsize=(8, 4))
plt.title('DeckId (after)')
plt.xlabel('Value')
main_all_df['DeckId'].plot.hist()
Beispiel #58
0
    encoded_y_train = label_encoder.fit_transform(y_train)

    xgb = XGBClassifier(max_depth=args.max_depth,
                        learning_rate=args.learning_rate,
                        n_estimators=args.n_estimators,
                        objective="multi:softprob",
                        gamma=0,
                        min_child_weight=1,
                        max_delta_step=0,
                        subsample=args.subsample,
                        colsample_bytree=args.colsample_bytree,
                        colsample_bylevel=args.colsample_bylevel,
                        reg_alpha=0,
                        reg_lambda=1,
                        scale_pos_weight=1,
                        base_score=0.5,
                        missing=None,
                        silent=True,
                        nthread=-1,
                        seed=42)

    kf = KFold(len(x_train), n_folds=10, random_state=42)

    score = cross_val_score(xgb,
                            x_train,
                            encoded_y_train,
                            cv=kf,
                            scoring=ndcg_scorer)

    print(xgb.get_params(), score.mean())
Beispiel #59
0
    C = np.linspace(300, 5000, num = 10)[::-1]
    models = [lm.LogisticRegression(penalty = "l1", C = c) for c in C]

if modelname == "sgd": 
    C = np.linspace(0.00005, .01, num = 5)
    models = [lm.SGDClassifier(loss = "log", penalty = "l2", alpha = c, warm_start = False) for c in C]
    
if modelname == "randomforest":
    C = np.linspace(50, 300, num = 10)
    models = [RandomForestClassifier(n_estimators = int(c)) for c in C]

print "calculating cv scores"
cv_scores = [0] * len(models)
for i, model in enumerate(models):
    # for all of the models, save the cross-validation scores into the array cv_scores
    cv_scores[i] = np.mean(cross_validation.cross_val_score(model, X, y, cv=5, scoring = auc_scorer))
    #cv_scores[i] = np.mean(cross_validation.cross_val_score(model, X, y, cv=5, score_func = auc))
    print " (%d/%d) C = %f: CV = %f" % (i + 1, len(C), C[i], cv_scores[i])

# find which model and C is the best
best = cv_scores.index(max(cv_scores))
best_model = models[best]
best_cv = cv_scores[best]
best_C = C[best]
print "BEST %f: %f" % (best_C, best_cv)

print "training on full data"
# fit the best model on the full data
best_model.fit(X, y)

print "prediction"
Beispiel #60
0
    cross_val_data[score] = {'mean': [] , 'std': []}

roc_data = dict()
pr_data = dict()
det_data = dict()
for n in CONFIG['n_estimators']:
    roc_data[n] = dict()
    pr_data[n] = dict()
    det_data[n] = dict()


for n in CONFIG['n_estimators']: # loop over the number of trees
    # cross-validation with different metrics
    model_rfc = RandomForestClassifier(n_estimators = n) #,max_depth = n)
    for score in CONFIG['scorings']: # loop over the metrics
        values = cross_validation.cross_val_score(model_rfc, X, y, cv=kf, scoring=score)
        mean = values.mean()
        std = values.std()
        cross_val_data[score]['mean'].append(mean)
        cross_val_data[score]['std'].append(std)
        print('N estimators = %d, scoring = \'%s\', mean value = %f, std value = %f' % (n,score,mean,std))
    # train the classifier itself
    proba = model_rfc.fit(X_train, y_train).predict_proba(X_test)
    # ROC curve
    fpr, tpr, thresholds = roc_curve(y_test, proba[:, 1])
    print("Y: ", y_test,"PROBA: ", proba)
    roc_auc  = auc(fpr, tpr)
    roc_data[n]['fpr'] = fpr
    roc_data[n]['tpr'] = tpr
    roc_data[n]['roc_auc'] = roc_auc
    # PR curve