Example #1
parameter_grid = [{
    'kernel': ['linear'],
    'C': [1, 10, 50, 600]
}, {
    'kernel': ['poly'],
    'degree': [2, 3]
}, {
    'kernel': ['rbf'],
    'gamma': [0.01, 0.001],
    'C': [1, 10, 50, 600]
}]

metrics = ['precision', 'recall_weighted']

for metric in metrics:
    print "\n### Searching optimal hyperparameters for ", metric

    classifier = grid_search.GridSearchCV(svm.SVC(C=1),
                                          parameter_grid,
                                          cv=5,
                                          scoring=metric)
    classifier.fit(X_train, y_train)

    print('----- measure scores -----')
    print("\nScores across the parameter grid:")
    for params, avg_score, _ in classifier.grid_scores_:
        print(params, '-->', round(avg_score, 3))

    print "\n Higtest scoring parameter set: ", classifier.best_params_

    y_pred = classifier.predict(X_test)
    print "\nFull performance report:\n"
    print classification_report(y_test, y_pred)
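
Note: every listing on this page uses the legacy sklearn.grid_search module, which was deprecated in scikit-learn 0.18 and removed in 0.20 together with the grid_scores_ attribute. A minimal sketch of the same search against the current API (assuming scikit-learn >= 0.20; X_train and y_train are the same placeholders as above):

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Same three sub-grids as in the listing above.
parameter_grid = [{'kernel': ['linear'], 'C': [1, 10, 50, 600]},
                  {'kernel': ['poly'], 'degree': [2, 3]},
                  {'kernel': ['rbf'], 'gamma': [0.01, 0.001], 'C': [1, 10, 50, 600]}]

classifier = GridSearchCV(SVC(), parameter_grid, cv=5, scoring='precision')
classifier.fit(X_train, y_train)

# cv_results_ replaces grid_scores_: parallel arrays keyed by statistic.
for params, mean in zip(classifier.cv_results_['params'],
                        classifier.cv_results_['mean_test_score']):
    print(params, '-->', round(mean, 3))
print('Best parameter set:', classifier.best_params_)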
Example #2
    csp[0:num_pair,:] = W[0:num_pair,:] # take the first few rows of the projection matrix
    csp[num_pair:,:] = W[np.shape(W)[1]-num_pair:,:] # correspondingly take the last few rows

    feat_train = feat_Generator(eegwin_0_train, eegwin_1_train)
# In[Train the classifier on the training-set features]
    parameter_grid = [  {'kernel': ['linear'], 'C': [10 ** x for x in range(-1, 4)]},
                        {'kernel': ['poly'], 'degree': [2, 3]},
                        {'kernel': ['rbf'], 'gamma': [0.01, 0.001], 'C': [10 ** x for x in range(-1, 4)]},
                     ]

    feat_train_X = feat_train[:,:-1]
    feat_train_y = feat_train[:,-1]

    print("\n#### Searching optimal hyperparameters for precision")

    classifier = grid_search.GridSearchCV(svm.SVC(), 
                                          parameter_grid, cv=5, scoring="accuracy")
    classifier.fit(feat_train_X, feat_train_y)

    print("\nScores across the parameter grid:")
    for params, avg_score, _ in classifier.grid_scores_:
        print(params, '-->', round(avg_score, 4))
    print("\nHighest scoring parameter set:", classifier.best_params_)
    print("\nHighest performance in training set:", classifier.best_score_)
    train_avgacc = train_avgacc + classifier.best_score_
# In[Test the classifier on the test set]
    eegwin_0_test, eegwin_1_test = task_Generator(X_test, y_test)
    feat_test = feat_Generator(eegwin_0_test, eegwin_1_test)

    feat_test_X = feat_test[:,:-1]
    feat_test_y = feat_test[:,-1]
Example #3
    m = X.shape[0]
    rand_index = np.random.permutation(m)
    split = int(0.9 * m)  # slice indices must be integers
    X_train = X[rand_index[:split], :]
    X_test = X[rand_index[split:], :]
    y_train = y[rand_index[:split]]
    y_test = y[rand_index[split:]]

    clf = SVC_1(C=1000, kernel='rbf')

    cv = cross_validation.KFold(X_train.shape[0], n_folds=6)

    C_vec = np.logspace(-1, 1, 10)
    param_grid = dict()
    param_grid['C'] = C_vec
    gs = grid_search.GridSearchCV(estimator=clf, param_grid=param_grid, cv=cv)
    gs.fit(X_train, np.ravel(y_train))
    print(gs.best_params_)
    print(gs.best_estimator_.C)
    print(gs.best_score_)

    scores = np.zeros(C_vec.shape)
    for i in range(len(C_vec)):
        fold_scores = []
        for train_indices, test_indices in cv:
            clf.set_params(C=C_vec[i])
            clf.fit(X_train[train_indices, :],
                    np.ravel(y_train[train_indices]))
            fold_scores.append(clf.score(X_train[test_indices, :],
                                         np.ravel(y_train[test_indices])))
        # average over folds instead of keeping only the last fold's score
        scores[i] = np.mean(fold_scores)
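
The manual per-C loop above is what validation_curve automates; a minimal sketch of the same sweep, assuming the modern sklearn.model_selection module:

from sklearn.model_selection import validation_curve

# One row per C value; columns are the per-fold scores.
train_scores, test_scores = validation_curve(
    clf, X_train, np.ravel(y_train),
    param_name='C', param_range=C_vec, cv=6)
print(test_scores.mean(axis=1))  # mean validation score per C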
Example #4
X_train = titanic_df.drop("Survived",axis=1)
Y_train = titanic_df["Survived"]
X_test  = test_df.drop("PassengerId",axis=1).copy()
print(X_train)

# Support Vector Machines

from sklearn import grid_search
param_range = [0.0001,0.0005,0.001, 0.01, 0.1, 1.0]
parameters = {
            'C':[1e4,1e5,1e6],
            'gamma':[0.00001,0.0001,0.0005,0.001]
            }
clf = SVC()
model = grid_search.GridSearchCV(estimator=clf,param_grid=parameters,cv=5,scoring='accuracy')
model = model.fit(X_train,Y_train)
print(model.best_score_)
print(model.best_params_)

svc = SVC(C=model.best_params_['C'],gamma=model.best_params_['gamma'])

svc.fit(X_train, Y_train)

Y_pred = svc.predict(X_test)

print(Y_pred)
print(svc.score(X_train, Y_train))

# Random Forests
# Let's try manually selecting payment features
features_manual = ["poi", "salary", "bonus", 
                 'deferral_payments', 'total_payments', 'loan_advances', 'restricted_stock_deferred',
                 'deferred_income', 'total_stock_value', 'expenses', 'exercised_stock_options',
                 'long_term_incentive', 'restricted_stock', 'director_fees']
features_train, features_test, labels_train, labels_test = train_test_data(features_manual)

clf4 = DecisionTreeClassifier()
print('*** Decision tree algorithm with only payment features ***')
print(test_algorithm(clf4, features_train, features_test), '\n')

## Parameter tuning
# Let's rerun the classifier with different parameter settings to see
# whether any other configuration achieves a better result
features_list = ["poi", "salary", "bonus", "fraction_from_poi_email", "fraction_to_poi_email",
                 'total_payments', 'total_stock_value', 'expenses', 'exercised_stock_options',
                 'shared_receipt_with_poi', 'restricted_stock']
features_train, features_test, labels_train, labels_test = train_test_data(features_list)

parameters = {'criterion':('gini', 'entropy')}
dtc = DecisionTreeClassifier()
clf5 = grid_search.GridSearchCV(dtc, parameters)
print('Run Decision Tree classifier with GridSearchCV')
print(test_algorithm(clf5, features_train, features_test), '\n')
	
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(clf5, my_dataset, features_list)
Example #6
# In[]
#eeg_data = sio.loadmat('CutedEEG.mat')['CutedEEG']
#gait_data = gait_mat_data['FilteredMotion'][0] # each element is one walking trial per subject; each trial records both knee-angle trajectories, right knee then left knee
feats_all = sio.loadmat('features.mat')['features']

parameter_grid = [  {'kernel': ['linear'], 'C': [10 ** x for x in range(-1, 4)]},
                    {'kernel': ['poly'], 'degree': [2, 3]},
                    {'kernel': ['rbf'], 'gamma': [0.01, 0.001], 'C': [10 ** x for x in range(-1, 4)]},
                 ]

X = feats_all[:,:-1]
y = feats_all[:,-1]

print("\n#### Searching optimal hyperparameters for precision")
classifier = grid_search.GridSearchCV(svm.SVC(), 
            parameter_grid, cv=5, scoring='precision_weighted')
classifier.fit(X, y) # train directly on the data collected in real time; no test split is held out, testing is done on online data

print("\nScores across the parameter grid:")
for params, avg_score, _ in classifier.grid_scores_:
    print(params, '-->', round(avg_score, 3))
print("\nHighest scoring parameter set:", classifier.best_params_)

#joblib.dump(classifier, time.strftime('%Y_%m_%d_%H_%M_%S',time.localtime(time.time()))+"_SVM.m") # save the trained classifier, named with the current timestamp
joblib.dump(classifier, "SVM.m") # save the trained classifier

# In[]
#max_accuracy = 0
#count = 10.0 # number of randomized accuracy evaluations
#num_feats = len(feats_all)
#ave_accuracy, ave_f1, ave_precision, ave_recall = [],[],[],[]
# Define the parameter grid
parameter_grid = [{
    'n_estimators': [100],
    'max_depth': [2, 4, 7, 12, 16]
}, {
    'max_depth': [4],
    'n_estimators': [25, 50, 100, 250]
}]

metrics = ['precision_weighted', 'recall_weighted']

for metric in metrics:
    print("\n##### Searching optimal parameters for", metric)

    classifier = grid_search.GridSearchCV(ExtraTreesClassifier(random_state=0),
                                          parameter_grid,
                                          cv=5,
                                          scoring=metric)
    classifier.fit(X_train, y_train)

    print("\nGrid scores for the parameter grid:")
    for params, avg_score, _ in classifier.grid_scores_:
        print(params, '-->', round(avg_score, 3))

    print("\nBest parameters:", classifier.best_params_)

    y_pred = classifier.predict(X_test)
    print("\nPerformance report:\n")
    print(classification_report(y_test, y_pred))
Example #8
                                               y_train,
                                               cv=5,
                                               scoring='roc_auc')
print('score_cross:', round(np.mean(scores_cross), 5), 'std:',
      round(np.std(scores_cross), 5))

# grid search on max_depth and min_child_weight
param_test1 = {'max_depth': [3, 5, 7, 9], 'min_child_weight': [1, 3, 5]}
gsearch1 = grid_search.GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=424,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=314),
                                    param_grid=param_test1,
                                    scoring='roc_auc',
                                    iid=False,
                                    cv=5)
gsearch1.fit(X_train, y_train)
print(gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_)

param_test2 = {'max_depth': [6, 7, 8], 'min_child_weight': [4, 5, 6]}
gsearch2 = grid_search.GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=424,
    max_depth=7,
Example #9
#features = df_results.drop('result', axis=1).fillna(value=np.finfo(np.float32).min + 1).values
#features = df_results.drop('result', axis=1).fillna(method='ffill').fillna(method='bfill').values
features = df_results.drop('result', axis=1).fillna(value=0).values
scaled_features = scale(features, axis=0)
target = df_results['result'].values



kf = KFold(scaled_features.shape[0], n_folds=5, shuffle=True, random_state=42)

Cs = [10**x for x in range(-5, 6)]

lr = LogisticRegression()

clf = grid_search.GridSearchCV(estimator=lr, param_grid=dict(C=Cs), n_jobs=4, cv=kf, scoring='roc_auc')
clf.fit(scaled_features, target)


df_lines_test = pd.read_sql('''select match_ref, l.house_ref
  , MAX(CASE WHEN l.is_it_starting = 1 THEN l.line_value END) start_value
  , MAX(CASE WHEN l.is_it_starting = 0 THEN l.line_value END) next_value
  --, MAX(CASE WHEN l.is_it_starting = 0 THEN l.line_increment END) line_increment
from Lines l 
where 1=1
--and match_ref = 1754 
and TS_ref = 1880
and RTV_Ref = 1 
GROUP BY match_ref, l.house_ref
order by match_ref, l.house_ref''', conn)
Example #10
            ],
            transformer_weights={
                'cst': 1.0,
                'txt1': 0.5,
                'txt2': 0.25,
                'txt3': 0.0,
                'txt4': 0.5
            },
            #n_jobs = -1
        )),
    ('rfr', rfr)
])
param_grid = {'rfr__max_features': [10], 'rfr__max_depth': [20]}
model = grid_search.GridSearchCV(estimator=clf,
                                 param_grid=param_grid,
                                 n_jobs=1,
                                 cv=2,
                                 verbose=20,
                                 scoring=RMSE)

t0 = time()
print "Begin training"
model.fit(X_train, y_train)
t1 = time()
print "Training complete: ", t1 - t0, "s"

print("Best parameters found by grid search:")
print(model.best_params_)
print("Best CV score:")
print(model.best_score_)
print(model.best_score_ + 0.47003199274)
Example #11
    #print(score2)
    del pred
    del y_test_array
    del score
    #pred_test=clf.predict_proba(test.drop(['id'], axis=1))
    #return pred_test
    
#grid search
from sklearn import grid_search

paramRF = {'n_estimators':[100], 'criterion':('gini', 'entropy'), 'max_depth':[3,4,5,10,15,20]}
paramET = {'n_estimators':[100], 'criterion':('gini', 'entropy'), 'max_depth':[3,4,5,10,15,20]}
paramXG = {'n_estimators':[100], 'learning_rate':[0.1], 'reg_alpha':[0],'colsample_bytree':[0.1],'colsample_bylevel':[0.1],'max_depth':[5]}
param_DT = {'criterion':('gini', 'entropy'), 'max_depth':[3,4,5,10,15,20], 'max_features':[None,'auto','sqrt']}
# 'reg_alpha':[0.2,0.3,0.5,0.7], 'reg_lambda':[0,1,5,10]
clfRF = grid_search.GridSearchCV(RandomForestClassifier() , paramRF, cv=2, scoring='log_loss')
clfRF.fit( x_train , y_train )
clfRF.best_estimator_
log_loss(y_test, clfRF.predict_proba(x_test))

clfET = grid_search.GridSearchCV(ExtraTreesClassifier() , paramET, cv=2)
clfET.fit( x_train , y_train )
clfET.best_estimator_
log_loss(y_test, clfET.predict_proba(x_test))

clfXG = grid_search.GridSearchCV(xgb.XGBClassifier() , paramXG, cv=2, scoring='log_loss')
clfXG.fit( x_train , y_train )
clfXG.best_estimator_
log_loss(y_test, clfXG.predict_proba(x_test))

clfDT = grid_search.GridSearchCV(DecisionTreeClassifier() , param_DT, cv=2)
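
Note: the 'log_loss' scoring string used above was renamed in scikit-learn 0.18; current versions expect the negated form, where greater is better:

# Modern spelling of the same scorer (assumes sklearn.model_selection.GridSearchCV).
clfRF = GridSearchCV(RandomForestClassifier(), paramRF, cv=2, scoring='neg_log_loss')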
Example #12
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

# from sklearn.pipeline import Pipeline
from sklearn import svm, grid_search
svr = svm.SVC()
# change default gamma to 1/n_features
parameters = {'kernel':('linear', 'rbf', 'poly'), 'C':[1, 10, 100], 
              'gamma':[0.0625, 1, 10], 'degree':[4, 5, 6, 7, 8]}
clf_GridSearch = grid_search.GridSearchCV(svr, parameters, scoring='f1')
clf_GridSearch.fit(features_train, labels_train)
clf_GridSearch.best_estimator_

clf = svm.SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=4, gamma=0.0625, kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)



### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
print("Accuracy score on test data on first run")
print(accuracy_score(y_test, preds))
print("F-score")
print(fbeta_score(y_test, preds, beta=0.5))

# In[250]:

# trying optimization with grid_search
from sklearn import tree, grid_search
from sklearn.metrics import fbeta_score, make_scorer, accuracy_score

scorer = make_scorer(fbeta_score, beta=0.5)
parameters = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
grid_obj = grid_search.GridSearchCV(clf, parameters, verbose=1, scoring=scorer)
#grid_fit = grid_obj.fit(X_train, y_train)

#best_clf = grid_fit.best_estimator_
#best_predictions = best_clf.predict(X_test)
#print("Best Clf's Accuracy score on test data on first run")
#print(accuracy_score(y_test, best_predictions))
#print("Best Clf's F-score")
#print(fbeta_score(y_test, best_predictions, beta=0.5))

# In[251]:

Y_pred = clf.predict(submission_samples)
submission = pd.DataFrame({
    "PassengerId": test_df["PassengerId"],
    "Survived": Y_pred
Example #14
param_grid = [
    {
        'C': [0.01, 0.1, 1, 10, 100, 1000],
        'kernel': ['linear']
    },
    {
        'C': [0.01, 0.1, 1, 10, 100, 1000],
        'gamma': [0.001, 0.0001],
        'kernel': ['rbf']
    },
]

GRIDSEARCH = False
if GRIDSEARCH:
    clf = grid_search.GridSearchCV(svm.SVC(), param_grid, verbose=10)
    clf.fit(X, y)

    with open('best_estimator', 'wb') as f:
        cPickle.dump(clf.best_estimator_, f)

else:
    cv = cross_validation.ShuffleSplit(len(y), n_iter=1, test_size=0.2)
    for train, test in cv:
        train_X = X[train]
        train_y = y[train]

        test_X = X[test]
        test_y = y[test]
        test_image = images[test]
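
The snippet breaks off inside the evaluation branch. For the grid-search branch, the pickled best estimator can be loaded back in a later session; a minimal sketch using the same cPickle file as above:

# Restore the persisted estimator and reuse it for prediction.
with open('best_estimator', 'rb') as f:
    best_clf = cPickle.load(f)
print(best_clf.predict(X[:5]))  # X as defined in the listing above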
Example #15
    train = train.astype(float)
    test = test.astype(float)

    #0.614773724081
    #tune parameters
    #'max_features': 'sqrt', 'min_samples_split': 5, 'learning_rate': 0.2, 'n_estimators': 100, 'max_depth': 6}

    gbm = ensemble.GradientBoostingClassifier(random_state=42)
    params = [{
        'n_estimators': [75, 100, 125],
        'min_samples_split': [5, 10],
        'max_depth': [6, 8],
        'max_features': ['sqrt'],
        'learning_rate': [0.2]
    }]
    clf = grid_search.GridSearchCV(gbm, params, verbose=1, n_jobs=-1)

    # nested cross-validation: the grid search runs inside each outer fold
    print("k-Fold F1:")
    cv_f1 = cross_validation.cross_val_score(clf, train, y, scoring='f1')
    print(cv_f1)

    print("Mean: " + str(cv_f1.mean()))

    # get predictions on test
    clf.fit(train, y)

    # get predictions from the model, convert them and dump them!
    preds = clf.predict(test)
    preds = pd.DataFrame({"'Search ID'": id, "cost": preds})
Example #16
import numpy as np
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.cross_validation import KFold
from sklearn import grid_search
from sklearn.feature_extraction.text import TfidfVectorizer
newsgroups = datasets.fetch_20newsgroups(
    subset='all', categories=['alt.atheism', 'sci.space'])
vectorizer = TfidfVectorizer()
data = vectorizer.fit_transform(newsgroups.data)
features = data
true = newsgroups.target
grid = {'C': np.power(10.0, np.arange(-5, 6))}
cv = KFold(true.size, n_folds=5, shuffle=True, random_state=241)
clf = SVC(kernel='linear', random_state=241)
gs = grid_search.GridSearchCV(clf, grid, scoring='accuracy', cv=cv)
gs.fit(features, true)
best_score = gs.best_score_
best_C = gs.best_estimator_.C
print(best_score)
print(best_C)
model = SVC(C=best_C, kernel='linear', random_state=241)
model.fit(features, true)
coef0 = model.coef_.toarray()[0]
values = abs(coef0)
top10 = np.argsort(values)[-10:]
#coefabs = abs(model.coef_.data)
#print(coefabs)
#coefabssort = np.argsort(coefabs)[-10:]
feature_mapping = vectorizer.get_feature_names()
wr = []
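
The listing stops right after wr = []; presumably the loop that follows maps the ten largest-|coefficient| indices back to their words. A sketch of the usual completion (the alphabetical ordering is an assumption):

# feature_mapping[i] is the token behind column i of the tf-idf matrix.
for idx in top10:
    wr.append(feature_mapping[idx])
print(','.join(sorted(wr)))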
Example #17
_label_data = {}
for i, val in _data.items():
    label = val['label']
    data = val['data']
    if label not in _label_data:
        _label_data[label] = []
    _label_data[label].append(data)

NUM_EXAMPLES = len(_data)
print(NUM_EXAMPLES, len(_label_data))
print('done')

# ============= Perform SVM classification =============

parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
svr = svm.SVC()
clf = grid_search.GridSearchCV(svr, parameters)

X = []
Y = []
for k, v in _data.items():
    X.append(np.array(v['data']).flatten())
    Y.append(v['label'])

print('fitting...')
clf.fit(X, Y)

# ============= Make predictions =============

test_vectors = []
with open('competition_2/test.data', 'r') as test_data:
    for i, line in enumerate(test_data):
fore_rf.fit(X_train, y_train)

if fold > 0:
    print('cv: ' + str(np.mean(cross_val_score(fore_rf, X_train, y_train, cv=fold))))
else: 
    print('no cv scores')
"""

param_grid = {
    'n_estimators': [10**1, 10**4],
    'max_depth': [2, 4, 10**5],
    'min_samples_leaf': [1, 100, 1000]
}

clf = RandomForestClassifier()
grid_clf = grid_search.GridSearchCV(clf, param_grid, cv=fold, verbose=5)
grid_clf.fit(X, y)

print('best params:' + str(grid_clf.best_params_))
print('best params:' + str(grid_clf.best_score_))

# Part 3 - Making the predictions and evaluating the model
# Predicting the Test set results

totalpred = grid_clf.predict(X)
dataset3['expected'] = totalpred
#del X, y,totalpred, X_train, X_test, y_train, y_test

from sklearn.metrics import confusion_matrix

dataset3['motion_expected'] = dataset3[
Example #19
        data = scipy.io.loadmat(file)

        ytrain = data['Ytrain'].T.reshape(data['Ytrain'].shape[1])

        x_train, x_val, y_train, y_val = cross_validation.train_test_split(
            data['Xtrain'], ytrain, test_size=0.2, random_state=0)
        tuned_parameters = [
            #{'alpha': [0.15]}
            {
                'alpha': [0.2]
            }
        ]

        print "-- TRAINNING: grid search with 5 fold cross-validation"
        clf = grid_search.GridSearchCV(BernoulliNB(),
                                       tuned_parameters,
                                       cv=10,
                                       scoring='accuracy')
        clf.fit(x_train, y_train)

        print "score : " + str(clf.best_score_)
        print "params : " + str(clf.best_params_)
        parametros.append(clf.best_params_)
        for params, mean_score, scores in clf.grid_scores_:
            print(str(mean_score) + " " + str(scores) + " " + str(params))

        y_true, y_pred = y_val, clf.predict(x_val)
        score = accuracy_score(y_true, y_pred)
        total_score += score

        cm = confusion_matrix(y_true, y_pred)
        total = numpy.sum(cm, axis=1)
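
The row sums in total are typically the next step toward per-class accuracy, normalising each confusion-matrix row by the number of true samples of that class; a sketch of that elided step (an assumption, not part of the source):

# Diagonal = correct predictions per class; divide by per-class sample counts.
per_class_accuracy = numpy.diag(cm) / total.astype(float)
print(per_class_accuracy)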
Example #20
#!/usr/bin/python3

import numpy as np
import json

from sklearn import cross_validation, svm, grid_search

settings = json.loads(open('settings/grid-search_.json', 'r').read())
data = np.load('bin/train_data.npy')
labels = np.load('bin/train_labels.npy')

data = data[labels == 1]

if 'test_size' in settings.keys():
    data, _ = cross_validation.train_test_split(
        data, test_size=settings['test_size'])
    del settings['test_size']

print('Training sample shape: {}'.format(data.shape))

kernel_params = settings['params']
del settings['params']

estimator = svm.OneClassSVM()
classifier = grid_search.GridSearchCV(estimator, kernel_params, **settings)
model = classifier.fit(data)

print('Best params: {}'.format(str(model.best_params_)))
Example #21
def train(training_path_a,
          training_path_b,
          training_path_c,
          training_path_d,
          training_path_e,
          print_metrics=True):
    '''Trains a classifier. training_path_a through training_path_e should be
    directory paths, and none of them should be a subdirectory of any other.
    Each path is processed by process_directory().

    Args:
      training_path_a (str): directory containing sample images of class A.
      training_path_b (str): directory containing sample images of class B.
      training_path_c (str): directory containing sample images of class C.
      training_path_d (str): directory containing sample images of class D.
      training_path_e (str): directory containing sample images of class E.
      print_metrics  (boolean, optional): if True, print statistics about
        classifier performance.

    Returns:
      A classifier (sklearn.svm.SVC).
    '''
    if not os.path.isdir(training_path_a):
        raise IOError('%s is not a directory' % training_path_a)
    if not os.path.isdir(training_path_b):
        raise IOError('%s is not a directory' % training_path_b)
    if not os.path.isdir(training_path_c):
        raise IOError('%s is not a directory' % training_path_c)
    if not os.path.isdir(training_path_d):
        raise IOError('%s is not a directory' % training_path_d)
    if not os.path.isdir(training_path_e):
        raise IOError('%s is not a directory' % training_path_e)
    training_a = process_directory(training_path_a)
    training_b = process_directory(training_path_b)
    training_c = process_directory(training_path_c)
    training_d = process_directory(training_path_d)
    training_e = process_directory(training_path_e)
    # data contains all the training data (a list of feature vectors)
    data = training_a + training_b + training_c + training_d + training_e
    # target is the list of target classes for each feature vector: 4 for
    # class A down to 0 for class E
    target = [4] * len(training_a) + [3] * len(training_b) + [2] * len(
        training_c) + [1] * len(training_d) + [0] * len(training_e)
    # split the training data into a train set and a test set. The test set
    # will contain 20% of the total
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(
        data, target, test_size=0.20)
    # define the parameter search space
    parameters = {
        'kernel': ['linear', 'rbf'],
        'C': [1, 10, 100, 1000],
        'gamma': [0.01, 0.001, 0.0001]
    }
    # search for the best classifier within the search space and return it
    clf = grid_search.GridSearchCV(svm.SVC(probability=True),
                                   parameters).fit(x_train, y_train)

    ### save to model local dir ###
    joblib.dump(clf, 'sport_classification.pkl')
    ###                         ###

    classifier = clf.best_estimator_
    if print_metrics:
        print()
        print('Parameters:', clf.best_params_)
        print()
        print('Best classifier score')
        print(metrics.classification_report(y_test,
                                            classifier.predict(x_test)))
    return classifier
Example #22
featurelist = list(dataclean.columns.values)
featurelist.remove('Id')
featurelist.remove('Response')
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
    dataclean[featurelist],
    dataclean['Response'],
    test_size=0.1,
    random_state=42)

param_grid = {
    "criterion": ["gini", "entropy"],
    "min_samples_split": [2, 4],
    "max_depth": [None, 2, 4],
    "min_samples_leaf": [1, 3, 5],
    "class_weight": ["balanced", "balanced_subsample"]
}

modeloptimal = grid_search.GridSearchCV(estimator=RandomForestClassifier(),
                                        param_grid=param_grid,
                                        scoring='f1',
                                        cv=5)
modeloptimal.fit(features_train, labels_train)

clf = modeloptimal.best_estimator_

pred = clf.predict(features_test)

accuracy = accuracy_score(labels_test, pred)

print(accuracy)
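
Note: if Response has more than two classes, the plain 'f1' scorer raises an error in current scikit-learn; a weighted average over classes is the usual multiclass-safe choice:

# 'f1_weighted' averages per-class F1 scores weighted by class support.
modeloptimal = grid_search.GridSearchCV(estimator=RandomForestClassifier(),
                                        param_grid=param_grid,
                                        scoring='f1_weighted',
                                        cv=5)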
Example #23
corpusts = testdf['ingredients_string']
vectorizerts = TfidfVectorizer(stop_words='english')  # unused: the test corpus must be transformed by the train-fitted vectorizer
tfidfts = vectorizertr.transform(corpusts)

predictors_tr = tfidftr

targets_tr = traindf['cuisine']

predictors_ts = tfidfts

# LR, SVC
#classifier = LinearSVC(C=0.80, penalty="l2", dual=False)  # dead: overwritten by the grid search below
#clf = LinearSVC()  # dead: overwritten by LogisticRegression
parameters = {'C': [1, 10]}
clf = LogisticRegression()
classifier = grid_search.GridSearchCV(clf, parameters)
classifier=classifier.fit(predictors_tr,targets_tr)

#decision trees
#clf = tree.DecisionTreeClassifier()
#parameters = {'max_depth':[100]}
#classifier=clf.fit(predictors_tr,targets_tr)

predictions_train = classifier.predict(predictors_tr)

predictions=classifier.predict(predictors_ts)
for i in range(0,predictions.size):
    predictions[i] = str(predictions[i])
for i in range(0,predictions_train.size):
    predictions_train[i] = str(predictions_train[i])
Example #24
    def serialize_stem_silk(self):

        start_time = time.time()

        file_name = self.file_name  #define the variables
        gold_standard_name = self.gold_standard_name
        N = int(self.N)
        a = float(self.a)

        path_to_file = gold_standard_name  #data/your_experiment/gs/gs.csv

        path_to_file = path_to_file.split('/gs/')

        path_to_file = path_to_file[0] + '/'  #data/your_experiment/

        path_to_config_file = file_name.split('/')
        path_to_config_list = path_to_config_file[
            0:
            -1]  #the last element is the name of the file, I just want the path, config/your_experiment/config.xml

        #turn the list into a string by iterating and summing

        path_to_config = ''

        for i in path_to_config_list:
            path_to_config += i
            path_to_config += '/'

        #open files for writing

        output_file_raw = open(
            path_to_file + 'ensemble_silk_output_raw_n%d.txt' % N, 'w')

        #output_file = open('ensemble_duke_stacking_output_T2_n%d.txt' %N,'w')

        gold_standard_read = open(gold_standard_name, 'rU')

        #iterate for each tweaked configuration

        #read actual threshold

        tree = ET.parse(file_name)
        root = tree.getroot()

        for thresh in root.iter('Output'):
            central_thresh = float(thresh.attrib['minConfidence']
                                   )  #central value of the threshold

        #parsing the silk xml config file to find the name of the output file

        for k in root.iter('Output'):
            for b in k.iter('Param'):
                if b.attrib['name'] == 'file':
                    output_file_name = b.attrib['value']

        thresholds = np.linspace(central_thresh - a / 2,
                                 central_thresh + a / 2,
                                 N)  #list of thresholds

        for threshold in thresholds:

            for thresh in root.iter('Output'):
                thresh.attrib['minConfidence'] = str(threshold)
                print(thresh.attrib['minConfidence'])

            path_to_config_and_name = path_to_config + 'silk.xml'  #config/your_experiment/silk.xml

            tree.write(
                path_to_config_and_name)  #write the modified xml to file

            java_command = "java -Xmx5000m -DconfigFile=%s -Dthreads=4 -jar ../lib/Silk/silk.jar" % path_to_config_and_name

            os.system(java_command)

            silk_output_name = path_to_config + output_file_name  #config/your_experiment/links.nt

            #open output file

            silk_output = open(silk_output_name, 'rU')

            for i in silk_output.readlines():
                output_file_raw.write(i)

            silk_output.close()

            output_file_raw.write('End of run\n')

            print "End of run\n"

            os.system('rm %s' % path_to_config_and_name
                      )  #remove the new modified configuration file

        output_file_raw.close()

        #create the training set, named training_set_T1_n%d.csv

        crt_training = stacking_create_training_set.stacking_create_training_set(
            path_to_file + 'ensemble_silk_output_raw_n%d.txt' % N,
            path_to_file + 'training_set_silk_n%d.csv' % N, N)
        crt_training.stacking_create_training_set_silk(gold_standard_name)

        #read it and make machine learning on it

        data = pd.read_csv(path_to_file + 'training_set_silk_n%d.csv' % N)

        X = data.values[:, 2:(N + 2)]  #x variables
        y = np.array(data['y'])  #class variables

        #fit an SVM with rbf kernel
        clf = SVC(kernel='rbf', cache_size=1000)

        parameters = {
            'gamma': np.logspace(-9, 3, 30),
            'C': np.logspace(-2, 10, 30)
        }

        gs_rbf = grid_search.GridSearchCV(clf, param_grid=parameters, cv=4)
        gs_rbf.fit(X, y)

        clf = gs_rbf.best_estimator_

        joblib.dump(clf, 'svm_model_silk_N%d_a%f.pkl' % (N, a))

        print("--- %s seconds ---" % (time.time() - start_time))
Example #25
from starterPaulDuan import *

if __name__ == '__main__':
    #%% load data
    x_train, y_train, x_test, id_test = load_data()
    cols_drop = ['ROLE_CODE']
    x_train.drop(cols_drop, axis=1, inplace=True)
    x_test.drop(cols_drop, axis=1, inplace=True)
    x_trainb, x_testb = create_feat_ben(x_train, x_test)
    x_train = sparse.hstack((x_train, x_trainb.as_matrix())).toarray()
    x_test = sparse.hstack((x_test, x_testb.as_matrix())).tocsr()

    SEED = 0
    model_rf = ensemble.RandomForestClassifier(n_estimators=2000,
                                               max_features='sqrt',
                                               max_depth=None,
                                               min_samples_split=9,
                                               random_state=SEED,
                                               verbose=10,
                                               n_jobs=-1)
    params = {
        'n_estimators': [2500, 3000, 3500],
        'max_depth': [20],
        'min_samples_split': [3]
    }
    #    {'max_depth': 20, 'min_samples_split': 3, 'n_estimators': 2500}, 0.8910
    gridcv = grid_search.GridSearchCV(model_rf,
                                      params,
                                      scoring='roc_auc',
                                      cv=7)
    gridcv.fit(x_train, y_train)
Example #26
    def serialize_stem_duke(self):

        start_time = time.time()

        print('Starting the entity matching process')

        file_name = self.file_name  #define the variables
        gold_standard_name = self.gold_standard_name
        N = int(self.N)
        a = float(self.a)

        #open files for writing

        path_to_file = gold_standard_name

        path_to_file = path_to_file.split('/gs/')

        path_to_file = path_to_file[0] + '/'

        output_file_raw = open(
            path_to_file + 'ensemble_duke_output_raw_n%d.txt' % N, 'w')

        path_to_config_file = file_name.split('/')
        path_to_config_list = path_to_config_file[
            0:
            -1]  #the last element is the name of the file, I just want the path

        #turn the list into a string by iterating and summing

        path_to_config = ''

        for i in path_to_config_list:
            path_to_config += i
            path_to_config += '/'

        #output_file = open('ensemble_duke_stacking_output_T2_n%d.txt' %N,'w')

        gold_standard_read = open(gold_standard_name, 'rU')

        #iterate for each tweaked configuration

        #read actual threshold

        tree = ET.parse(file_name)
        root = tree.getroot()

        for thresh in root.iter('threshold'):
            central_thresh = float(
                thresh.text)  #central value of the threshold

        thresholds = np.linspace(central_thresh - a / 2,
                                 central_thresh + a / 2, N)

        for threshold in thresholds:

            for thresh in root.iter('threshold'):
                thresh.text = str(threshold)
                thresh.set('updated', 'yes')

            path_to_config_and_name = path_to_config + 'duke.xml'

            tree.write(path_to_config_and_name
                       )  #generate a new modified configuration file

            java_command = [
                "java", "-Xmx5000m", "-cp",
                "../lib/Duke/duke-core/target/*:../lib/Duke/duke-dist/target/*:../lib/Duke/duke-es/target/*:../lib/Duke/duke-json/target/*:../lib/Duke/duke-lucene/target/*:../lib/Duke/duke-mapdb/target/*:../lib/Duke/duke-mongodb/target/*:../lib/Duke/duke-server/target/*:../lib/Duke/lucene_jar/*",
                "no.priv.garshol.duke.Duke", "--showmatches",
                "--batchsize=100000", "--threads=4",
                "%s" % path_to_config_and_name
            ]

            output_file_raw.write(
                subprocess.check_output(java_command)
            )  #call duke on the copy.xml file and write the raw output on file

            output_file_raw.write('\n')
            output_file_raw.write('End of run\n')

            print('End of run\n')

            os.system('rm %s' % path_to_config_and_name
                      )  #remove the new modified configuration file

        output_file_raw.close()

        #create the training set, named training_set_T1_n%d.csv

        crt_training = stacking_create_training_set.stacking_create_training_set(
            path_to_file + 'ensemble_duke_output_raw_n%d.txt' % N,
            path_to_file + 'training_set_n%d.csv' % N, N)
        crt_training.stacking_create_training_set_duke(gold_standard_name)

        #stacking_create_training_set(path_to_file+'ensemble_duke_output_raw_n%d.txt' %N,path_to_file+'training_set_n%d.csv' %N, gold_standard_name, N)

        #read it and make machine learning on it

        data = pd.read_csv(path_to_file + 'training_set_n%d.csv' % N)

        X = data.values[:, 2:(N + 2)]  #x variables
        y = np.array(data['y'])  #class variables

        #fit an SVM with rbf kernel
        clf = SVC(kernel='rbf', cache_size=1000)
        #parameters = [{'kernel' : ['rbf'],'gamma' : np.logspace(-9,3,30),'C': np.logspace(-2,10,30)}, {'kernel' : ['linear'], 'C': np.logspace(-2,10,30)}]
        parameters = {
            'gamma': np.logspace(-9, 3, 30),
            'C': np.logspace(-2, 10, 30)
        }

        gs_rbf = grid_search.GridSearchCV(clf, param_grid=parameters, cv=4)
        gs_rbf.fit(X, y)

        clf = gs_rbf.best_estimator_

        project_name = path_to_config_list[-1]

        joblib.dump(
            clf,
            '../models/%s/svm_model_duke_N%d_a%.1f.pkl' % (project_name, N, a))

        print("--- %s seconds ---" % (time.time() - start_time))
Example #27
def build_model(featureRepresentation='image',
                dataset_file=None,
                iters=10,
                glcm_distance=1,
                glcm_isMultidirectional=False):
    '''
    Creates, trains and serialises an MLP classifier.

    Args:
        featureRepresentation: Type of features to be used in classification.
            Can take one of the values 'image', 'pca' or 'glcm'.

        dataset_file: filename of serialized data set upon which to build the
            MLP. If none, default dataset is used.

        iters: Number of training iterations.

        glcm_distance: Distance between pixels for co-occurrence. Only used if
            featureRepresentation=glcm.

        isMultidirectional: Controls whether co-occurrence should be calculated
            in other directions (i.e. 45 degrees, 90 degrees and 135 degrees).
            Only used if featureRepresentation=glcm.
    '''

    if (dataset_file == None):
        # Load train data
        train_filenames = []
        for filename in os.listdir("../train/positive"):
            if (filename != ".DS_Store"):
                train_filenames.append("../train/positive/" + filename)
        train_targets = [1] * (len(os.listdir("../train/positive")) - 1)

        for filename in os.listdir("../train/negative"):
            if (filename != ".DS_Store"):
                train_filenames.append("../train/negative/" + filename)
        train_targets = train_targets + [0] * (
            len(os.listdir("../train/negative")) - 1)

        n_train_samples = len(train_filenames)
        if (featureRepresentation == 'glcm'):
            if (glcm_isMultidirectional):
                sample_size = 16
            else:
                sample_size = 4
        else:
            sample_size = 20 * 20
        train_data = np.zeros((n_train_samples, sample_size))
        i = 0
        for filename in train_filenames:
            img = io.imread(filename)
            if (featureRepresentation == 'image'):
                train_data[i] = img.flatten()
            elif (featureRepresentation == 'pca'):
                train_data[i] = decomposition.PCA(
                    n_components=8).fit_transform(img.flatten())
            elif (featureRepresentation == 'glcm'):
                train_data[i] = Helper.get_textural_features(
                    img, glcm_distance, glcm_isMultidirectional)
            i = i + 1

        # Load test data
        test_filenames = []
        expected = []
        for filename in os.listdir("test"):
            if (filename != ".DS_Store"):
                test_filenames.append("../test/" + filename)
                expected.append(int(filename.split('_')[1].split('.')[0]))

        n_test_samples = len(test_filenames)
        test_data = np.zeros((n_test_samples, sample_size))
        i = 0
        for filename in test_filenames:
            img = io.imread(filename)
            if (featureRepresentation == 'image'):
                test_data[i] = img.flatten()
            elif (featureRepresentation == 'pca'):
                test_data[i] = decomposition.PCA(n_components=8).fit_transform(
                    img.flatten())
            elif (featureRepresentation == 'glcm'):
                test_data[i] = Helper.get_textural_features(
                    img, glcm_distance, glcm_isMultidirectional)
            i = i + 1
    else:
        train_data, train_targets, test_data, expected = Helper.unserialize(
            dataset_file)

    # Perform build iterations
    for i in tqdm.tqdm(range(0, iters)):
        # Build Classifier
        # current scikit-learn names this parameter 'solver', not 'algorithm',
        # and spells the value 'lbfgs'
        param_grid = {
            "solver": ["lbfgs", "sgd", "adam"],
            "activation": ["logistic", "relu", "tanh"],
            "hidden_layer_sizes": [(5, 2), (5,), (100,), (150,), (200,)]
        }
        classifier = grid_search.GridSearchCV(MLPClassifier(), param_grid)
        classifier.fit(train_data, train_targets)

        # Get previous classifier and assess
        serialized_classifier = Helper.unserialize(MLP_FILE)
        if (serialized_classifier):
            predictions = serialized_classifier.predict(test_data)
            confusion_matrix = metrics.confusion_matrix(expected, predictions)
            serialized_n_correct = confusion_matrix[0][0] + confusion_matrix[
                1][1]
            predictions = classifier.predict(test_data)
            confusion_matrix = metrics.confusion_matrix(expected, predictions)
            n_correct = confusion_matrix[0][0] + confusion_matrix[1][1]
            if (n_correct > serialized_n_correct):
                Helper.serialize(MLP_FILE, classifier)
        else:
            Helper.serialize(MLP_FILE, classifier)

    # Display final model performance
    serialized_classifier = Helper.unserialize(MLP_FILE)
    predictions = serialized_classifier.predict(test_data)
    confusion_matrix = metrics.confusion_matrix(expected, predictions)
    print("Confusion matrix:\n%s" % confusion_matrix)
    print("Accuracy: %f" % metrics.accuracy_score(expected, predictions))

    return serialized_classifier
Example #28
#        'random_state'      : [0],
#        'n_jobs'            : [4],
#        'min_samples_split' : [3],
#        'max_depth'         : [3]
#}

parameters = {
    'n_estimators': [100, 500, 1000, 1500],
    'learning_rate': [0.1, 0.05, 0.01, 0.005],
    'max_depth': [4, 6, 8, 10],
    'min_samples_leaf': [3, 5, 9, 17, 20],
    'max_features': [1.0, 0.3, 0.1]
}

clf_cv = grid_search.GridSearchCV(GradientBoostingRegressor(),
                                  parameters,
                                  cv=4,
                                  scoring='neg_mean_absolute_error')

clf_cv.fit(data_train_s, label_train_s)

print("Best Model Parameter: ", clf_cv.best_params_)
print("Best Model Score: ", clf_cv.best_score_)

print("# PREDICT..")
pre = clf_cv.predict(data_test_s)

#ac_score = metrics.accuracy_score(label_test_s, pre)
ac_score = metrics.mean_absolute_error(label_test_s, pre)
print("正解率=", ac_score)

result = clf_cv.predict(data_t)
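
The grid above enumerates 4 x 4 x 4 x 5 x 3 = 960 candidates, i.e. 3840 fits at cv=4. A randomized search over the same space caps that budget; a minimal sketch, assuming the modern sklearn.model_selection module:

from sklearn.model_selection import RandomizedSearchCV

# Sample 60 of the 960 parameter combinations uniformly at random.
clf_cv = RandomizedSearchCV(GradientBoostingRegressor(), parameters,
                            n_iter=60, cv=4,
                            scoring='neg_mean_absolute_error',
                            random_state=0)
clf_cv.fit(data_train_s, label_train_s)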
Example #29
                feat = []
                for i in range(0, len(parts) - 1):
                    feat.append(float(parts[i]))
                classes.append(str(int(parts[-1]) - 1))
                features.append(np.array(feat))

    fo.close()

    return (features, np.array(classes))


data_set_filepath = 'seeds_dataset.txt'
(features, classes) = read_features(data_set_filepath)
parameters = {'kernel': ['linear'], 'C': [1.4]}  # gamma dropped: it is ignored by the linear kernel
clf = svm.SVC()
clf = grid_search.GridSearchCV(clf, parameters, refit=True)
clf1 = clf.fit(features, classes)
print "\n\tMean Accuracy\tMean Error"
for n in range(3, 12):
    score = cross_validation.cross_val_score(clf,
                                             features,
                                             classes,
                                             cv=n,
                                             scoring='accuracy')
    print(str(n) + '\t' + str(score.mean() * 100) + '\t' + str(
        (1 - score.mean()) * 100))

random_state = np.random.RandomState(0)
features, classes = shuffle(features, classes, random_state=random_state)

half = int(len(features) / 2)
Example #30
def main():

    random.seed(240480)

    if use_preprocessed_data:
        print('load preprocessed data')
        df_train = pd.read_csv('data/train_processed.csv')
        df_test = pd.read_csv('data/test_processed.csv')
    else:
        df_train, df_test = load_data()

    print('configure data for training')
    id_test = df_test['id']
    y_train = df_train['relevance'].values
    X_train = df_train[:]
    X_test = df_test[:]

    print('construct model')

    # TF-IDF vectorize - converts docs to tf-idf feature matrix.
    tfidf = TfidfVectorizer(ngram_range=(1, 1), stop_words='english')

    # truncated singular value decomposition - dimensionality reduction.
    tsvd = TruncatedSVD(n_components=10, random_state=240480)

    # random forest
    rfr = RandomForestRegressor(n_estimators=500,
                                n_jobs=-1,
                                random_state=240480,
                                verbose=1)

    # TODO: get these features to include some cosine similarity measure between search term and other fields!
    # think we need to first fit tfidvectoriser to each of title, description, brand
    # and then insert into pipeline to generate 3x features of search term against the respective vocabs
    # potentially just include similarity scores as features.  or maybe RF will handle this on its own...

    # pipeline:
    # 1. build feature unions [cust_txt_col (to extract column) -> tfidf -> tsvd]
    # 2. pass to random forest.
    clf = Pipeline([('union',
                     FeatureUnion(transformer_list=[
                         ('cst', cust_regression_vals()),
                         ('txt1',
                          Pipeline([('s1', cust_txt_col(key='search_term')),
                                    ('tfidf1', tfidf), ('tsvd1', tsvd)])),
                         ('txt2',
                          Pipeline([('s2', cust_txt_col(key='product_title')),
                                    ('tfidf2', tfidf), ('tsvd2', tsvd)])),
                         ('txt3',
                          Pipeline([('s3',
                                     cust_txt_col(key='product_description')),
                                    ('tfidf3', tfidf), ('tsvd3', tsvd)])),
                         ('txt4',
                          Pipeline([('s4', cust_txt_col(key='brand')),
                                    ('tfidf4', tfidf), ('tsvd4', tsvd)]))
                     ],
                                  transformer_weights={
                                      'cst': 1.0,
                                      'txt1': 0.5,
                                      'txt2': 0.25,
                                      'txt3': 0.0,
                                      'txt4': 0.5
                                  },
                                  n_jobs=-1)), ('rfr', rfr)])

    print('run grid search')
    # TODO: search over relative weightings of transformer features?
    param_grid = {'rfr__max_features': [10], 'rfr__max_depth': [20]}
    RMSE = make_scorer(fmean_squared_error, greater_is_better=False)
    model = grid_search.GridSearchCV(estimator=clf,
                                     param_grid=param_grid,
                                     cv=2,
                                     scoring=RMSE)
    model.fit(X_train, y_train)

    print("Best parameters found by grid search:")
    print(model.best_params_)
    print("Best CV score:")
    print(model.best_score_)

    print('run predictions')
    y_pred = model.predict(X_test)

    print('save submission file')
    pd.DataFrame({
        "id": id_test,
        "relevance": y_pred
    }).to_csv('submission.csv', index=False)
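
Note: the rfr__max_features and rfr__max_depth keys address parameters of the rfr step inside the pipeline; GridSearchCV splits each key at the double underscore and routes the value through set_params. The same convention reaches nested steps, so the SVD dimensionality inside the union could be tuned too (a hypothetical extension, not in the source):

# step__substep__param addressing: union -> txt1 sub-pipeline -> tsvd1.
param_grid = {
    'rfr__max_features': [10],
    'rfr__max_depth': [20],
    'union__txt1__tsvd1__n_components': [10, 20],
}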