def gbdt_lr(para):
    """Train the classic GBDT+LR cascade and report its test AUC.

    A gradient-boosted model is fit on the first training split; the leaf
    index each tree assigns to a sample is one-hot encoded and used as the
    feature vector for a downstream SGD linear model fit on the second
    split.

    Parameters
    ----------
    para : sequence
        At least six items, in order:
        (x_train, x_train_lr, x_test, y_train, y_train_lr, y_test).
        ``x_train`` fits the GBDT, ``x_train_lr`` fits the SGD model,
        ``x_test``/``y_test`` are held out for evaluation.

    Returns
    -------
    tuple
        (fitted GBDT model, SGD predictions on ``x_test``).
    """
    print("gbdt_lr")
    # Unpack once instead of six separate para[i] lookups; slice first so
    # callers may pass a longer sequence without breaking the unpack.
    x_train, x_train_lr, x_test, y_train, y_train_lr, y_test = para[:6]

    max_leaf_nodes = 11
    gbc = GBDT(max_leaf_nodes=max_leaf_nodes - 1,
               n_estimators=600,
               min_samples_leaf=5,
               max_depth=3,
               learning_rate=0.02,
               subsample=0.2,
               max_features=0.1)
    gbc.fit(x_train, y_train)

    # One-hot encode per-tree leaf indices; the encoder must be fit on the
    # same data the GBDT was trained on so every leaf id is known.
    ohe = OHE()
    ohe.fit(gbc.apply(x_train))
    x_train_lr_gbc = ohe.transform(gbc.apply(x_train_lr))
    x_test_gbc = ohe.transform(gbc.apply(x_test))

    # Linear model over the encoded leaf features.
    lr = sgd(n_iter=50)
    lr.fit(x_train_lr_gbc, y_train_lr)
    yp = lr.predict(x_test_gbc)
    print("GBDT+SGD: " + str(auc(y_test, yp)))
    return (gbc, yp)
def sgd_test(para):
    """Fit a plain SGD model on the recombined training splits and print AUC.

    ``para`` holds (x_train, x_train_lr, x_test, y_train, y_train_lr,
    y_test); the two training partitions are stacked back into one set
    before fitting, and the model is scored on the held-out test split.
    """
    features_a, features_b, features_test = para[0], para[1], para[2]
    targets_a, targets_b, targets_test = para[3], para[4], para[5]

    # Recombine the two training partitions into a single training set.
    combined_x = vstack([features_a, features_b])
    combined_y = merge_y(targets_a, targets_b)

    model = sgd(n_iter=20, eta0=1, alpha=3)
    model.fit(combined_x, combined_y)
    predictions = model.predict(features_test)
    print("SGD: " + str(auc(targets_test, predictions)))
def _sgd(t, min_freq, save=False):
    """Return ``('sgd', estimator)`` — freshly tuned or loaded from disk.

    When ``save`` is true, grid-search an SGD classifier (5-fold CV) over
    the module-level ``records``/``labels``, persist the best estimator via
    ``save_classifier`` and return it.  Otherwise load the classifier
    previously saved for ``(t, min_freq)``.
    """
    if not save:
        # Reuse the previously persisted model.
        return ('sgd', load_classifier(t, 'sgd', min_freq))

    search_space = {
        'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge',
                 'perceptron'],
        'penalty': ['l2', 'l1', 'none', 'elasticnet'],
        'alpha': [.00001],
    }
    search = GridSearchCV(sgd(), search_space, cv=5)
    search.fit(records, labels)

    best = search.best_estimator_
    save_classifier(best, t, 'sgd', min_freq)
    return ('sgd', best)
def hyperopt_train_test(params):
    """Score an SGD classifier configured by ``params`` with 5-fold CV.

    Objective function for a hyperopt search: builds ``sgd(**params)``,
    cross-validates it on the module-level ``X``/``y`` and returns the mean
    score.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded verbatim to the ``sgd`` constructor.

    Returns
    -------
    float
        Mean 5-fold cross-validation score.
    """
    # Shallow copy so the evaluation never mutates the shared dataset.
    X_ = X[:]
    # NOTE(review): a dead commented-out block that pre-processed
    # 'normalize'/'scale' pseudo-parameters was removed; reinstate real
    # handling if those keys are ever added back to the search space.
    clf = sgd(**params)
    return cross_val_score(clf, X_, y, cv=5).mean()
# Python 2 driver section: choose a classifier from the CLI argument,
# train it on df_train, then parse the test data.
print "\nBuilding a Modal"
print "=" * 20
# Map the first CLI argument to a model constructor.
if sys.argv[1] == 'logistic':
    modal = lr()
elif sys.argv[1] == 'naivebayes':
    modal = gn()
elif sys.argv[1] == 'randomforest':
    modal = rf()
elif sys.argv[1] == 'voting':
    # Soft-voting ensemble; both SGD members use probability-capable
    # losses ('log' / 'modified_huber') so soft voting can work.
    modal = vc(estimators = [
        ('lr', lr()),
        ('rf', rf()),
        ('gnb', gn()),
        ('dt', tree.DecisionTreeClassifier()),
        ('sgd_log', sgd(loss="log")),
        ('sgd_hinge', sgd(loss='modified_huber'))
    ], voting='soft')
else:
    print "\nUsage: python main.py <jaccard|cosine|tfidf|logistic|naivebayes|randomforest|voting>\n"
    sys.exit(1)
# Train on every feature column after the first two; 'V' is the target.
modal = modal.fit(df_train[df_train.columns[2:]], df_train['V'])
print "\nParsing test data"
print "=" * 20
test_data = similarity("./test.csv", False)
df_test = pd.DataFrame(test_data, columns=["id", "J", "C", "T"])
# NOTE(review): the body of this branch continues beyond this chunk.
if mod == 0:
# Build a supervised time-series matrix from `converted` (defined earlier),
# then split it into fixed-size train/test partitions.
converted = process_data.time_series(converted)
dataset_train = np.array(converted[:7000], dtype='float64')  # first 7000 samples for training
dataset_test = np.array(converted[7000:], dtype='float64')  # remaining (~601) samples for testing
# Last column is the target; features are standardized per split.
# NOTE(review): scaling train and test independently leaks split-specific
# statistics — confirm this is intended.
dataset_train_x = preprocessing.scale(dataset_train[:, :-1])
dataset_train_y = dataset_train[:, -1]
dataset_test_x = preprocessing.scale(dataset_test[:, :-1])
dataset_test_y = dataset_test[:, -1]
print('------ Begin experiments...')
# Linear regression trained by SGD (robust Huber loss, L1 penalty).
clf = sgd(loss='huber', penalty='l1', max_iter=8)
clf.fit(dataset_train_x, dataset_train_y)
print("Linear Regression (SGD) Training Score: {}".format(
    round(clf.score(dataset_train_x, dataset_train_y), 2)))
print("Linear Regression (SGD) Testing Score: {}".format(
    round(clf.score(dataset_test_x, dataset_test_y), 2)))
# Bagging ensemble (default base estimator) with out-of-bag scoring.
clf = bagging(n_estimators=200, oob_score=True)
clf.fit(dataset_train_x, dataset_train_y)
print("Bagging Training Score: {}".format(
    round(clf.score(dataset_train_x, dataset_train_y), 2)))
print("Bagging Testing Score: {}".format(
    round(clf.score(dataset_test_x, dataset_test_y), 2)))
# Adaboost
# Tail of a feature-matrix builder whose definition starts before this
# chunk: set the column of every installed app to 1 for the current row.
    for appid in appInstall:
        xsparse[i, apps_index[appid]] = 1
    return (xsparse, y)

if __name__ == '__main__':
    import sys
    sys.path.append('/data/liangyiting/gbdt')
    pathIn = '/data/liangyiting/gbdt/log'
    # Build the (sparse) feature matrix and targets from the log directory.
    x, y = f_tomat(pathIn)
    # Shuffle the samples with a random permutation.
    li = np.random.uniform(0, 1, len(y)); i = np.argsort(li); x = x[i]; y = y[i];
    # 60/40 split into training and validation sets.
    k = int(0.6 * len(y)); xt = x[:k]; yt = y[:k]; xv = x[k:]; yv = y[k:]
    import pdb
    # pdb.set_trace()
    from sklearn.metrics import roc_auc_score as auc
    from sklearn.linear_model import SGDRegressor as sgd
    from sklearn.ensemble import GradientBoostingRegressor as gbdt
    # Gradient-boosted regressor; hyperparameters set attribute-by-attribute.
    clf = gbdt();
    clf.subsample = 0.2; clf.max_features = 0.05; clf.min_samples_leaf = 5;
    # clf.max_leaf_nodes=30;
    clf.n_estimators = 200;
    clf.learning_rate = 0.03;
    # GBDT needs a dense array for prediction here (xv is sparse).
    clf.fit(xt, yt); yp_gbdt = clf.predict(xv.toarray()); print(auc(yv, yp_gbdt))
    # Plain SGD regressor baseline on the same split.
    clf1 = sgd()
    clf1.fit(xt, yt); yp_sgd = clf1.predict(xv); print(auc(yv, yp_sgd))
    # Simple score-sum blend of the two models.
    print(auc(yv, yp_gbdt + yp_sgd))
n = 0
# Collect raw text and labels from the training records.
for data in train_data:
    train_text.append(data[1])
    labels.append(data[2])

# Unigram bag-of-words features.
vectorizer = cv(encoding='utf-8', strip_accents='unicode',
                ngram_range=(1, 1), decode_error='replace')
vector_data = vectorizer.fit_transform(train_text)

# Stratified 80/20 train/test split.
model_selector = model_selection
X_train, X_test, y_train, y_test = model_selector.train_test_split(
    vector_data, labels, stratify=labels, test_size=0.2)

classifier = sgd(loss='hinge', penalty='l1')
classifier.fit(X_train, y_train)

# BUG FIX: classifier.score() returns a single scalar accuracy, not a
# per-fold array, so the original `scores.mean()` / `scores.std() * 2`
# printed a meaningless "(+/- 0.00)" (and would raise AttributeError on a
# plain float). Report the scalar directly; the ± pattern belongs to
# cross_val_score results.
train_scores = classifier.score(X_train, y_train)
print('Unigram Results')
print('Train Scores')
print(train_scores)
print("Accuracy: %0.2f" % train_scores)
test_scores = classifier.score(X_test, y_test)
print('Test Scores')
print(test_scores)
print("Accuracy: %0.2f" % test_scores)
print('')
# Tail of an `rfc_parameters` grid whose opening brace is before this chunk.
    'min_samples_leaf': [1, 2, 3]
}
rfc_model = rfc()
use_GridSearch(rfc_model, rfc_parameters, train_centroids)

# Logistic Regression
lr_parameters = {
    'C': [0.005, 0.01, 0.05],
    'max_iter': [4, 5, 6],
    'fit_intercept': [True]
}
lr_model = lr()
use_GridSearch(lr_model, lr_parameters, train_centroids1)

# Stochastic-gradient-descent classifier grid ('log' loss gives
# probability outputs needed for ROC-AUC scoring).
sgd_parameters = {'loss': ['log'], 'penalty': ['l1', 'l2', 'none']}
sgd_model = sgd()
use_GridSearch(sgd_model, sgd_parameters, train_centroids1)


def use_model(model, x_values):
    """Cross-validate `model` (5-fold ROC-AUC), then fit it on all data.

    Prints the per-fold scores and the mean score as a percentage.
    """
    scores = cross_val_score(model, x_values, train.sentiment, cv=5,
                             scoring='roc_auc')
    model.fit(x_values, train.sentiment)
    mean_score = round(np.mean(scores) * 100, 2)
    print(scores)
    print()
    print("Mean score = {}".format(mean_score))
# Logistic-regression hyperparameter grid.
lr_parameters = {'C': [0.005, 0.01, 0.05],
                 'max_iter': [4, 5, 6],
                 'fit_intercept': [True]}
lr_model = lr()
use_GridSearch(lr_model, lr_parameters, train_centroids)


# In[49]:


# Stochastic Gradient Descent Classifier
sgd_parameters = {'loss': ['log'], 'penalty': ['l1', 'l2', 'none']}
sgd_model = sgd()
use_GridSearch(sgd_model, sgd_parameters, train_centroids)


# Let's double check the quality of the classifiers with cross validation, then train them.

# In[51]:


def use_model(model, x_values):
    '''
    Test the quality of a model using cross validation
    Train the model with the x_values
    '''
    # 5-fold ROC-AUC cross-validation, then a final fit on all the data.
    # NOTE(review): this function may continue beyond this chunk.
    scores = cross_val_score(model, x_values, train.sentiment, cv=5,
                             scoring='roc_auc')
    model.fit(x_values, train.sentiment)