Example #1
# Assumed aliases (not shown in this snippet); Example #7 from the same
# project suggests the regressor variants:
from sklearn.ensemble import GradientBoostingRegressor as GBDT
from sklearn.linear_model import SGDRegressor as sgd
from sklearn.metrics import roc_auc_score as auc
from sklearn.preprocessing import OneHotEncoder as OHE


def gbdt_lr(para):
    print("gbdt_lr")
    x_train, x_train_lr, x_test, y_train, y_train_lr, y_test = para
    maxleafnodes = 11
    gbc = GBDT(max_leaf_nodes=maxleafnodes - 1,
               n_estimators=600,
               min_samples_leaf=5,
               max_depth=3,
               learning_rate=0.02,
               subsample=0.2,
               max_features=0.1)
    gbc.fit(x_train, y_train)
    # One-hot encode the leaf index each tree assigns to each sample,
    # turning the trained GBDT into a feature transformer.
    ohe = OHE()
    ohe.fit(gbc.apply(x_train))
    x_train_lr_gbc = ohe.transform(gbc.apply(x_train_lr))
    x_test_gbc = ohe.transform(gbc.apply(x_test))
    # Fit a linear model on the encoded leaves of the held-out half.
    lr = sgd(max_iter=50)  # n_iter=50 in scikit-learn < 0.19
    lr.fit(x_train_lr_gbc, y_train_lr)
    yp = lr.predict(x_test_gbc)
    print("GBDT+SGD: " + str(auc(y_test, yp)))
    return (gbc, yp)
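A hedged usage sketch (not part of the original): the six-element para tuple
follows the classic GBDT-plus-linear-model recipe, in which the trees and the
linear model are fit on disjoint halves of the training data. The synthetic
data and split sizes below are illustrative assumptions.

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Illustrative stand-in data; any binary-labelled feature matrix works.
X, y = make_classification(n_samples=3000, n_features=50, random_state=0)
x_rest, x_test, y_rest, y_test = train_test_split(X, y, test_size=0.2,
                                                  random_state=0)
# Disjoint halves: one grows the trees, the other fits the linear model,
# so the leaf features are not evaluated on data the ensemble memorized.
x_train, x_train_lr, y_train, y_train_lr = train_test_split(
    x_rest, y_rest, test_size=0.5, random_state=0)

gbc, yp = gbdt_lr((x_train, x_train_lr, x_test, y_train, y_train_lr, y_test))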
Example #2
def sgd_test(para):
    x_train, x_train_lr, x_test, y_train, y_train_lr, y_test = para
    # Baseline: SGD alone does not need the split that the GBDT+LR pipeline
    # in Example #1 requires, so recombine the two training halves.
    # (vstack is numpy.vstack for dense features, scipy.sparse.vstack for
    # sparse ones.)
    xt = vstack([x_train, x_train_lr])
    yt = merge_y(y_train, y_train_lr)
    clf1 = sgd(max_iter=20, eta0=1, alpha=3)  # n_iter=20 in older scikit-learn
    clf1.fit(xt, yt)
    yp_sgd = clf1.predict(x_test)
    print("SGD: " + str(auc(y_test, yp_sgd)))
Example #3
def _sgd(t, min_freq, save=False):
    if save:
        s = sgd()
        # Grid over SGDClassifier's losses and penalties
        # ('log' was renamed 'log_loss' in scikit-learn 1.1).
        parameters = {
            'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge',
                     'perceptron'],
            'penalty': ['l2', 'l1', 'none', 'elasticnet'],
            'alpha': [.00001]
        }
        clf = GridSearchCV(s, parameters, cv=5)
        clf.fit(records, labels)  # records and labels are module-level globals
        save_classifier(clf.best_estimator_, t, 'sgd', min_freq)
        return ('sgd', clf.best_estimator_)
    else:
        clf = load_classifier(t, 'sgd', min_freq)
        return ('sgd', clf)
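save_classifier and load_classifier are not shown; a hedged sketch of what
they presumably do, assuming joblib persistence and a hypothetical file-name
scheme built from the task name, model name, and frequency cutoff:

import joblib

def save_classifier(clf, t, name, min_freq):
    # Hypothetical path scheme; the real helpers are not in the snippet.
    joblib.dump(clf, "{}_{}_{}.joblib".format(t, name, min_freq))

def load_classifier(t, name, min_freq):
    return joblib.load("{}_{}_{}.joblib".format(t, name, min_freq))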
Example #4
def hyperopt_train_test(params):
    X_ = X[:]  # X and y are module-level globals
    # Disabled preprocessing switches: when re-enabled, the search space can
    # also decide whether to normalize and/or scale the features.
    """
    if 'normalize' in params:
        if params['normalize'] == 1:
            X_ = normalize(X_)
            del params['normalize']
        else:
            del params['normalize']
    if 'scale' in params:
        if params['scale'] == 1:
            X_ = scale(X_)
            del params['scale']
        else:
            del params['scale']
    """
    clf = sgd(**params)
    # Objective value: mean 5-fold cross-validated score.
    return cross_val_score(clf, X_, y, cv=5).mean()
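A hedged sketch (not in the original) of how this objective is typically
wired into hyperopt; the search space below is an illustrative assumption,
and hyperopt minimizes, so the score is negated:

from hyperopt import fmin, hp, tpe

space = {
    # 'log' is spelled 'log_loss' in scikit-learn >= 1.1
    'loss': hp.choice('loss', ['hinge', 'log', 'modified_huber']),
    'penalty': hp.choice('penalty', ['l1', 'l2', 'elasticnet']),
    'alpha': hp.loguniform('alpha', -11, -2),  # roughly 1.7e-5 .. 0.14
}

def objective(params):
    return -hyperopt_train_test(params)

best = fmin(objective, space, algo=tpe.suggest, max_evals=50)
print(best)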
Example #5
    print "\nBuilding a Modal"
    print "=" * 20
    if sys.argv[1] == 'logistic':
        modal = lr()
    
    elif sys.argv[1] == 'naivebayes':
        modal = gn()

    elif sys.argv[1] == 'randomforest':
        modal = rf()

    elif sys.argv[1] == 'voting':
        modal = vc(estimators = [
            ('lr', lr()), ('rf', rf()), ('gnb', gn()), 
            ('dt', tree.DecisionTreeClassifier()), ('sgd_log', sgd(loss="log")),
            ('sgd_hinge', sgd(loss='modified_huber'))
            ], voting='soft')
    else:
        print "\nUsage: python main.py <jaccard|cosine|tfidf|logistic|naivebayes|randomforest|voting>\n"
        sys.exit(1) 
 
    modal = modal.fit(df_train[df_train.columns[2:]], df_train['V'])

print "\nParsing test data"
print "=" * 20

test_data = similarity("./test.csv", False)
df_test = pd.DataFrame(test_data, columns=["id", "J", "C", "T"])

if mod == 0:
Example #6
converted = process_data.time_series(converted)

dataset_train = np.array(converted[:7000],
                         dtype='float64')  # 7000 samples for training
dataset_test = np.array(converted[7000:],
                        dtype='float64')  # 601 samples for testing

# Note: scale() standardizes each array with its own statistics; fitting one
# StandardScaler on the training split and applying it to both sets would be
# the cleaner choice.
dataset_train_x = preprocessing.scale(dataset_train[:, :-1])
dataset_train_y = dataset_train[:, -1]
dataset_test_x = preprocessing.scale(dataset_test[:, :-1])
dataset_test_y = dataset_test[:, -1]

print('------ Begin experiments...')

# Linear regression via SGD (sgd is presumably SGDRegressor here, given the
# regression loss and the R^2-style .score calls below)
clf = sgd(loss='huber', penalty='l1', max_iter=8)
clf.fit(dataset_train_x, dataset_train_y)
print("Linear Regression (SGD) Training Score: {}".format(
    round(clf.score(dataset_train_x, dataset_train_y), 2)))
print("Linear Regression (SGD) Testing Score: {}".format(
    round(clf.score(dataset_test_x, dataset_test_y), 2)))

# Bagging with decision trees (the default base estimator is a full tree,
# not a decision stump)
clf = bagging(n_estimators=200, oob_score=True)
clf.fit(dataset_train_x, dataset_train_y)
print("Bagging Training Score: {}".format(
    round(clf.score(dataset_train_x, dataset_train_y), 2)))
print("Bagging Testing Score: {}".format(
    round(clf.score(dataset_test_x, dataset_test_y), 2)))

# Adaboost
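The snippet cuts off at the AdaBoost step; a minimal sketch of the presumable
continuation, assuming AdaBoostRegressor with the same reporting format:

from sklearn.ensemble import AdaBoostRegressor

clf = AdaBoostRegressor(n_estimators=200)
clf.fit(dataset_train_x, dataset_train_y)
print("AdaBoost Training Score: {}".format(
    round(clf.score(dataset_train_x, dataset_train_y), 2)))
print("AdaBoost Testing Score: {}".format(
    round(clf.score(dataset_test_x, dataset_test_y), 2)))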
Example #7
        for appid in appInstall:
            xsparse[i, apps_index[appid]] = 1
    return (xsparse, y)


if __name__ == '__main__':
    import sys
    sys.path.append('/data/liangyiting/gbdt')

    import numpy as np
    from sklearn.metrics import roc_auc_score as auc
    from sklearn.linear_model import SGDRegressor as sgd
    from sklearn.ensemble import GradientBoostingRegressor as gbdt

    pathIn = '/data/liangyiting/gbdt/log'
    x, y = f_tomat(pathIn)

    # Shuffle the samples.
    i = np.argsort(np.random.uniform(0, 1, len(y)))
    x = x[i]
    y = y[i]

    # Split into training and test sets (60/40).
    k = int(0.6 * len(y))
    xt, yt = x[:k], y[:k]
    xv, yv = x[k:], y[k:]

    clf = gbdt(subsample=0.2,
               max_features=0.05,
               min_samples_leaf=5,
               # max_leaf_nodes=30,
               n_estimators=200,
               learning_rate=0.03)

    clf.fit(xt, yt)
    yp_gbdt = clf.predict(xv.toarray())
    print(auc(yv, yp_gbdt))

    clf1 = sgd()
    clf1.fit(xt, yt)
    yp_sgd = clf1.predict(xv)
    print(auc(yv, yp_sgd))

    print(auc(yv, yp_gbdt + yp_sgd))
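A note not in the original: ROC AUC depends only on how the final scores rank
the samples, so the raw sum of the two prediction vectors works as a quick
unweighted blend. Because the two models' outputs live on different scales,
a tunable weight usually does better; a hypothetical weighted variant, with w
chosen on a validation split:

    # Hypothetical weighted blend of the GBDT and SGD predictions.
    w = 0.7
    print(auc(yv, w * yp_gbdt + (1 - w) * yp_sgd))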
Example #8
    # Accumulators for the raw text and class labels (assumed not
    # initialized earlier; the snippet starts mid-function).
    train_text, labels = [], []
    for data in train_data:
        train_text.append(data[1])
        labels.append(data[2])

    # Unigram bag-of-words features.
    vectorizer = cv(encoding='utf-8',
                    strip_accents='unicode',
                    ngram_range=(1, 1),
                    decode_error='replace')
    vector_data = vectorizer.fit_transform(train_text)

    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        vector_data, labels, stratify=labels, test_size=0.2)

    classifier = sgd(loss='hinge', penalty='l1')
    classifier.fit(X_train, y_train)

    # .score returns a single accuracy value, not per-fold scores.
    print('Unigram Results')
    print("Train accuracy: %0.2f" % classifier.score(X_train, y_train))
    print("Test accuracy: %0.2f" % classifier.score(X_test, y_test))
    print('')
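To report accuracy with a spread, per-fold scores are needed, since
classifier.score returns only one number. A hedged sketch using
cross_val_score on the same vectorized data:

from sklearn.model_selection import cross_val_score

fold_scores = cross_val_score(sgd(loss='hinge', penalty='l1'),
                              vector_data, labels, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" %
      (fold_scores.mean(), fold_scores.std() * 2))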
Example #9
    'min_samples_leaf': [1, 2, 3]
}
rfc_model = rfc()
use_GridSearch(rfc_model, rfc_parameters, train_centroids)

# Logistic Regression
lr_parameters = {
    'C': [0.005, 0.01, 0.05],
    'max_iter': [4, 5, 6],
    'fit_intercept': [True]
}
lr_model = lr()
use_GridSearch(lr_model, lr_parameters, train_centroids1)

# Stochastic Gradient Descent Classifier
# ('log' loss was renamed 'log_loss', and penalty 'none' became None,
# in newer scikit-learn)
sgd_parameters = {'loss': ['log'], 'penalty': ['l1', 'l2', 'none']}
sgd_model = sgd()
use_GridSearch(sgd_model, sgd_parameters, train_centroids1)


# Double-check the quality of the classifiers with cross validation,
# then train them.
def use_model(model, x_values):
    '''
    Test the quality of a model using cross validation,
    then train the model on x_values.
    '''
    scores = cross_val_score(model,
                             x_values,
                             train.sentiment,
                             cv=5,
                             scoring='roc_auc')
    model.fit(x_values, train.sentiment)
    mean_score = round(np.mean(scores) * 100, 2)
    print(scores)
    print()
    print("Mean score = {}".format(mean_score))