Esempio n. 1
0
def initialize_models(X_train, y_train, X_test, y_test, accuracy, fscore):
    """Benchmark three learners on 1%, 10% and 100% of the training data.

    The learners are built from the module-level aliases ``dtc``, ``rfc`` and
    ``abc``, each with ``random_state=13``. Every learner is passed to
    ``train_predict`` once per sample size, and the collected results are
    handed to ``vs.evaluate`` together with ``accuracy`` and ``fscore``.

    Returns:
        The third learner (the ``abc`` instance).
    """
    learners = [dtc(random_state=13), rfc(random_state=13), abc(random_state=13)]

    # Floor division keeps the 1% and 10% sample counts as ints.
    total = len(y_train)
    sample_sizes = (total // 100, total // 10, total)

    # results[<class name>][<index of sample size>] -> output of train_predict
    results = {}
    for learner in learners:
        results[type(learner).__name__] = {
            idx: train_predict(learner, size, X_train, y_train, X_test, y_test)
            for idx, size in enumerate(sample_sizes)
        }

    # Visualise the metrics for the three learners.
    vs.evaluate(results, accuracy, fscore)
    return learners[-1]
 def __init__(self, pathToData):
     """Record the data location and configure the boosting classifier.

     Args:
         pathToData: stored unchanged on ``self.dataFilePath``.
     """
     self.dataFilePath = pathToData
     self.algoname, self.datasetName = 'Boosting', 'Letter'

     # Weak learner with balanced class weights, wrapped by the booster
     # running the SAMME algorithm.
     weak_learner = dtc(class_weight='balanced')
     self.baseEstimater = weak_learner
     self.classifier = abc(algorithm='SAMME', base_estimator=weak_learner)

     # presumably the number of cross-validation folds -- TODO confirm
     self.cv = 5
def train(df):
    '''Fit seven classifiers with fixed settings and return their test scores.

    The data is prepared by ``preprocess_data`` (feature creation, split,
    scaling). The models, in the order their scores are returned, are:

    1. SVC, linear kernel
    2. SVC, polynomial kernel
    3. SVC, radial basis function kernel
    4. SVC, sigmoid kernel
    5. ``rfc`` with ``max_depth=4, random_state=0``
    6. ``abc`` with ``n_estimators=500``
    7. a hard-voting ``VotingClassifier`` over the RBF SVC and the ``rfc`` model

    Returns a 7-tuple holding each model's ``score(X_test, y_test)``.
    '''
    X, y = preprocess_data.addFeatures(df)
    X_train, X_test, y_train, y_test = preprocess_data.splitDataset(X, y)
    X_train, X_test = preprocess_data.featureScaling(X_train, X_test)

    def fit_and_score(estimator):
        # Train on the train split, evaluate on the held-out split.
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    svc_by_kernel = {}
    svc_scores = []
    for kernel in ('linear', 'poly', 'rbf', 'sigmoid'):
        svc_by_kernel[kernel] = svm.SVC(kernel=kernel)
        svc_scores.append(fit_and_score(svc_by_kernel[kernel]))

    forest = rfc(max_depth=4, random_state=0)
    forest_score = fit_and_score(forest)

    boost_score = fit_and_score(abc(n_estimators=500))

    # The ensemble is built from the same RBF SVC and forest objects as above.
    voter = VotingClassifier(
        estimators=[('svc', svc_by_kernel['rbf']), ('rf', forest)],
        voting='hard')
    voter_score = fit_and_score(voter)

    return tuple(svc_scores) + (forest_score, boost_score, voter_score)
Esempio n. 4
0
def AdaBoost():
	"""Fit a 300-estimator ``abc`` model, dump its predictions and print its score.

	Data comes from ``load_data`` with a fixed feature list. One line per test
	row is written to ``adaboost_predict.txt``: the first two fields of the
	matching ``testList`` entry followed by the predicted label.
	"""
	feature_names = [
		'common_neigh', 'check_common_time_spot', 'common_crt_time_spot',
		'dist_common_spot', 'shortest_path', 'katzB', 'adamic_adar',
		'mean_distance',
	]
	trainX, trainY, testX, testY = load_data(feature_names)
	print('load data completely')

	booster = abc(n_estimators=300)
	booster.fit(trainX, trainY)
	print('AdaBooting completely')
	print(booster.feature_importances_)

	testDict, testList = test_index(testX)
	n_rows = len(testDict)
	predictY = booster.predict(testX)
	with open('adaboost_predict.txt', 'w') as out:
		for row in range(n_rows):
			entry = testList[row]
			print(entry[0], entry[1], predictY[row], file=out)

	scores = booster.score(testX, testY)
	print('predict testing data completely')
	print('Accuracy in sample =', scores)
0
def train_abc(exp_depth, exp_lr):
    """Train a boosted decision tree and return its validation error rate.

    Features and labels are read from ``train_vars.npy``, ``val_vars.npy``,
    ``train_labels.npy`` and ``val_labels.npy`` in the current working
    directory; the label arrays are flattened with ``ravel()``.

    Args:
        exp_depth: ``max_depth`` of the ``DecisionTreeClassifier`` base estimator.
        exp_lr: ``learning_rate`` passed to ``abc``.

    Returns:
        float: ``1 - accuracy`` on the validation set.
    """
    train = np.load('train_vars.npy')
    val = np.load('val_vars.npy')
    train_labels = np.load('train_labels.npy').ravel()
    val_labels = np.load('val_labels.npy').ravel()
    val_size = len(val_labels)
    abc_model = abc(base_estimator=DecisionTreeClassifier(max_depth=exp_depth),
                    learning_rate=exp_lr)

    abc_model.fit(train, train_labels)
    predictions = abc_model.predict(val)

    correct = np.sum(np.equal(predictions, val_labels))
    accuracy = correct / float(val_size)
    # Cast to a plain Python float so the caller does not receive a NumPy scalar.
    result = float(1 - accuracy)

    # A parenthesised single argument prints the same text on Python 2 and 3;
    # the bare `print '...'` statement is a SyntaxError on Python 3.
    print('Result = %f' % result)
    return result
Esempio n. 6
0
def model_tunings_abc(X_train, y_train, X_test, y_test):
    """Grid-search an ``abc`` booster over ``rfc`` and compare it with the untuned one.

    The search covers ``learning_rate`` and ``n_estimators`` and is scored with
    an F-beta scorer (beta=0.5). Accuracy and F-score on the test data are
    printed for both the untuned and the tuned model.

    Args:
        X_train, y_train: training data; ``y_train`` must provide ``.ravel()``.
        X_test, y_test: data used for the printed before/after comparison.

    Returns:
        tuple: ``(best_clf, best_predictions)``, the best estimator found by
        the search and its predictions on ``X_test``.
    """
    untuned = abc(base_estimator=rfc(), random_state=13)

    search_space = {
        'learning_rate': [0.02, 0.04, 0.2],
        'n_estimators': [75, 100, 150],
    }
    f_half_scorer = make_scorer(fbeta_score, beta=0.5)
    train_labels = y_train.ravel()

    # Run the search first, then fit the untuned model for the baseline.
    search = GridSearchCV(untuned, search_space, scoring=f_half_scorer)
    best_clf = search.fit(X_train, train_labels).best_estimator_

    predictions = untuned.fit(X_train, train_labels).predict(X_test)
    best_predictions = best_clf.predict(X_test)

    def _report(header, accuracy_line, fscore_line, y_pred):
        # Metrics are computed at print time, one line after the other.
        print(header)
        print(accuracy_line.format(accuracy_score(y_test, y_pred)))
        print(fscore_line.format(fbeta_score(y_test, y_pred, beta=0.5)))

    _report("Unoptimized model\n------",
            "Accuracy score on testing data: {0}",
            "F-score on testing data: {0}",
            predictions)
    _report("\nOptimized Model\n------",
            "Final accuracy score on the testing data: {0}",
            "Final F-score on the testing data: {0}",
            best_predictions)

    return best_clf, best_predictions
Esempio n. 7
0
# In[ ]:

# Use logistic regression as the algorithm
lr = LogisticRegression(C=1000)

# Start training with fit(), then predict on the test features
lr.fit(X_train, y_train)
y_test_pred = lr.predict(X_test)
# Bare expression: shows the predictions as the notebook cell output
y_test_pred

# In[ ]:

# Try out AdaBoost (default hyperparameters)
from sklearn.ensemble import AdaBoostClassifier as abc
bdt = abc()

bdt.fit(X_train, y_train)
y_test_ada = bdt.predict(X_test)
# Bare expression: shows the predictions as the notebook cell output
y_test_ada

# In[ ]:

# Next, a decision tree
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
y_test_dtc = clf.predict(X_test)
# Bare expression: shows the predictions as the notebook cell output
y_test_dtc

# In[ ]:
Esempio n. 8
0
             Scale_NumCols(['Age', 'SibSp', 'Parch', 'Fare'], take_log=True))
# Preprocessing pipeline: NA handling, categorical encoding (dropping 'Name'
# and 'Ticket'), then the `scale_num` step defined just above.
pipeline = Pipeline([('deal_na', Deal_NAs()),
                     ('encode_cat', Encode_CatCols(drop=['Name', 'Ticket'])),
                     scale_num])
#X_prepared = pipeline.fit_transform(X_)
# Fit the pipeline on the training split only; the validation split is
# transformed with the already-fitted pipeline.
X_train_p = pipeline.fit_transform(X_train)
X_vali_p = pipeline.transform(X_vali)

from sklearn.linear_model import LogisticRegression as lr
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.ensemble import AdaBoostClassifier as abc
from sklearn.ensemble import GradientBoostingClassifier as gbc
# NOTE: each assignment below overwrites the previous one, so only the last
# uncommented line (the gradient-boosting model) is the one that gets fitted.
model = lr(C=1)
model = dtc(min_samples_split=10, max_features=5)
model = abc(dtc(max_depth=4), n_estimators=100)
model = gbc(n_estimators=200)
#model = rfc(n_estimators=200 ,min_samples_split = 5)
model.fit(X_train_p, Y_train)
# print(model.score(X_train_p, Y_train))
# print(model.score(X_vali_p, Y_vali))
# coef_df = pd.DataFrame({'name':X_train_p.columns.tolist(), 'coef':model.coef_[0]})
# coef_df.sort_values('coef', ascending = False)

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# Evaluate on the validation split and print the per-class report.
Y_pred = model.predict(X_vali_p)
print(classification_report(Y_vali, Y_pred))

# NOTE(review): `submit` is not defined in the visible part of this script.
print(submit.head())
from cross_validation import cross_validation as CV
import matplotlib.pyplot as plt
from feature_selection import feature_selection

# Load the feature matrices and binary label vectors from text files.
x_train = np.loadtxt('../Data/x_train.txt')
y_train_binary = np.loadtxt('../Data/y_train_binary.txt')
x_test = np.loadtxt('../Data/x_test.txt')
y_test_binary = np.loadtxt('../Data/y_test_binary.txt')
x_orig_train = np.loadtxt('../Data/x_orig_train.txt')
y_orig_train_binary = np.loadtxt('../Data/y_orig_train_binary.txt')
x_final_test = np.loadtxt('../Data/x_final_test.txt')
y_final_test_binary = np.loadtxt('../Data/y_final_test_binary.txt')

# Build the classifier with default hyperparameters.
clf = abc()

# Feature-selection helper. Both calls that would use it are commented out,
# so the classifier below is trained on the full feature set.
fs = feature_selection()
#clf,x_train,x_test,x_final_test,y_out = fs.PCASelection(x_train,y_train_binary,x_test,y_test_binary,x_final_test,clf)
#clf,x_train,x_test,x_final_test,y_out = fs.KBest(x_train,y_train_binary,x_test,y_test_binary,x_final_test,clf)
clf.fit (x_train,y_train_binary)
y_out = clf.predict(x_test)

# Print the scores (Python 2 print statements).
score = clf.score(x_test,y_test_binary)
print "Score : ", score
# NOTE(review): `prfs` is not imported in the visible part; presumably
# sklearn.metrics.precision_recall_fscore_support -- confirm.
print "Precision recall f-score support : " , prfs(y_test_binary,y_out)


#Cross validation
Esempio n. 10
0
def experiment(model, textmodel):
    """Train a classifier per trading-day window and predict each test key two ways.

    For stock 1101 and each window in ``[30, 60]`` the data is fetched with
    ``feature_label(stockno, days, textmodel)``. From the way it is used here:
    ``data[0]``/``data[1]`` are the training features/labels, ``data[2]`` maps
    a key to a sequence of feature vectors and ``data[3]`` maps the same key
    to its label.

    Two predictions are produced for every key whose vector sequence is
    non-empty:

    * vote: the classifier output is summed over every vector; a positive
      sum yields ``1``, otherwise ``-1``.
    * KMeans: the vectors are reduced to one cluster centre, which is
      classified; a positive output yields ``1``, otherwise ``-1``.

    Args:
        model: ``'ns'``, ``'abc'`` or ``'lr'``; selects the estimator.
        textmodel: forwarded unchanged to ``feature_label``.

    Returns:
        tuple: ``(all_labels, all_predict)``, two lists with one dict per
        (stock, window) pair. ``all_labels`` maps key -> ``data[3][key]`` and
        ``all_predict`` maps key -> ``[vote_prediction, kmeans_prediction]``.

    Raises:
        ValueError: if ``model`` is not one of the supported names. Previously
            an unknown name left the classifier unbound and only failed later
            with an UnboundLocalError, or not at all when there was nothing
            to predict.
    """
    # Constructors are wrapped in lambdas so that only the selected
    # estimator's alias has to exist at call time.
    builders = {
        'ns': lambda: ns(kernel='linear'),
        'abc': lambda: abc(learning_rate=1, n_estimators=100),
        'lr': lambda: lr(),
    }
    if model not in builders:
        raise ValueError("unknown model {!r}; expected one of {}".format(
            model, sorted(builders)))

    trade_days = [30, 60]
    nos = [1101]
    all_predict = []
    all_labels = []

    for stockno in nos:
        for days in trade_days:
            data = feature_label(stockno, days, textmodel)

            # NOTE(review): the test vectors are scaled with a scaler fitted
            # on the test vectors themselves, not with the training scaler --
            # confirm this is intended.
            feature = np.array(
                [vec for vecs in data[2].values() for vec in vecs])
            test_scaler = StandardScaler().fit(feature)

            x_train = np.array(data[0])
            y_train = np.array(data[1])
            x_train = StandardScaler().fit(x_train).transform(x_train)

            clf = builders[model]().fit(x_train, y_train)

            y_test = {}
            predict = {}
            for key, vecs in data[2].items():
                if len(vecs) == 0:
                    # Nothing to classify for this key.
                    continue

                # Vote: sum the per-vector predictions.
                votes = 0
                for vec in vecs:
                    votes += clf.predict(
                        test_scaler.transform(vec.reshape(1, -1)))
                pv = 1 if votes > 0 else -1

                # KMeans: classify the single cluster centre of the vectors.
                center = KMeans(n_clusters=1).fit(list(vecs))
                result = test_scaler.transform(
                    np.array(center.cluster_centers_[0]).reshape(1, -1))
                pk = 1 if clf.predict(result) > 0 else -1

                y_test[key] = data[3][key]
                predict[key] = [pv, pk]

            all_predict.append(predict)
            all_labels.append(y_test)
            print("StockNo: {}, {} trade days done".format(stockno, days))
    return all_labels, all_predict
Esempio n. 11
0
    print 'correct_predict =', correct_predict
    print 'precision =', precision
    print 'recall =', recall
    print 'f1_score =', f1_score
    return f1_score


# Load the pre-split dataset.
# NOTE(review): pickle.load executes arbitrary code embedded in the file --
# only use this on a pickle produced by a trusted source.
with open('clcntt_randfix01.pickle', 'rb') as f:
    data = pickle.load(f)
train_label = data['train_label']
train_data = data['train_data']
test_label = data['test_label']
test_data = data['test_data']
column_names = data['col_names']
# Drop the reference to the container once its fields are extracted.
del data

# Fit the boosted model, predict the test set and report through f1Score
# (called as f1Score(predictions, labels)).
trainer = abc(n_estimators=100, learning_rate=0.9).fit(train_data, train_label)
tr_prediction = trainer.predict(test_data)
f1Score(tr_prediction, test_label)
"""
dot_data=StringIO()
tree.export_graphviz(trainer,
                     out_file=dot_data,
                     feature_names=column_names,
                     class_names=['innocent','sin'],
                     filled=True, rounded=True,
                     impurity=False,max_depth=6,rotate=True
                     )
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf("test.pdf")
"""
    print 'finished, file saved : ',excel_name

# Load the dataset bundle.
# NOTE(review): pickle.load executes arbitrary code embedded in the file --
# only use this on a pickle produced by a trusted source.
with open(picklename,'rb') as f:
    data=pickle.load(f)
test_labels =data['test_label']
test_dataset =data['test_data']
train_labels =data['train_label']
train_dataset =data['train_data']
column_names =data['col_names']
submit_dataset =data['submit_data']
submit_custid =data['submit_custid']
# Drop the reference to the container once its fields are extracted.
del data

# Set up the trainer and its variables according to the selected option.
# NOTE(review): no fallback branch is visible here, so an unrecognised
# `trainer_select` leaves `trainer` undefined.
if trainer_select =='adaboost':
    trainer = abc(n_estimators=n_estimators,learning_rate=learning_rate).fit(train_dataset,train_labels)
elif trainer_select =='randomforest':
    trainer = rf(n_estimators=n_estimators).fit(train_dataset,train_labels)
elif trainer_select =='tree': # the tree trainer can also write a pdf file
    trainer = tree.DecisionTreeClassifier().fit(train_dataset,train_labels)
    if ifpdf==1: # render the tree to pdf when requested
        dot_data=StringIO()
        tree.export_graphviz(trainer,
                             out_file=dot_data,
                             feature_names=column_names,
                             class_names=['innocent','sin'],
                             filled=True, rounded=True,
                             impurity=False,max_depth=6,rotate=True
                             )
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        graph.write_pdf(pdfname)
Esempio n. 13
0
##Fourth Model Approach: Boosted Trees (BT) - How does BT compare with RF and the rest of my attempts out-of-the-box? What tuning may be needed?

if (alg3):
    # One-hot encode the two categorical columns in the full, train and test
    # frames.
    df_bst = pd.get_dummies(df, columns=['sales', 'salary'])
    df_train_bst = pd.get_dummies(df_train, columns=['sales', 'salary'])
    df_test_bst = pd.get_dummies(df_test, columns=['sales', 'salary'])

    # Model inputs: the fixed columns plus every 'sales' dummy column present
    # in the training frame.
    feature_cols = [
        'satisfaction_level', 'salary_low', 'salary_medium', 'salary_high',
        'number_project'
    ] + [v for v in df_train_bst.columns if re.search('sales', v) is not None]
    X = df_train_bst[feature_cols]
    # Align the test matrix to the training columns (same set, same order).
    # A dummy column that does not occur in the test split is filled with 0
    # instead of producing a design matrix the fitted model cannot score.
    X_test = df_test_bst.reindex(columns=feature_cols, fill_value=0)
    c3_result = abc(random_state=1234)
    c3_result.fit(X, df_train_bst['left'])

    #Overall  fit is markedly better than previous attempts
    #Decision boundary is implicitly 50% in score() method
    # The header used to read "Random Forest", but this section fits and
    # scores the boosted model.
    print('\nBoosted Trees OOS Results\n')
    print('Average Accuracy ' +
          str(c3_result.score(X_test, df_test_bst['left'])))

    #4-Fold CV done twice each fold to check average OOS accuracy reported above

    rkf = RepeatedKFold(n_splits=4, n_repeats=2, random_state=12883823)
    cv_score_list = []
    for (train, test) in rkf.split(df_bst[[
            'satisfaction_level', 'salary_low', 'salary_medium', 'salary_high',
            'number_project'
Esempio n. 14
0
#
# for i in range(len(test_x)):
#     test_x_vec = ""
#     line = test_x.iloc[i]
#     for j in line.split():
#         j = p.stem(j)
#         test_x_vec += j
#     test_x.iloc[i] = test_x_vec
# =============================================================================

# TF-IDF features: fit the vocabulary on the training text, then reuse it to
# transform the test text.
vectorizer = TfidfVectorizer()
x_train_tfidf = vectorizer.fit_transform(train_x)
x_input_tfidf = vectorizer.transform(test_x)
# Build the model.
# NOTE(review): the original comment said "naive Bayes", but the code builds
# `abc(n_estimators=600)`; elsewhere in this file `abc` aliases
# AdaBoostClassifier -- confirm which model is intended.
model = abc(n_estimators=600)
model.fit(x_train_tfidf, train_y)

# Predict the test set and write one prediction per line to data/res.csv.
predicted = model.predict(x_input_tfidf)
#print(confusion_matrix(predicted, test_y))
np.savetxt("data/res.csv", predicted, delimiter=',', fmt='%s')

#print('Accuracy score: ', format(accuracy_score(test_y, predicted)))
#print('Precision score: ', format(precision_score(test_y, predicted)))
#print('Recall score: ', format(recall_score(test_y, predicted)))
#print('F1 score: ', format(f1_score(test_y, predicted)))

# =============================================================================
# #output some examples
# category_map = {'ham':0, 'spam':1}
Esempio n. 15
0
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.naive_bayes import GaussianNB as gnb
#from sklearn.linear_model import LogisticRegression as lr
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.ensemble import AdaBoostClassifier as abc
from sklearn.ensemble import GradientBoostingClassifier as gbc
#from sklearn.svm import SVC as svc

# One instance per candidate classifier, all with default hyperparameters.
# clf2 (logistic regression) and clf6 (SVC) are disabled together with their
# imports.
clf1 = gnb()
#clf2 = lr()
clf3 = rfc()
clf4 = abc()
clf5 = gbc()
#clf6 = svc()
### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; `train_test_split` now lives in `sklearn.model_selection`. Fall
# back to the old module only on installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split
from sklearn import metrics as mtr
# Hold out 30% of the samples for testing; the fixed seed keeps the split
# reproducible.
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)