Example #1
# Sample and generate the training and test sets
# mode 0 - nqso are s82 std stars (deprecated!)
# mode 1 - nqso with i-band filter applied, plus the dr7 quasar catalog appended (deprecated!)
# mode 2 - our own ad-hoc mix (deprecated!)
# mode 3 - all sources with i_band < 19.0
# mode 4 - spectroscopically confirmed (clean)
train_data, train_label, test_data, test_label = load_data_set(
    mode, feature_mode, seed)

# 1 - PCA preprocessing - dimensionality reduction (deprecated!)
# pca, new_data = do_PCA(train_data)
new_data = train_data

# 2 - standardize the data format
feature = get_feature(feature_mode)
# feature = list(map(str, range(pca.n_components_)))
X, Y, vec = std_data(new_data, train_label, feature)
Y = Y.reshape(len(Y))

# 3 - decision tree training
dtc = decision_tree(X, Y)
# print_feature_importance(dtc, vec.get_feature_names())

# 4 - random forest training
rfc = random_forest(X, Y)
# print_feature_importance(rfc, vec.get_feature_names())

# 5 - AdaBoost training
# abc = adaptive_boost(X, Y)
# print_feature_importance(abc, vec.get_feature_names())
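
The helpers decision_tree and random_forest called above come from this project's own module and are not shown in the example. Below is a minimal sketch of what they are assumed to do, wrapping the corresponding scikit-learn classifiers; the hyperparameters are illustrative placeholders, not the project's actual settings.

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


def decision_tree(X, Y):
    # assumed: fit a single decision tree on the vectorized features
    dtc = DecisionTreeClassifier(max_depth=10, random_state=42)
    dtc.fit(X, Y)
    return dtc


def random_forest(X, Y):
    # assumed: fit a random forest classifier on the same features
    rfc = RandomForestClassifier(n_estimators=100, random_state=42)
    rfc.fit(X, Y)
    return rfc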
Example #2
    return count, folder


if __name__ == '__main__':
    num, folder = count_file()
    to_save = list()
    length = list()
    for i in range(1, num):
        file_name = get_txt(folder, 40, 17, i)
        my = np.loadtxt(file_name + '.txt', skiprows=0)
        my = da.get_sig(my)
        length.append(len(my))
        # optionally plot each extracted signal:
        # plt.figure()
        # plt.plot(my)
        # plt.show()
        features = da.get_feature(my)
        #features.append(i)
        to_save.append(features)
        #print(features)
    to_save = array(to_save)
    #print(to_save)
    print(length)
    print(sum(length) - length[0])
    j = 0
    file_name = get_txt(folder, 40, 17, j)
    momsig = np.loadtxt(file_name + '.txt', skiprows=0)
    momsig = da.get_sig(momsig)
    print("features extraction done...\n\n")

    plt.figure()
    plt.plot(momsig, 'b')
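
count_file and get_txt are helpers from the surrounding script that are not included in this excerpt. The sketch below shows the behavior assumed here, counting the .txt data files in a folder and building one record's base file name; the directory layout and name format are guesses, not the project's actual convention.

import os


def count_file(folder='./data'):
    # assumed: return the number of .txt files and the folder they live in
    count = len([f for f in os.listdir(folder) if f.endswith('.txt')])
    return count, folder


def get_txt(folder, a, b, i):
    # assumed: build the base name (without extension) of the i-th record;
    # the meaning of the numeric arguments (40, 17 above) is not documented
    return os.path.join(folder, '%d_%d_%d' % (a, b, i))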
# -*- coding: utf-8 -*-

# 
# prediction of unlabeled sources in s82 variable source catalog
# author: topol @ USTC
# last modified: 2019/3/30
#
from data_util import std_data, get_feature, load_data
from sklearn.externals import joblib
import pandas as pd

rfc = joblib.load('./model/rf_4_all.m')
unlabeled_data, unlabeled_label, unlabeled_ID = load_data(filename='./train/other_sample_data4', mode='all')
feature = get_feature()
X_unlabeled, Y_unlabeled, vec_unlabeled = std_data(unlabeled_data, unlabeled_label, feature)
Y_predict = rfc.predict(X_unlabeled)

out_file = open('./result/mode_4_predict', "w+")
for i in zip(unlabeled_ID, Y_predict):
    print(int(float(i[0])), i[1], file=out_file)

print(pd.value_counts(Y_predict))
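
This snippet imports joblib through sklearn.externals, which matches the 2019-era scikit-learn it was written for; that shim was removed in scikit-learn 0.23. On a newer environment (an assumption about your installed versions, not a change the original author made) the same model load would use the standalone joblib package:

import joblib

rfc = joblib.load('./model/rf_4_all.m')  # same model file, loaded via plain joblib

Similarly, recent pandas releases steer you toward pd.Series(Y_predict).value_counts() instead of the top-level pd.value_counts.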
    # 'param' and 'range' should be parallel lists of parameter names and
    # their candidate values (note: 'range' shadows the Python builtin here)
    param_test = dict(zip(param, range))
    gbc = GradientBoostingClassifier(learning_rate=0.6, n_estimators=65)
    gsearch = GridSearchCV(estimator=gbc,
                           param_grid=param_test,
                           scoring='roc_auc',
                           cv=5)
    gsearch.fit(X, Y)
    # gsearch.grid_scores_, gsearch.best_params_, gsearch.best_score_
    print_best_score(gsearch, param_test)


# 1 - load data
train_data, train_label, test_data, test_label = load_data_set(
    2, 7200, 1100, 42)
feature = get_feature('./train/raw/test_sample_data_1')
X, Y, vec = std_data(train_data, train_label, feature)
Y = Y.reshape(len(Y))

# 2 - do GridSearchCV

# for dtc
# param = ["max_depth", "min_samples_split", "min_samples_leaf", "max_leaf_nodes"]
# param_range = [range(3,15,1), range(2,10,1), range(1,3,1), range(10,50,2)]
# dtc_parameter(param, param_range, X, Y)

# for abc
# param = ['n_estimators', 'learning_rate']
# param_range = [range(50,70,4), [0.1 * i for i in range(5,10,1)]]
# abc_parameter(param, param_range, X, Y)
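
For completeness, tuning the GradientBoostingClassifier wrapper defined at the top of this snippet would follow the same pattern as the dtc/abc calls above. The wrapper's definition line is cut off in this excerpt, so the name gbc_parameter and the ranges below are hypothetical.

# for gbc (hypothetical call, mirroring dtc_parameter/abc_parameter)
# param = ['max_depth', 'min_samples_split']
# param_range = [range(3, 10, 1), range(2, 10, 2)]
# gbc_parameter(param, param_range, X, Y)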