# Example 1
from sklearn.model_selection import learning_curve
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
import pandas as pd  #数据分析
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from util import set_Cabin_type, set_missing_ages, plot_learning_curve, one_hot_encoding
# (1) Load the training dataset.
data_train = pd.read_csv("data/train.csv")

# (2) Feature engineering - fill in missing values.
# set_missing_ages also returns the fitted RandomForestRegressor (rfr)
# so the same model can later be applied to the test set.
data_train, rfr = set_missing_ages(data_train)
data_train = set_Cabin_type(data_train)

# (3) Feature engineering - one-hot encode the categorical features.
df = one_hot_encoding(data_train)

# Keep only the label column plus the encoded feature columns.
train_df = df.filter(
    regex=
    'Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*'
)
#print(train_df.describe())

# BUG FIX: DataFrame.as_matrix() was deprecated in pandas 0.23 and removed
# in 1.0; .values returns the identical ndarray.
train_np = train_df.values
# y is the 'Survived' label (first selected column).
y = train_np[:, 0]
# X holds the remaining feature columns.
X = train_np[:, 1:]

from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(2)
print(X.shape)
test_data_file_path = 'data/test.csv'

# NOTE(review): train_data_file_path is not defined in this snippet --
# presumably assigned earlier in the original file; confirm before running.
train_lable, raw_train_data = util.group_by_visit_number(train_data_file_path)
test_label, raw_test_data = util.group_by_visit_number(test_data_file_path, False)

# Feature subsets to evaluate against each other.
feature_set = [['d'], ['s', 'd'], ['f'], ['s', 'f']]
feature_result = []

for feature in feature_set:
    # Keep only the columns required by this feature subset.
    pro_train_data = util.process_data(raw_train_data, feature)
    pro_test_data = util.process_data(raw_test_data, feature)

    bag_of_features = util.get_feature_bag(pro_train_data, pro_test_data, {})
    train_data = util.one_hot_encoding(pro_train_data, bag_of_features)
    train_lable = np.array(train_lable)

    # 5-fold cross validation.
    num_train_data = len(train_data)
    num_fold = 5
    # BUG FIX: use floor division -- '/' yields a float in Python 3, and a
    # float is invalid both as an np.zeros size and as a slice index below.
    step_size = num_train_data // num_fold
    result = []
    for i in range(num_fold):
        start_index = i * step_size
        end_index = (i + 1) * step_size

        # Boolean masks: hold out fold i for testing, train on the rest.
        train_mask = np.ones(num_train_data, dtype=bool)
        test_mask = np.zeros(num_train_data, dtype=bool)
        train_mask[start_index:end_index] = np.zeros(step_size, dtype=bool)
        test_mask[start_index:end_index] = np.ones(step_size, dtype=bool)
# Example 3
    # Unpack the loaded dicts; 'features'/'labels' are assumed to hold
    # image arrays and integer class ids -- TODO confirm against the loader.
    X_train, y_train = train['features'], train['labels']
    X_test, y_test = test['features'], test['labels']

    # Augment the training set with one transformed copy of every image.
    # NOTE(review): 20, 10, 5 presumably parameterize the jitter applied by
    # util.transform_image (e.g. rotation/translation/shear ranges) --
    # verify against util.
    X_train_transformed = np.zeros_like(X_train)
    y_train_transformed = np.zeros_like(y_train)
    for i in range(X_train_transformed.shape[0]):
        X_train_transformed[i] = util.transform_image(X_train[i], 20, 10, 5)
        y_train_transformed[i] = y_train[i]

    # Stack originals and augmented copies, doubling the training set;
    # labels are cast to int so downstream encoding works on integer ids.
    X_train = np.vstack((X_train, X_train_transformed))
    y_train = np.hstack((y_train, y_train_transformed))
    y_train = y_train.astype(int)

    # Normalize the features, one-hot encode the labels, and carve a 10%
    # dev split out of the training data.
    X_train_centered = util.min_max_normalization(X_train)
    X_test_centered = util.min_max_normalization(X_test)
    y_train, y_test = util.one_hot_encoding(y_train, y_test)
    train_features, dev_features, train_labels, dev_labels = util.train_dev_split(X_train_centered, y_train, 0.1)

    training_dataset = util.DataSet(train_features, train_labels)
    dev_dataset = util.DataSet(dev_features, dev_labels)
    testing_dataset = util.DataSet(X_test_centered, y_test)

    saver = tf.train.Saver()
    # Tiny positive seed so the first measured dev accuracy always improves on it.
    best_dev_acc = 1e-10

    with tf.Session() as sess:
        # NOTE(review): tf.initialize_all_variables() is the deprecated
        # pre-TF-1.0 spelling of tf.global_variables_initializer().
        sess.run(tf.initialize_all_variables())
        # Drop the trailing partial batch so every step sees a full batch.
        steps_per_epoch = len(train_features) // BATCH_SIZE
        num_examples = steps_per_epoch * BATCH_SIZE

        training_accuracies = []
# Example 4
# NOTE(review): train_data_file_path / test_data_file_path are not defined
# in this snippet -- presumably assigned earlier in the original file.
train_lable, raw_train_data = util.group_by_visit_number(train_data_file_path)
test_label, raw_test_data = util.group_by_visit_number(test_data_file_path,
                                                       False)

# Feature subsets to evaluate against each other.
feature_set = [['d'], ['s', 'd'], ['f'], ['s', 'f']]
feature_result = []

for feature in feature_set:
    # Keep only the columns required by this feature subset.
    pro_train_data = util.process_data(raw_train_data, feature)
    pro_test_data = util.process_data(raw_test_data, feature)

    bag_of_features = util.get_feature_bag(pro_train_data, pro_test_data, {})
    train_data = util.one_hot_encoding(pro_train_data, bag_of_features)
    train_lable = np.array(train_lable)

    # 5-fold cross validation.
    num_train_data = len(train_data)
    num_fold = 5
    # BUG FIX: use floor division -- '/' yields a float in Python 3, and a
    # float is invalid both as an np.zeros size and as a slice index below.
    step_size = num_train_data // num_fold
    result = []
    for i in range(num_fold):
        start_index = i * step_size
        end_index = (i + 1) * step_size

        # Boolean masks: hold out fold i for testing, train on the rest.
        train_mask = np.ones(num_train_data, dtype=bool)
        test_mask = np.zeros(num_train_data, dtype=bool)
        train_mask[start_index:end_index] = np.zeros(step_size, dtype=bool)
        test_mask[start_index:end_index] = np.ones(step_size, dtype=bool)
# NOTE(review): train_data_file_path / test_data_file_path are not defined
# in this snippet -- presumably assigned earlier in the original file.
train_lable, raw_train_data = util.group_by_visit_number(train_data_file_path)
test_label, raw_test_data = util.group_by_visit_number(test_data_file_path, False)

# Feature subset to evaluate.
feature_set = [['d']]
feature_result = []

for feature in feature_set:
    # Keep only the columns required by this feature subset.
    pro_train_data = util.process_data(raw_train_data, feature)
    pro_test_data = util.process_data(raw_test_data, feature)

    bag_of_features = util.get_feature_bag(pro_train_data, pro_test_data, {})
    train_data = util.one_hot_encoding(
        pro_train_data,
        bag_of_features,
        verbose=False,
        numerical=False)
    train_lable = np.array(train_lable)

    # 5-fold cross validation.
    num_train_data = len(train_data)
    num_fold = 5
    # BUG FIX: use floor division -- '/' yields a float in Python 3, and a
    # float is invalid both as an np.zeros size and as a slice index below.
    step_size = num_train_data // num_fold
    result = []
    for i in range(0, num_fold):
        start_index = i * step_size
        end_index = (i + 1) * step_size

        # Boolean masks: hold out fold i for testing, train on the rest.
        train_mask = np.ones(num_train_data, dtype=bool)
        test_mask = np.zeros(num_train_data, dtype=bool)