from sklearn.model_selection import learning_curve
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import pandas as pd  # data analysis
from util import set_Cabin_type, set_missing_ages, plot_learning_curve, one_hot_encoding

# (1) Load the dataset
data_train = pd.read_csv("data/train.csv")

# (2) Feature engineering - fill in the missing values
data_train, rfr = set_missing_ages(data_train)
data_train = set_Cabin_type(data_train)

# (3) Feature engineering - discretize/factorize the categorical features
df = one_hot_encoding(data_train)

# Select the specific columns used for training
train_df = df.filter(
    regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*'
)
# print(train_df.describe())
train_np = train_df.values  # .as_matrix() was removed from pandas; .values is equivalent

# y is the Survived label
y = train_np[:, 0]
# X holds the feature values
X = train_np[:, 1:]

from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(2)
print(X.shape)
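# The script stops after constructing `poly`; a minimal sketch of a plausible
# next step (the model choice and its parameters are assumptions, not taken
# from the source): expand X with degree-2 terms and fit a logistic regression.
X_poly = poly.fit_transform(X)
clf = linear_model.LogisticRegression(C=1.0, penalty='l2', max_iter=200)
clf.fit(X_poly, y)
print(clf.score(X_poly, y))  # training accuracy, as a quick sanity check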
import numpy as np

import util

train_data_file_path = 'data/train.csv'  # inferred from the parallel test path below
test_data_file_path = 'data/test.csv'
train_label, raw_train_data = util.group_by_visit_number(train_data_file_path)
test_label, raw_test_data = util.group_by_visit_number(test_data_file_path, False)

# Feature sets to be tested
feature_set = [['d'], ['s', 'd'], ['f'], ['s', 'f']]
feature_result = []
for feature in feature_set:
    # Filter the data down to the required features
    pro_train_data = util.process_data(raw_train_data, feature)
    pro_test_data = util.process_data(raw_test_data, feature)
    bag_of_features = util.get_feature_bag(pro_train_data, pro_test_data, {})
    train_data = util.one_hot_encoding(pro_train_data, bag_of_features)
    train_label = np.array(train_label)

    # 5-fold cross-validation
    num_train_data = len(train_data)
    num_fold = 5
    step_size = num_train_data // num_fold  # integer division; / yields a float in Python 3
    result = []
    for i in range(num_fold):
        start_index = i * step_size
        end_index = (i + 1) * step_size
        train_mask = np.ones(num_train_data, dtype=bool)
        test_mask = np.zeros(num_train_data, dtype=bool)
        train_mask[start_index:end_index] = False
        test_mask[start_index:end_index] = True
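        # The loop above builds the fold masks but the snippet cuts off before
        # using them; a minimal sketch of the per-fold step (the
        # LogisticRegression choice is an assumption, not from the source, and
        # train_data is assumed to be a 2-D NumPy array).
        from sklearn.linear_model import LogisticRegression
        fold_clf = LogisticRegression(max_iter=200)
        fold_clf.fit(train_data[train_mask], train_label[train_mask])
        result.append(fold_clf.score(train_data[test_mask], train_label[test_mask]))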
import numpy as np
import tensorflow as tf

import util

# `train` and `test` are dicts with 'features' and 'labels' keys, loaded earlier in the script
X_train, y_train = train['features'], train['labels']
X_test, y_test = test['features'], test['labels']

# Augment the training set with one transformed copy of every image
X_train_transformed = np.zeros_like(X_train)
y_train_transformed = np.zeros_like(y_train)
for i in range(X_train_transformed.shape[0]):
    X_train_transformed[i] = util.transform_image(X_train[i], 20, 10, 5)
    y_train_transformed[i] = y_train[i]
X_train = np.vstack((X_train, X_train_transformed))
y_train = np.hstack((y_train, y_train_transformed))
y_train = y_train.astype(int)

# Normalize the features and one-hot encode the labels
X_train_centered = util.min_max_normalization(X_train)
X_test_centered = util.min_max_normalization(X_test)
y_train, y_test = util.one_hot_encoding(y_train, y_test)

# Hold out 10% of the training data as a dev set
train_features, dev_features, train_labels, dev_labels = util.train_dev_split(X_train_centered, y_train, 0.1)
training_dataset = util.DataSet(train_features, train_labels)
dev_dataset = util.DataSet(dev_features, dev_labels)
testing_dataset = util.DataSet(X_test_centered, y_test)

saver = tf.train.Saver()
best_dev_acc = 1e-10
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())  # initialize_all_variables() is deprecated
    steps_per_epoch = len(train_features) // BATCH_SIZE  # BATCH_SIZE: constant defined earlier
    num_examples = steps_per_epoch * BATCH_SIZE
    training_accuracies = []
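    # The snippet ends as the session opens; a minimal sketch of the epoch loop.
    # Assumptions (not from the source): EPOCHS, the training_operation and
    # accuracy_operation tensors, and the x / y_ placeholders are defined with
    # the graph earlier, and util.DataSet exposes a next_batch() method.
    for epoch in range(EPOCHS):
        for step in range(steps_per_epoch):
            batch_x, batch_y = training_dataset.next_batch(BATCH_SIZE)
            sess.run(training_operation, feed_dict={x: batch_x, y_: batch_y})
        dev_acc = sess.run(accuracy_operation,
                           feed_dict={x: dev_features, y_: dev_labels})
        if dev_acc > best_dev_acc:
            best_dev_acc = dev_acc
            saver.save(sess, 'checkpoints/model')  # keep only the best-on-dev weights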
import numpy as np

import util

# File paths as in the earlier variant
train_data_file_path = 'data/train.csv'
test_data_file_path = 'data/test.csv'
train_label, raw_train_data = util.group_by_visit_number(train_data_file_path)
test_label, raw_test_data = util.group_by_visit_number(test_data_file_path, False)

# Feature set to be tested (only 'd' in this variant)
feature_set = [['d']]
feature_result = []
for feature in feature_set:
    # Filter the data down to the required features
    pro_train_data = util.process_data(raw_train_data, feature)
    pro_test_data = util.process_data(raw_test_data, feature)
    bag_of_features = util.get_feature_bag(pro_train_data, pro_test_data, {})
    train_data = util.one_hot_encoding(
        pro_train_data, bag_of_features, verbose=False, numerical=False)
    train_label = np.array(train_label)

    # 5-fold cross-validation
    num_train_data = len(train_data)
    num_fold = 5
    step_size = num_train_data // num_fold  # integer division for valid slice indices
    result = []
    for i in range(num_fold):
        start_index = i * step_size
        end_index = (i + 1) * step_size
        train_mask = np.ones(num_train_data, dtype=bool)
        test_mask = np.zeros(num_train_data, dtype=bool)
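# Both variants roll the fold masks by hand (note that rows beyond
# num_fold * step_size never land in a test fold). As an aside, sklearn's
# KFold (a standard helper, not used in the source) produces equivalent
# splits more compactly, assuming train_data is a NumPy array:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5)
for train_index, test_index in kf.split(train_data):
    fold_train, fold_test = train_data[train_index], train_data[test_index]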