def cv(data, target, multivariant=False):
    """Hold out 20% of *data* and fit a regression model on the rest.

    With ``multivariant=False`` (the default) a plain linear regression is
    fitted; otherwise a multivariate model with a ridge cost is used.
    Results are reported by the called regression helpers; nothing is
    returned.
    """
    # Fixed random_state keeps the 80/20 split reproducible across runs.
    split = cross_validation.train_test_split(
        data, target, test_size=0.2, random_state=0)
    x_tr, x_te, y_tr, y_te = split
    # NOTE: the original compares identity with False, so any value other
    # than the exact singleton False selects the multivariate branch.
    if multivariant is not False:
        linear_regression_multivariant(
            x_tr, x_te, y_tr, y_te, cost_fun='Ridge_Regression')
    else:
        linear_regression(x_tr, x_te, y_tr, y_te, plot=False)
def cv(data, target, multivariant=False):
    """Hold out 10% of *data* and evaluate regression models on it.

    With ``multivariant=False`` (the default) a single plain linear
    regression is fitted; otherwise every configured multivariate cost
    function is tried in turn.  The called helpers do the reporting;
    nothing is returned.
    """
    # Fixed random_state keeps the 90/10 split reproducible across runs.
    x_tr, x_te, y_tr, y_te = cross_validation.train_test_split(
        data, target, test_size=0.1, random_state=10)
    if multivariant is False:
        linear_regression(x_tr, x_te, y_tr, y_te, plot=False)
    else:
        # Same call repeated for each candidate cost function, in the
        # original order.
        for cost in ("ordinary_least_squares",
                     "Ridge_Regression",
                     "Bayesian_Regression",
                     "SVR",
                     "KNN_Reg"):
            linear_regression_multivariant(
                x_tr, x_te, y_tr, y_te, cost_fun=cost)
def avg(training_file, submission_file, output_file):
    """Train per-target regressors on average-based features and fill a
    submission file.

    Reads *training_file*, builds average-map features per chunk/hour/
    weekday, trains one classifier per target column, then replaces every
    placeholder '0' cell in *submission_file* with a prediction and writes
    the result to *output_file*.  Relies on the project-local ``utilities``,
    ``preprocess``, ``feature_extraction`` and ``regression`` modules.
    """
    data = utilities.read_file(training_file)
    # Split raw rows into train / cross-validation portions by chunk.
    train_data, cv_data = preprocess.get_train_cv_data_by_chunk(data)
    targets_train, targets_cv = preprocess.get_train_cv_targets(
        train_data, cv_data)
    # Average lookup maps keyed by chunk, hour and weekday.
    (chunk_avg, hour_avg_by_chunk, weekday_avg_by_chunk,
     hour_avg, weekday_avg) = feature_extraction.get_avg_maps(train_data)
    x_train_all, x_cv_all = feature_extraction.get_x_by_avg(
        train_data, cv_data, chunk_avg, hour_avg_by_chunk,
        weekday_avg_by_chunk, hour_avg, weekday_avg)
    # NOTE(review): this first result is immediately overwritten below —
    # the random-forest models are the ones actually used for prediction;
    # the linear-regression call presumably only reports CV metrics as a
    # side effect.  Confirm before removing.
    clfs = regression.linear_regression(
        x_train_all, x_cv_all, targets_train, targets_cv)
    clfs = regression.random_forest(
        x_train_all, x_cv_all, targets_train, targets_cv)
    print 'Filling submission file...'
    sub_data = utilities.read_file(submission_file, True)
    # Row 0 is the header; data rows start at index 1.
    for i in range(1, len(sub_data)):
        chunk_id = sub_data[i][1]
        hour = sub_data[i][3]
        # Weekday is not available in the submission rows; pass it empty —
        # presumably get_features falls back to the non-weekday averages.
        weekday = ''
        all_features = feature_extraction.get_features(
            chunk_id, weekday, hour, chunk_avg, hour_avg_by_chunk,
            weekday_avg_by_chunk, hour_avg, weekday_avg)
        # Target columns start at index 5; a literal '0' marks a cell the
        # model must fill in.
        for j in range(5, len(sub_data[i])):
            if sub_data[i][j] == '0':
                # Assemble the feature vector for target column j from the
                # per-target feature lists.
                feature = []
                for f in all_features:
                    feature.append(f[j - 5])
                # One trained classifier per target column.
                sub_data[i][j] = clfs[j - 5].predict([feature])[0]
    utilities.write_file(output_file, sub_data)
# Script: sample the airfoil self-noise dataset, split it, fit a linear
# regression and report its test error.  Uses the project-local
# ``regression`` and ``load_data`` modules.
import regression as reg
import load_data as ld

# Draw a sample (factor/size argument 10 — semantics defined by
# load_data.create_sample; confirm against that module).
ld.create_sample('airfoil_self_noise_.csv','new_file.csv', 10)
# Split the sampled file 75/25 into train_set.csv / test_set.csv
# (presumably — verify the percentage semantics in load_data.createSets).
ld.createSets('new_file.csv', 75)
# Fit on the training split; b_vector holds the learned coefficients.
b_vector = reg.linear_regression('train_set.csv')
# Evaluate the fitted coefficients on the held-out split.
error = reg.test("test_set.csv", b_vector)
#print(data_list[i][0:-2]) character.append(data_list[i][:-1]) label.append(data_list[i][-1]) return np.array(character), np.array(label) if __name__ == '__main__': file_path = '/Users/yuqishi/Documents/machine_learning/data/波士顿房价数据集/housing_data.txt' split = 450 #获得训练样本和测试样本 train_data, test_data = get_Data(file_path, split) #获得训练样本的特征和标签数组 train_X, train_Y = split_data(train_data) #获得测试数组X的特征和标签数组 test_X, test_Y = split_data(test_data) #将训练集中的数据训练求得w参数值 w = linear_regression(train_X, train_Y) print(predict(test_X, test_Y, w))