return (mse(y_t, y_p))**0.5 data_list = [ 'All Districts', 'Cluster 0', 'Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4', 'Cluster 5', 'Ariyalur', 'Chennai', 'Coimbatore', 'Cuddalore', 'Dharmapuri', 'Dindigul', 'Erode', 'Kancheepuram', 'Karur', 'Madurai', 'Nagapattinam', 'Namakkal', 'Perambalur', 'Pudukkottai', 'Ramanathapuram', 'Salem', 'Sivaganga', 'Thanjavur', 'Theni', 'The Nilgiris', 'Thiruvallur', 'Thiruvarur', 'Thoothukkudi', 'Tiruchirapalli', 'Tirunelveli', 'Tiruvannamalai', 'Vellore', 'Viluppuram', 'Virudhunagar' ] parameters = pd.read_csv( 'C:\\Users\\Preetham G\\Documents\\Research Projects\\Forecast of Rainfall Quantity and its variation using Envrionmental Features\\Results\\Parameters\\Parameters.csv' ) rkf = rkf(n_splits=10, n_repeats=10) #columns for result dl = [] m = [] mse_ts = [] rmse_ts = [] mae_ts = [] mdae_ts = [] evs_ts = [] r2_ts = [] #iterating through datas method = [ 'Multiple Linear Regression', 'Support Vector Regression', 'Decision Tree Regression', 'Polynomial Regression' ] for i in data_list:
from sklearn.model_selection import RepeatedKFold as rkf
from sklearn.model_selection import train_test_split as tts

# Location of the dataset files (machine-specific absolute path) and the
# number of files in each of the two groups.
DATA_FOLDER = r"E:\ML Projects\Tennessee Eastman\TE_process_dataset"
NUM_FILES = 22

# Build the file paths: the index is zero-padded to two digits, giving
# d00.dat ... d21.dat for training and d00_te.dat ... d21_te.dat for test.
training_files = [
    os.path.join(DATA_FOLDER, "d{:0>2}.dat".format(file_no))
    for file_no in range(NUM_FILES)
]
test_files = [
    os.path.join(DATA_FOLDER, "d{:0>2}_te.dat".format(file_no))
    for file_no in range(NUM_FILES)
]

# Read every file into its own DataFrame, keeping the file order.
# NOTE(review): no `header=` is given, so pandas' default header inference
# applies, and `sep=' '` treats each single space as a delimiter -- confirm
# both match the actual layout of the .dat files.
training_data = [
    pd.read_csv(path, sep=' ', index_col=None) for path in training_files
]
test_data = [pd.read_csv(path, sep=' ') for path in test_files]

# -----------------------------------------------------------------------------
# Rationale (original author's note): the training data per fault is
# relatively small (stated as 490 samples), so a K-fold strategy per fault is
# considered necessary. Carving a validation set out of the training data, in
# conjunction with the K-fold, is suggested as a way to avoid leaking
# information by tuning on the provided test set.
# -----------------------------------------------------------------------------
seed = 10
split_size = 0.30

# NOTE(review): this rebinds the imported alias `rkf` to the configured
# splitter instance, so the class is no longer reachable under this name.
rkf = rkf(n_splits=5, n_repeats=10, random_state=seed)