def main():
    print('---------- Section 1 ----------')
    # Create an instance of the data class, which stores the csv data in a dataframe
    plant_data = Data('nuclear_plants.csv')

    # Normalise data to scale values
    # plant_data.norm_data()

    print(f'Size of data: {plant_data.get_size()}')
    print(f'Data Types: \n{plant_data.get_data_type()}')
    print(f'Number of Samples for each Category: {plant_data.get_cat_count("Status")}')
    print(f"Mean: \n{plant_data.get_mean()}")
    print(f"Standard Deviation: \n{plant_data.get_stan_dev()}")
    print(f"Minimum: \n{plant_data.get_min()}")
    print(f"Maximum: \n{plant_data.get_max()}")
    print(f"Median: \n{plant_data.get_median()}")
    print(plant_data.get_missing_value_count())
    print(plant_data.get_feature_count())
    print(f"Variance: \n{plant_data.get_variance()}")

    # plt = Plot()
    # print(plt.data_box_plot(plant_data.data, 'Status', 'Vibration_sensor_1'), pylab.show())
    # print(plt.data_density_plot(plant_data.data, 'Status', 'Vibration_sensor_2'), pylab.show())

    # Standardise the data
    plant_data.stand_data()
    # print(plant_data.data.head())

    # Convert the Status column to categorical data
    plant_data.cat_to_num('Status')

    print('---------- Section 3 ----------')
    # Split data into train and test based on target variable Status
    train_x, train_y, test_x, test_y = plant_data.split_data(0.9, 'Status')

    # for e in epochs:
    #     models = Models(train_x, train_y, test_x, test_y)
    #     # Create a neural network with 500 nodes and 2 hidden layers
    #     models.create_nn_model(500, 0.0001)
    #     # Train the neural network for 150 epochs
    #     models.train_nn(True, 150)
    #     # Apply test data to the NN
    #     acc = models.test_nn()
    #     print(f'NN Testing Accuracy: {acc}')

    # Create a random forest with 1000 trees and 5 or 50 leaf nodes
    # models.create_rf_model(5)
    # Apply test data to the random forest
    # plt.tree_count_plot(nodes, train_acc, test_acc)

    print('---------- Section 4 ----------')
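# NOTE: the Data class used above is project code not included in this dump.
# A minimal sketch of what its stand_data() step might do, assuming plain
# z-score standardisation of the numeric columns with pandas (the behaviour
# is inferred from the call site, not confirmed):
import pandas as pd


def stand_data_sketch(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    num_cols = out.select_dtypes("number").columns
    # z-score: subtract each column's mean, divide by its standard deviation
    out[num_cols] = (out[num_cols] - out[num_cols].mean()) / out[num_cols].std()
    return out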
def test_use_dummies_normalize():
    # use the dummies-and-normalize instance, not the module-level `data`
    dummies_data = Data("data/train.csv", use_dummies=True, normalize=True)
    X_train, y_train = dummies_data.processing(["revenue"])
    X_test = dummies_data.processing_test_data()
    assert X_train.shape[1] == X_test.shape[1]

    X_train, y_train = dummies_data.processing(["revenue"])
    X_test = dummies_data.processing_test_data("data/train.csv")
    assert np.array_equal(X_train, X_test)
#%%
from utils import *
from utils.metrics import regression_report
from data_processing import Data, evaluate_by_label, fill_label
import pandas as pd
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

if __name__ == "__main__":
    # data
    data = Data(use_dummies=False, normalize=False)
    X_train_df, X_test_df, y_train_df, y_test_df = data.train_test_split_by_date(
        ["revenue"], test_ratio=0.3)
    X_train, X_test, y_train, y_test = (
        X_train_df.to_numpy(),
        X_test_df.to_numpy(),
        y_train_df["revenue"].to_numpy(),
        y_test_df["revenue"].to_numpy(),
    )
    print(f"X_train shape {X_train.shape}, y_train shape {y_train.shape}")
    print(f"X_test shape {X_test.shape}, y_test shape {y_test.shape}")

    #%% evaluate performance with training data
    eval_reg = HistGradientBoostingRegressor(random_state=1129)
    eval_reg.fit(X_train, y_train)
    print("-" * 10, "regression report", "-" * 10)
    report = regression_report(y_test, eval_reg.predict(X_test), X_test.shape[1])
    print(report)
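# NOTE: regression_report lives in utils.metrics and isn't shown in this dump.
# A minimal sketch of an equivalent report, assuming the third argument is the
# feature count used for adjusted R^2 (an inference from the call site above):
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


def regression_report_sketch(y_true, y_pred, n_features):
    n = len(y_true)
    r2 = r2_score(y_true, y_pred)
    # adjusted R^2 penalises models for using more features
    adj_r2 = 1 - (1 - r2) * (n - 1) / (n - n_features - 1)
    return (f"MAE: {mean_absolute_error(y_true, y_pred):.4f}\n"
            f"MSE: {mean_squared_error(y_true, y_pred):.4f}\n"
            f"R^2: {r2:.4f}, adjusted R^2: {adj_r2:.4f}")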
    pred_df = X_test_df.copy()
    pred_df["pred_revenue"] = revenue_pred
    pred_label_df = data.to_label(pred_df)
    true_label_df = pd.read_csv("data/revenue_per_day.csv",
                                index_col="arrival_date")
    report.append("[ label evaluation ]")
    report.append(evaluate_by_label2(pred_label_df, true_label_df, "label"))
    report.append("[ revenue_per_day evaluation ]")
    report.append(evaluate_by_label2(pred_label_df, true_label_df, "revenue"))
    return "\n".join(report) + "\n"


#%% data
data = Data(use_dummies=False, normalize=False)
X_train_df, X_test_df, y_train_df, y_test_df = data.train_test_split_by_date(
    ["revenue", "is_canceled", "adr"], test_ratio=0.3)
print(f"X_train shape {X_train_df.shape}, y_train shape {y_train_df.shape}")
print(f"X_test shape {X_test_df.shape}, y_test shape {y_test_df.shape}")

report = main(
    HistGradientBoostingRegressor,
    X_train_df,
    X_test_df,
    y_train_df,
    y_test_df,
    nsplit=2,
)
clfs, regs = get_models()
print(report)
#%%
from utils import *
from data_processing import Data

# start from here!
if __name__ == "__main__":
    data = Data(use_dummies=False, normalize=False)

    # test classifiers
    X_df, y_df = data.processing(["is_canceled"])
    mlmodelwrapper = MLModelWrapper(X_df.to_numpy(), y_df.to_numpy())
    mlmodelwrapper.quick_test("classifier")

    # test regressors
    X_df, y_df = data.processing(["adr"])
    mlmodelwrapper = MLModelWrapper(X_df.to_numpy(), y_df.to_numpy())
    mlmodelwrapper.quick_test("regressor")

    X_df, y_df = data.processing(["revenue"])
    mlmodelwrapper = MLModelWrapper(X_df.to_numpy(), y_df.to_numpy())
    mlmodelwrapper.quick_test("regressor")
# %%
from utils import *
from data_processing import Data
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

#%%
X_np, y_np = Data().processing(target="is_canceled",
                               use_dummies=False,
                               normalize=False)
print(f"X_np's shape: {X_np.shape}")
print(f"y_np's shape: {y_np.shape}")

train_loader, val_loader, test_loader = LoadData(
    X_y=(X_np, y_np),
    X_y_dtype=("float", "float")).get_dataloader([0.7, 0.2, 0.1],
                                                 batch_size=64)

# %% start from here!
if __name__ == "__main__":
    # setting
    model = BinaryClassificationModel(X_np.shape[1])
    loss_func = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    modelwrapper = ModelWrapper(model, loss_func, optimizer)

    # training
    model = modelwrapper.train(train_loader, val_loader, max_epochs=50)
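# NOTE: BinaryClassificationModel is defined in utils and not shown here. A
# minimal sketch of a compatible model (layer sizes are assumptions): a small
# MLP ending in a sigmoid, since nn.BCELoss expects probabilities in [0, 1]
# rather than raw logits.
class BinaryClassificationModelSketch(nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid(),  # output a probability for BCELoss
        )

    def forward(self, x):
        return self.net(x).squeeze(-1)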
        )
        # print(f"X_train shape: {X_train_df.shape}, y_train shape: {y_train_df.shape}")
        # print(f"X_test shape: {X_test_df.shape}, y_test shape: {y_test_df.shape}")

        # swap the train and test folds
        X_train_df, X_test_df = X_test_df, X_train_df
        y_train_df, y_test_df = y_test_df, y_train_df
        reg, models = cross_train(estimator_class, X_train_df, X_test_df,
                                  y_train_df, y_test_df)
        regs.append((reg, models))
    return regs


nsplit = 3
regressor = HistGradientBoostingRegressor

if __name__ == "__main__":
    data = Data(use_dummies=False, normalize=False)
    X_train_df, X_test_df, y_train_df, y_test_df = data.train_test_split_by_date(
        ["revenue", "is_canceled", "adr"], test_ratio=0.3)
    print(f"X_train shape {X_train_df.shape}, y_train shape {y_train_df.shape}")
    print(f"X_test shape {X_test_df.shape}, y_test shape {y_test_df.shape}")

    # data
    X_train, X_test, y_train, y_test = (
        X_train_df.to_numpy(),
        X_test_df.to_numpy(),
        y_train_df["revenue"].to_numpy(),
        y_test_df["revenue"].to_numpy(),
    )

    # training
    regs = split_train(regressor, X_train_df, y_train_df, nsplit)
# %%
from utils import *
from data_processing import Data
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

#%%
data = Data(use_dummies=False)
X_df, y_df = data.processing(target="reservation_status")
X_np, y_np = X_df.to_numpy(), y_df.to_numpy()
reservation_status_cats = data.get_y_cats()
print(f"X_np's shape: {X_np.shape}")
print(f"y_np's shape: {y_np.shape}")

train_loader, val_loader, test_loader = LoadData(
    X_y=(X_np, y_np),
    X_y_dtype=("float", "long")).get_dataloader([0.7, 0.2, 0.1],
                                                batch_size=64)

# %% start from here!
if __name__ == "__main__":
    # setting
    model = Input1DModel(X_np.shape[1], len(reservation_status_cats))
    loss_func = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    modelwrapper = ModelWrapper(model, loss_func, optimizer)

    # training
    model = modelwrapper.train(train_loader, val_loader, max_epochs=50)
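# NOTE: Input1DModel comes from utils and isn't shown in this dump. A minimal
# sketch of a compatible multi-class model (layer sizes are assumptions);
# unlike the binary model it returns raw logits, because nn.CrossEntropyLoss
# applies log-softmax internally.
class Input1DModelSketch(nn.Module):
    def __init__(self, in_features, n_classes):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features, 128),
            nn.ReLU(),
            nn.Linear(128, n_classes),  # raw logits; no softmax here
        )

    def forward(self, x):
        return self.net(x)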
        report += evaluate(self.model, self.X_df, self.label_df)
        print(report)
        if self.save:
            print(f"*Append result to SKLearn_{self.model_type}s_Report.txt")
            with open(f"SKLearn_{self.model_type}s_Report.txt", "a") as ofile:
                if self.name is not None:
                    ofile.write(f"Method: {self.name}\n")
                ofile.write(f"finished time: {datetime.now()}\n")
                ofile.write(report)
                ofile.write("-" * 20 + "\n")
        print("-" * 20)


# test classifiers
data = Data(use_dummies=False, normalize=False)
X_train_df, X_test_df, y_train_df, y_test_df = data.train_test_split_by_date(
    "revenue", test_ratio=0.3)
X_train, X_test, y_train, y_test = (
    X_train_df.to_numpy(),
    X_test_df.to_numpy(),
    y_train_df.to_numpy(),
    y_test_df.to_numpy(),
)
print(f"X_train shape {X_train.shape}")
print(f"X_test shape {X_test.shape}")
print(f"y_train shape {y_train.shape}")
print(f"y_test shape {y_test.shape}")
print(X_test_df.shape)
print(X_train.shape)
from data_processing import Data
import numpy as np

data = Data("data/train.csv")


# test processing_test_data & processing give same output
def test_processing_test_data():
    X1, y1 = data.processing(["revenue"])
    X2 = data.processing_test_data("data/train.csv")
    assert np.array_equal(X1, X2)


# test that train and test have the same number of features
def test_test_train_features():
    X_train, y_train = data.processing(["revenue"])
    X_test = data.processing_test_data()
    assert X_train.shape[1] == X_test.shape[1]


# test use_dummies
def test_use_dummies():
    # use the dummies instance, not the module-level `data`
    dummies_data = Data("data/train.csv", use_dummies=True)
    X_train, y_train = dummies_data.processing(["revenue"])
    X_test = dummies_data.processing_test_data()
    assert X_train.shape[1] == X_test.shape[1]

    X_train, y_train = dummies_data.processing(["revenue"])
    X_test = dummies_data.processing_test_data("data/train.csv")
    assert np.array_equal(X_train, X_test)
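# These are pytest-style tests: module-level test_* functions with bare
# asserts, so the file can be collected and run with a plain `pytest`
# invocation from the project root.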
    def __init__(self):
        self.data = Data()  # Contains id, text, label variables
    adr_pred = reg.predict(X_df.to_numpy())
    canceled_pred = clf.predict_proba(X_df.to_numpy())
    canceled_pred = canceled_pred[:, 1]

    pred_df = X_df.copy()
    # expected adr: a cancelled booking realises 0 revenue, so
    # E[actual_adr] = adr_pred * P(not cancelled) = adr_pred * (1 - canceled_pred)
    pred_df["pred_actual_adr"] = adr_pred * (1 - canceled_pred)
    # revenue = expected adr times the total number of nights stayed
    pred_df["pred_revenue"] = (
        pred_df["stays_in_weekend_nights"] +
        pred_df["stays_in_week_nights"]) * pred_df["pred_actual_adr"]
    return pred_df


if __name__ == "__main__":
    # data
    data = Data(use_dummies=True, normalize=False)
    X_train_df, X_test_df, y_train_df, y_test_df = data.train_test_split_by_date(
        ["adr", "is_canceled"], test_ratio=0.3)

    # augment the training set with duplicated rows from the given date range
    train_df = pd.concat([X_train_df, y_train_df], axis=1)
    created_df = data.duplicate_data((2015, 6, 1), (2016, 3, 31), ratio=1)
    # created_df = data.create_data((2016, 6, 1), (2017, 3, 31), ratio=1, offset=5)
    augmented_df = pd.concat([train_df, created_df[train_df.columns]], axis=0)
    y_train_df = augmented_df[["adr", "is_canceled"]]
    X_train_df = augmented_df.drop(["adr", "is_canceled"], axis=1)

    X_train, X_test, y_train_adr, y_test_adr, y_train_canceled, y_test_canceled = (
        X_train_df.to_numpy(),
        X_test_df.to_numpy(),
        y_train_df["adr"].to_numpy(),
        y_test_df["adr"].to_numpy(),
        y_train_df["is_canceled"].to_numpy(),
        y_test_df["is_canceled"].to_numpy(),
    )
# Compute the mean squared error between two lists
def get_mse(list1: List, list2: List):
    length = len(list1)
    mse = 0
    for i in range(length):
        mse += (list1[i] - list2[i]) * (list1[i] - list2[i])
    mse /= length
    return mse


if __name__ == '__main__':
    time1 = time.time()
    # data = Data("data/test.txt")
    # data = Data("./data/bank-additional-full.csv")
    data = Data("./data/kosarak.dat")
    # print(data.true_p)
    # data.show_data_information()
    # epsilon = 1
    # xn = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    # xs = [10, 11, 12]
    # sue = SUE(epsilon, data.domain, data.data)
    # usue = USUE(epsilon, data.domain, data.data, xs, xn)
    # sue_mse = 0
    # usue_mse = 0
    # sue.run()
    # usue.run()
    # usue.estimate(usue.per_data)
    # for _ in range(100):
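# The get_mse loop above works on plain lists; a vectorised equivalent
# (assuming NumPy is available to this project) computes the same mean
# squared error:
import numpy as np


def get_mse_np(list1, list2):
    a = np.asarray(list1, dtype=float)
    b = np.asarray(list2, dtype=float)
    return float(np.mean((a - b) ** 2))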
# %%
from utils import *
from data_processing import Data
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pandas as pd

#%%
data = Data(use_dummies=False, normalize=False)
X_df, y_df = data.processing(["actual_adr"])
X_np, y_np = X_df.to_numpy(), y_df.to_numpy()
print(f"X_np's shape: {X_np.shape}")
print(f"y_np's shape: {y_np.shape}")

#%%
train_loader, val_loader, test_loader = LoadData(
    X_y=(X_np, y_np),
    X_y_dtype=("float", "float")).get_dataloader([0.65, 0.15, 0.2],
                                                 batch_size=128)

# %% start from here!
if __name__ == "__main__":
    # setting
    model = Input1DModelSimplified(X_np.shape[1], 1)
    loss_func = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    modelwrapper = ModelWrapper(model, loss_func, optimizer)
import unittest
from data_processing import Data

data = Data()


class MyTestCase(unittest.TestCase):
    def test_stemming(self):
        """
        Test whether stemming works correctly
        :return: None
        """
        pre_stem = data.train_tweet[0]
        data.stem()
        post_stem = data.train_tweet[0]
        self.assertNotEqual(len(pre_stem), len(post_stem))
        self.assertNotEqual(pre_stem, post_stem,
                            "stemming left the tweet unchanged")

    def test_bigrams(self):
        """Test that bigrams are generated correctly"""
        bigrams = data.bigrams(data.train_tweet)
        sample_frame = bigrams[0]
        self.assertIsInstance(sample_frame[0], tuple)
        self.assertEqual(len(sample_frame[0]), 2)


if __name__ == '__main__':
    unittest.main()
#%%
from utils import *
from utils.metrics import regression_report
from data_processing import Data, evaluate_by_label, fill_label
import pandas as pd
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

# data
data = Data(use_dummies=False, normalize=False)
X_train_df, X_test_df, y_train_df, y_test_df = data.train_test_split_by_date(
    ["actual_adr"], test_ratio=0.3)

# augment the training set with rows created from the given date range
train_df = pd.concat([X_train_df, y_train_df], axis=1)
created_df = data.create_data((2016, 9, 1), (2017, 3, 31), ratio=0.1, offset=5)
augmented_df = pd.concat([train_df, created_df[train_df.columns]], axis=0)
y_train_df = augmented_df[["actual_adr"]]
X_train_df = augmented_df.drop(["actual_adr"], axis=1)

#%%
X_train, X_test, y_train, y_test = (
    X_train_df.to_numpy(),
    X_test_df.to_numpy(),
    y_train_df["actual_adr"].to_numpy(),
    y_test_df["actual_adr"].to_numpy(),
)
print(f"X_train shape {X_train.shape}, y_train shape {y_train.shape}")
print(f"X_test shape {X_test.shape}, y_test shape {y_test.shape}")

#%% evaluate performance with training data
eval_reg = HistGradientBoostingRegressor(random_state=1126)