Example #1
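# (Imports of the project's Data, Plot, and Models helpers, and of pylab,
# are not shown in this excerpt.)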
def main():
    print('---------- Section 1 ----------')
    # Create an instance of the Data class, which stores the CSV data in a DataFrame
    plant_data = Data('nuclear_plants.csv')

    # Optionally normalise the data to scale values (left disabled here)
    # plant_data.norm_data()
    print(f'Size of data: {plant_data.get_size()}')
    print(f'Data Types: \n{plant_data.get_data_type()}')
    print(
        f'Number of Samples for each Category: {plant_data.get_cat_count("Status")}'
    )

    print(f"Mean: \n{plant_data.get_mean()}")
    print(f"Standard Deviation: \n{plant_data.get_stan_dev()}")
    print(f"Minimum: \n{plant_data.get_min()}")
    print(f"Maximum: \n{plant_data.get_max()}")
    print(f"Median: \n{plant_data.get_median()}")
    print(plant_data.get_missing_value_count())
    print(plant_data.get_feature_count())
    print(f"Variance: \n{plant_data.get_variance()}")

    #plt = Plot()
    #print(plt.data_box_plot(plant_data.data, 'Status', 'Vibration_sensor_1'), pylab.show())
    #print(plt.data_density_plot(plant_data.data, 'Status', 'Vibration_sensor_2'), pylab.show())

    # Standardise the data
    plant_data.stand_data()
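    # (stand_data is assumed to rescale each numeric feature to zero mean
    # and unit variance: z = (x - mean) / std.)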
    #print(plant_data.data.head())

    # Convert the categorical Status column to numeric codes
    plant_data.cat_to_num('Status')

    print('---------- Section 3 ----------')
    # Split data into train and test based on target variable Status
    train_x, train_y, test_x, test_y = plant_data.split_data(0.9, 'Status')

    # # for e in epochs:
    # models = Models(train_x, train_y, test_x, test_y)

    # # Create Neural Network
    # models.create_nn_model(500, 0.0001)

    # # Train Neural Network with 500 nodes and 2 hidden layers
    # models.train_nn(True, 150)

    # # Apply test data to NN
    # acc = models.test_nn()
    # print(f'NN Testing Accuracy: {acc}')

    # Create Random forest with 1000 trees and 5 or 50 leaf nodes
    # models.create_rf_model(5)

    # Apply test data to random forest

    #plt.tree_count_plot(nodes, train_acc, test_acc)
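
    # A minimal stand-in for the commented-out Models pipeline above, using
    # scikit-learn directly (a sketch: RandomForestClassifier and these
    # parameter values are illustrative, not the project's Models API):
    from sklearn.ensemble import RandomForestClassifier
    rf = RandomForestClassifier(n_estimators=1000, max_leaf_nodes=5)
    rf.fit(train_x, train_y)
    print(f'RF Testing Accuracy: {rf.score(test_x, test_y)}')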

    print('---------- Section 4 ----------')
def test_use_dummies_normalize():
    dummies_data = Data("data/train.csv", use_dummies=True, normalize=True)
    X_train, y_train = dummies_data.processing(["revenue"])
    X_test = dummies_data.processing_test_data()
    assert X_train.shape[1] == X_test.shape[1]

    X_train, y_train = dummies_data.processing(["revenue"])
    X_test = dummies_data.processing_test_data("data/train.csv")
    assert np.array_equal(X_train, X_test)
Example #3
#%%
from utils import *
from utils.metrics import regression_report
from data_processing import Data, evaluate_by_label, fill_label

import pandas as pd
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

if __name__ == "__main__":
    # data
    data = Data(use_dummies=False, normalize=False)
    X_train_df, X_test_df, y_train_df, y_test_df = data.train_test_split_by_date(
        ["revenue"], test_ratio=0.3)
    X_train, X_test, y_train, y_test = (
        X_train_df.to_numpy(),
        X_test_df.to_numpy(),
        y_train_df["revenue"].to_numpy(),
        y_test_df["revenue"].to_numpy(),
    )
    print(f"X_train shape {X_train.shape}, y_train shape {y_train.shape}")
    print(f"X_test shape {X_test.shape}, y_test shape {y_test.shape}")

    #%% evaluate performance with training data
    eval_reg = HistGradientBoostingRegressor(random_state=1129)
    eval_reg.fit(X_train, y_train)

    print("-" * 10, "regression report", "-" * 10)
    report = regression_report(y_test, eval_reg.predict(X_test),
                               X_test.shape[1])
    print(report)
    pred_df = X_test_df.copy()
    pred_df["pred_revenue"] = revenue_pred
    pred_label_df = data.to_label(pred_df)
    true_label_df = pd.read_csv("data/revenue_per_day.csv",
                                index_col="arrival_date")

    report.append("[ label evaluation ]")
    report.append(evaluate_by_label2(pred_label_df, true_label_df, "label"))
    report.append("[ revenue_per_day evaluation ]")
    report.append(evaluate_by_label2(pred_label_df, true_label_df, "revenue"))
    return "\n".join(report) + "\n"


#%% data
data = Data(use_dummies=False, normalize=False)
X_train_df, X_test_df, y_train_df, y_test_df = data.train_test_split_by_date(
    ["revenue", "is_canceled", "adr"], test_ratio=0.3)
print(f"X_train shape {X_train_df.shape}, y_train shape {y_train_df.shape}")
print(f"X_test shape {X_test_df.shape}, y_test shape {y_test_df.shape}")

report = main(
    HistGradientBoostingRegressor,
    X_train_df,
    X_test_df,
    y_train_df,
    y_test_df,
    nsplit=2,
)
clfs, regs = get_models()
print(report)
#%%
from utils import *
from data_processing import Data

# start from here!
if __name__ == "__main__":
    data = Data(use_dummies=False, normalize=False)
    # test classifiers
    X_df, y_df = data.processing(["is_canceled"])
    mlmodelwrapper = MLModelWrapper(X_df.to_numpy(), y_df.to_numpy())
    mlmodelwrapper.quick_test("classifier")

    # test regressors
    X_df, y_df = data.processing(["adr"])
    mlmodelwrapper = MLModelWrapper(X_df.to_numpy(), y_df.to_numpy())
    mlmodelwrapper.quick_test("regressor")

    X_df, y_df = data.processing(["revenue"])
    mlmodelwrapper = MLModelWrapper(X_df.to_numpy(), y_df.to_numpy())
    mlmodelwrapper.quick_test("regressor")
# %%
from utils import *
from data_processing import Data

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

#%%
data = Data(use_dummies=False, normalize=False)
X_df, y_df = data.processing(target="is_canceled")
X_np, y_np = X_df.to_numpy(), y_df.to_numpy()
print(f"X_np's shape: {X_np.shape}")
print(f"y_np's shape: {y_np.shape}")

train_loader, val_loader, test_loader = LoadData(
    X_y=(X_np, y_np),
    X_y_dtype=("float", "float")).get_dataloader([0.7, 0.2, 0.1],
                                                 batch_size=64)
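
# LoadData is a project helper; the 0.7/0.2/0.1 split it performs can be
# approximated with plain PyTorch along these lines (a sketch, not the
# project's actual API):
from torch.utils.data import TensorDataset, DataLoader, random_split


def make_loaders(X, y, fractions=(0.7, 0.2, 0.1), batch_size=64):
    dataset = TensorDataset(torch.as_tensor(X, dtype=torch.float),
                            torch.as_tensor(y, dtype=torch.float))
    n_train = int(fractions[0] * len(dataset))
    n_val = int(fractions[1] * len(dataset))
    n_test = len(dataset) - n_train - n_val
    parts = random_split(dataset, [n_train, n_val, n_test])
    # shuffle only the training split
    return [DataLoader(part, batch_size=batch_size, shuffle=(i == 0))
            for i, part in enumerate(parts)]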

# %% start from here!
if __name__ == "__main__":
    # setting
    model = BinaryClassificationModel(X_np.shape[1])
    loss_func = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    modelwrapper = ModelWrapper(model, loss_func, optimizer)

    # training
    model = modelwrapper.train(train_loader, val_loader, max_epochs=50)


# --- fragment: tail of a split_train helper fused into this excerpt ---
from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
from sklearn.ensemble import HistGradientBoostingRegressor


def split_train(estimator_class, X_train_df, y_train_df, nsplit):
    # Reconstructed header: only the loop body and `return regs` survive in
    # the source. How the per-split frames are produced is not shown, so a
    # placeholder list stands in for the project's split logic.
    folds = []  # placeholder: [(X_train_df, X_test_df, y_train_df, y_test_df), ...]
    regs = []
    for X_train_df, X_test_df, y_train_df, y_test_df in folds:
        # print(f"X_train shape: {X_train_df.shape}, y_train shape: {y_train_df.shape}")
        # print(f"X_test shape: {X_test_df.shape}, y_test shape: {y_test_df.shape}")
        X_train_df, X_test_df = X_test_df, X_train_df
        y_train_df, y_test_df = y_test_df, y_train_df
        reg, models = cross_train(estimator_class, X_train_df, X_test_df,
                                  y_train_df, y_test_df)
        regs.append((reg, models))

    return regs


nsplit = 3
regressor = HistGradientBoostingRegressor
if __name__ == "__main__":
    data = Data(use_dummies=False, normalize=False)
    X_train_df, X_test_df, y_train_df, y_test_df = data.train_test_split_by_date(
        ["revenue", "is_canceled", "adr"], test_ratio=0.3)
    print(
        f"X_train shape {X_train_df.shape}, y_train shape {y_train_df.shape}")
    print(f"X_test shape {X_test_df.shape}, y_test shape {y_test_df.shape}")

    # data
    X_train, X_test, y_train, y_test = (
        X_train_df.to_numpy(),
        X_test_df.to_numpy(),
        y_train_df["revenue"].to_numpy(),
        y_test_df["revenue"].to_numpy(),
    )
    # training
    regs = split_train(regressor, X_train_df, y_train_df, nsplit)
# %%
from utils import *
from data_processing import Data

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

#%%
data = Data(use_dummies=False)
X_df, y_df = data.processing(target="reservation_status")
X_np, y_np = X_df.to_numpy(), y_df.to_numpy()
reservation_status_cats = data.get_y_cats()
print(f"X_np's shape: {X_np.shape}")
print(f"y_np's shape: {y_np.shape}")

train_loader, val_loader, test_loader = LoadData(
    X_y=(X_np, y_np),
    X_y_dtype=("float", "long")).get_dataloader([0.7, 0.2, 0.1], batch_size=64)

# %% start from here!
if __name__ == "__main__":
    # setting
    model = Input1DModel(X_np.shape[1], len(reservation_status_cats))
    loss_func = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    modelwrapper = ModelWrapper(model, loss_func, optimizer)

    # training
    model = modelwrapper.train(train_loader, val_loader, max_epochs=50)
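
    # ModelWrapper comes from the project's utils; a minimal sketch of the
    # per-epoch loop such a wrapper typically runs (early stopping and
    # validation omitted):
    def train_one_epoch(model, loader, loss_func, optimizer):
        model.train()
        for xb, yb in loader:
            optimizer.zero_grad()
            loss = loss_func(model(xb), yb)
            loss.backward()
            optimizer.step()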


# --- fragment: reporting tail of an SKLearn model-wrapper method ---
# The enclosing class and the start of this method are not shown in the
# source; the class and method names below are assumed so the fragment
# parses, and the datetime import it needs is added.
from datetime import datetime


class _SKLearnReportFragment:
    def _report(self, report):  # hypothetical name for the surviving tail
        report += evaluate(self.model, self.X_df, self.label_df)

        print(report)
        if self.save:
            print(f"*Append result to SKLearn_{self.model_type}s_Report.txt")
            with open(f"SKLearn_{self.model_type}s_Report.txt", "a") as ofile:
                if self.name is not None:
                    ofile.write(f"Method: {self.name}\n")
                ofile.write(f"finished time: {datetime.now()}\n")
                ofile.write(report)
                ofile.write("-" * 20 + "\n")
        print("-" * 20)


# prepare train/test data
data = Data(use_dummies=False, normalize=False)
X_train_df, X_test_df, y_train_df, y_test_df = data.train_test_split_by_date(
    "revenue", test_ratio=0.3)
X_train, X_test, y_train, y_test = (
    X_train_df.to_numpy(),
    X_test_df.to_numpy(),
    y_train_df.to_numpy(),
    y_test_df.to_numpy(),
)
print(f"X_train shape {X_train.shape}")
print(f"X_test shape {X_train.shape}")
print(f"y_train shape {y_train.shape}")
print(f"y_test shape {y_test.shape}")

print(X_test_df.shape)
print(X_train.shape)
from data_processing import Data

import numpy as np

data = Data("data/train.csv")


# test that processing_test_data & processing give the same output
def test_processing_test_data():
    X1, y1 = data.processing(["revenue"])
    X2 = data.processing_test_data("data/train.csv")
    assert np.array_equal(X1, X2)


# test that train and test have the same number of features
def test_test_train_features():
    X_train, y_train = data.processing(["revenue"])
    X_test = data.processing_test_data()
    assert X_train.shape[1] == X_test.shape[1]


# test use_dummies
def test_use_dummies():
    dummies_data = Data("data/train.csv", use_dummies=True)
    X_train, y_train = dummies_data.processing(["revenue"])
    X_test = dummies_data.processing_test_data()
    assert X_train.shape[1] == X_test.shape[1]

    X_train, y_train = dummies_data.processing(["revenue"])
    X_test = dummies_data.processing_test_data("data/train.csv")
    assert np.array_equal(X_train, X_test)
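

# These are plain pytest-style test functions; they can be run with, e.g.:
#   pytest path/to/this_test_file.py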
Example #11
def __init__(self):
    self.data = Data()  # Contains id, text, label variables


# --- fragment: tail of a revenue-prediction helper ---
# The header below is assumed so the fragment parses; reg (an adr regressor)
# and clf (a cancellation classifier) are fitted models passed in from
# elsewhere in the source.
def predict_revenue(reg, clf, X_df):
    adr_pred = reg.predict(X_df.to_numpy())
    canceled_pred = clf.predict_proba(X_df.to_numpy())
    canceled_pred = canceled_pred[:, 1]  # P(is_canceled = 1)

    pred_df = X_df.copy()
    # expected adr = predicted adr weighted by probability of not cancelling
    pred_df["pred_actual_adr"] = adr_pred * (1 - canceled_pred)
    # revenue = total nights stayed * expected adr
    pred_df["pred_revenue"] = (
        pred_df["stays_in_weekend_nights"] +
        pred_df["stays_in_week_nights"]) * pred_df["pred_actual_adr"]

    return pred_df


if __name__ == "__main__":
    # data
    data = Data(use_dummies=True, normalize=False)
    X_train_df, X_test_df, y_train_df, y_test_df = data.train_test_split_by_date(
        ["adr", "is_canceled"], test_ratio=0.3)
    train_df = pd.concat([X_train_df, y_train_df], axis=1)
    created_df = data.duplicate_data((2015, 6, 1), (2016, 3, 31), ratio=1)
    # created_df = data.create_data((2016, 6, 1), (2017, 3, 31), ratio=1, offset=5)
    augmented_df = pd.concat([train_df, created_df[train_df.columns]], axis=0)
    y_train_df = augmented_df[["adr", "is_canceled"]]
    X_train_df = augmented_df.drop(["adr", "is_canceled"], axis=1)

    X_train, X_test, y_train_adr, y_test_adr, y_train_canceled, y_test_canceled = (
        X_train_df.to_numpy(),
        X_test_df.to_numpy(),
        y_train_df["adr"].to_numpy(),
        y_test_df["adr"].to_numpy(),
        y_train_df["is_canceled"].to_numpy(),
Example #13
import time
from typing import List

# (Imports for the Data / SUE / USUE helpers used below are not shown in
# this excerpt.)


# Compute the mean squared error (MSE) between two equal-length lists
def get_mse(list1: List, list2: List):
    length = len(list1)
    mse = 0
    for i in range(length):
        mse += (list1[i] - list2[i]) * (list1[i] - list2[i])
    mse /= length
    return mse
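

# An equivalent vectorised form with NumPy, for comparison (a sketch):
import numpy as np


def get_mse_np(list1, list2):
    a, b = np.asarray(list1), np.asarray(list2)
    return float(np.mean((a - b) ** 2))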


if __name__ == '__main__':
    time1 = time.time()
    # data = Data("data/test.txt")
    # data = Data("./data/bank-additional-full.csv")
    data = Data("./data/kosarak.dat")


    # print(data.true_p)
    # data.show_data_information()
    # epsilon = 1
    # xn = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    # xs = [10, 11, 12]
    # sue = SUE(epsilon, data.domain, data.data)
    # usue = USUE(epsilon, data.domain, data.data, xs, xn)
    # sue_mse = 0
    # usue_mse = 0
    # sue.run()
    # usue.run()
    # usue.estimate(usue.per_data)
    # for _ in range(100):
# %%
from utils import *
from data_processing import Data

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import pandas as pd

#%%
data = Data(use_dummies=False, normalize=False)
X_df, y_df = data.processing(["actual_adr"])
X_np, y_np = X_df.to_numpy(), y_df.to_numpy()
print(f"X_np's shape: {X_np.shape}")
print(f"y_np's shape: {y_np.shape}")

#%%
train_loader, val_loader, test_loader = LoadData(
    X_y=(X_np, y_np),
    X_y_dtype=("float", "float")).get_dataloader([0.65, 0.15, 0.2],
                                                 batch_size=128)

# %% start from here!
if __name__ == "__main__":
    # setting
    model = Input1DModelSimplified(X_np.shape[1], 1)
    loss_func = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    modelwrapper = ModelWrapper(model, loss_func, optimizer)
import unittest
from data_processing import Data

data = Data()


class MyTestCase(unittest.TestCase):
    def test_stemming(self):
        """
        Test whether stemming works correctly
        :return: None
        """
        pre_stem = data.train_tweet[0]
        data.stem()
        post_stem = data.train_tweet[0]
        self.assertNotEqual(len(pre_stem), len(post_stem))
        self.assertNotEqual(pre_stem, post_stem, "STEM: Successful")

    def test_bigrams(self):
        "Test that bigrams is instantiated correctly"
        bigrams = data.bigrams(data.train_tweet)
        sample_frame = bigrams[0]
        self.assertTrue(type(sample_frame[0]) is tuple)
        self.assertEqual(len(sample_frame[0]), 2)


if __name__ == '__main__':
    unittest.main()
#%%
from utils import *
from utils.metrics import regression_report
from data_processing import Data, evaluate_by_label, fill_label

import pandas as pd
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

# data
data = Data(use_dummies=False, normalize=False)
X_train_df, X_test_df, y_train_df, y_test_df = data.train_test_split_by_date(
    ["actual_adr"], test_ratio=0.3)
train_df = pd.concat([X_train_df, y_train_df], axis=1)
created_df = data.create_data((2016, 9, 1), (2017, 3, 31), ratio=0.1, offset=5)
augmented_df = pd.concat([train_df, created_df[train_df.columns]], axis=0)
y_train_df = augmented_df[["actual_adr"]]
X_train_df = augmented_df.drop(["actual_adr"], axis=1)

#%%
X_train, X_test, y_train, y_test = (
    X_train_df.to_numpy(),
    X_test_df.to_numpy(),
    y_train_df["actual_adr"].to_numpy(),
    y_test_df["actual_adr"].to_numpy(),
)
print(f"X_train shape {X_train.shape}, y_train shape {y_train.shape}")
print(f"X_test shape {X_test.shape}, y_test shape {y_test.shape}")

#%% evaluate performance with training data
eval_reg = HistGradientBoostingRegressor(random_state=1126)