def make_si(feat: pd.DataFrame, cols: list, transform: bool = True):
    """
    Fit (and optionally apply) a SimpleImputer that replaces NaNs with "Other".

    Args:
        feat (pd.DataFrame): Feature table to impute.
        cols (list): Column name(s) to impute; a bare string is accepted.
        transform (bool, optional): If True, also overwrite the columns in
            `feat` with the imputed values. Defaults to True.

    Returns:
        tuple: (feat, si) — the (possibly updated) DataFrame and the fitted
        SimpleImputer.
    """
    # Imports
    from src.utils import assertions as a
    from sklearn.impute import SimpleImputer
    import numpy as np

    # Assertions
    assert a.all_dataframe(feat)
    # Accept a single column name by promoting it to a one-element list
    if isinstance(cols, str):
        cols = [cols]
    assert a.all_str(cols)
    assert a.all_in(cols, feat.columns)
    assert a.all_bool(transform)

    # Instantiations
    si = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="Other")

    # Do work: always fit; only write back when transform is requested.
    # NOTE: assignment mutates the caller's DataFrame in place.
    if transform:
        feat[cols] = si.fit_transform(feat[cols])
    else:
        si.fit(feat[cols])

    # Return
    return feat, si
def make_ohe(feat: pd.DataFrame, cols: list, transform: bool = True):
    """
    Fit (and optionally apply) a OneHotEncoder over the given columns.

    When `transform` is True the encoded columns replace the originals in
    `feat`; otherwise the encoder is only fitted.

    Args:
        feat (pd.DataFrame): Feature table to encode.
        cols (list): Column name(s) to one-hot encode; a bare string is
            accepted.
        transform (bool, optional): If True, also transform `feat`.
            Defaults to True.

    Returns:
        tuple: (feat, ohe) — the (possibly updated) DataFrame and the fitted
        OneHotEncoder.
    """
    # Imports
    from src.utils import assertions as a
    from sklearn.preprocessing import OneHotEncoder

    # Assertions
    assert a.all_dataframe(feat)
    # Fixed: coerce a bare string BEFORE the element-wise checks, matching
    # make_si; the original asserted all_str on the raw string first.
    if isinstance(cols, str):
        cols = [cols]
    assert a.all_str(cols)
    assert a.all_in(cols, feat.columns)
    assert a.all_bool(transform)

    # Instantiations
    ohe = OneHotEncoder(sparse=False)

    # Do work
    data = feat[cols]
    if transform:
        data = ohe.fit_transform(data)
        # Fixed: preserve the original index so the concat below aligns
        # row-by-row. With a default RangeIndex, any DataFrame whose index
        # is not 0..n-1 would gain NaN-filled rows after pd.concat.
        data = pd.DataFrame(data, index=feat.index)
        data.columns = ohe.get_feature_names(cols)
        feat.drop(cols, axis=1, inplace=True)  # NOTE: mutates the caller's frame
        feat = pd.concat([feat, data], axis=1)
    else:
        ohe.fit(data)

    # Return
    return feat, ohe
def scale_features(feat: pd.DataFrame, cols: list = None, transform: bool = True):
    """
    Fit (and optionally apply) a StandardScaler to the given columns.

    Args:
        feat (pd.DataFrame): Feature table to scale.
        cols (list, optional): Column name(s) to scale; a bare string is
            accepted. When falsy, every column in `feat` is scaled.
        transform (bool, optional): If True, also overwrite the columns in
            `feat` with the scaled values. Defaults to True.

    Returns:
        tuple: (feat, sc) — the (possibly updated) DataFrame and the fitted
        StandardScaler.
    """
    # Imports
    from src.utils import assertions as a
    from sklearn.preprocessing import StandardScaler

    # Assertions (column checks only apply when cols were supplied)
    assert a.all_dataframe(feat)
    if cols:
        if isinstance(cols, str):
            cols = [cols]
        assert a.all_str(cols)
        assert a.all_in(cols, feat.columns)
    assert a.all_bool(transform)

    # Default to every column when none were specified
    cols = cols if cols else feat.columns

    # Instantiations
    sc = StandardScaler()

    # Do work: always fit; only write back when transform is requested
    if transform:
        feat[cols] = sc.fit_transform(feat[cols])
    else:
        sc.fit(feat[cols])

    # Return
    return feat, sc
def plot_network_training(metrics: dict):
    """
    Live-plot training/validation accuracy and loss against epochs.

    Clears the previous notebook output (for in-place redrawing during
    training) and draws two stacked subplots: accuracy on top, loss below.

    Args:
        metrics (dict): Must contain the keys "accu_trn", "loss_trn",
            "accu_val" and "loss_val", each mapping to a per-epoch sequence.

    Returns:
        None. Returns early without plotting until at least 2 epochs exist.
    """
    # Imports
    from IPython.display import clear_output
    import numpy as np
    import matplotlib.pyplot as plt
    from src.utils import assertions as a

    # Assertions
    assert isinstance(metrics, dict)
    assert a.all_in(["accu_trn", "loss_trn", "accu_val", "loss_val"], list(metrics.keys()))

    # Epoch count is the length of any one metric series; need >= 2 points
    epoch = len(next(iter(metrics.values())))
    if epoch < 2:
        return None

    # Clear the previous frame so the chart updates in place
    clear_output(wait=True)

    # X-axis: 1-based epoch numbers
    xs = np.arange(1, epoch + 1)

    plt.figure(figsize=(8, 8))

    # Top subplot: accuracy
    plt.subplot(2, 1, 1)
    plt.plot(xs, metrics.get("accu_trn"), label="Training Accuracy")
    plt.plot(xs, metrics.get("accu_val"), label="Validation Accuracy")
    plt.legend(loc="best")
    plt.title("Accuracy [Epoch {}]".format(epoch))
    plt.ylim([0, 1.1])
    plt.ylabel("Accuracy")

    # Bottom subplot: loss
    plt.subplot(2, 1, 2)
    plt.plot(xs, metrics.get("loss_trn"), label="Training Loss")
    plt.plot(xs, metrics.get("loss_val"), label="Validation Loss")
    plt.legend(loc="best")
    plt.title("Loss [Epoch {}]".format(epoch))
    plt.ylabel("Loss")
    plt.xlabel("Epoch #")

    # Show
    plt.show()
def sel_feat_cols(data: pd.DataFrame, feats: list):
    """
    Select the given feature column(s) from `data`.

    Args:
        data (pd.DataFrame): Source table.
        feats (list): Column name(s) to keep; a bare string is accepted.

    Returns:
        The selected columns (a Series for a string key, a DataFrame for a
        list of keys — plain pandas indexing semantics).
    """
    # Imports
    from src.utils import assertions as a

    # Assertions
    assert a.all_dataframe(data)
    assert isinstance(feats, (str, list))
    assert a.all_str(feats)
    assert a.all_in(feats, data.columns)

    # Do work
    return data[feats]
def rem_features(data: pd.DataFrame, feats: list):
    """
    Drop the given feature column(s) from `data` and return the result.

    Args:
        data (pd.DataFrame): Source table.
        feats (list): Column name(s) to remove; a bare string is accepted.

    Returns:
        pd.DataFrame: A new DataFrame without the given columns (the caller's
        frame is not mutated; `drop` returns a copy).
    """
    # Imports
    from src.utils import assertions as a

    # Assertions
    assert a.all_dataframe(data)
    # Fixed: was `assert a.all_str(feats, (str, list))` — a copy-paste error;
    # the container-type check should mirror sel_feat_cols.
    assert isinstance(feats, (str, list))
    assert a.all_str(feats)
    assert a.all_in(feats, data.columns)

    # Do work
    data = data.drop(columns=feats)

    # Return
    return data
def make_le(feat: pd.DataFrame, cols: list, transform: bool = True):
    """
    Fit (and optionally apply) a LabelEncoder to the given column(s).

    LabelEncoder only accepts 1-D input, so columns are encoded one at a
    time. NOTE: when more than one column is given, the returned encoder
    holds the fitted classes of the LAST column only.

    Args:
        feat (pd.DataFrame): Feature table to encode.
        cols (list): Column name(s) to label-encode; a bare string is
            accepted.
        transform (bool, optional): If True, also overwrite the columns in
            `feat` with the encoded values. Defaults to True.

    Returns:
        tuple: (feat, le) — the (possibly updated) DataFrame and the
        (last-)fitted LabelEncoder.
    """
    # Imports
    from src.utils import assertions as a
    from sklearn.preprocessing import LabelEncoder

    # Assertions
    assert a.all_dataframe(feat)
    # Fixed: coerce a bare string BEFORE the element-wise checks, matching
    # make_si; the original asserted all_str on the raw string first.
    if isinstance(cols, str):
        cols = [cols]
    assert a.all_str(cols)
    assert a.all_in(cols, feat.columns)
    assert a.all_bool(transform)

    # Instantiations
    le = LabelEncoder()

    # Do work
    # Fixed: LabelEncoder.fit/fit_transform require a 1-D array; the original
    # passed the 2-D slice feat[cols], which warns for one column and raises
    # for several. Encode each column individually instead.
    for col in cols:
        if transform:
            feat[col] = le.fit_transform(feat[col])
        else:
            le.fit(feat[col])

    # Return
    return feat, le
def train_overall_network(
    feat_trn: np.real,
    targ_trn: np.real,
    feat_val: np.real,
    targ_val: np.real,
    hidden_shapes: list = None,
    hidden_acti: str = "relu",
    final_shape: int = 1,
    final_acti: str = "sigmoid",
    batch_size: int = 100,
    epochs: int = 500,
    learning_rate: float = 0.001,
    device: torch.device = get_device(),
    scheduler: bool = True,
    verbosity: int = 10,
    plot_learning: bool = True,
):
    """
    Train a feed-forward network, validating after every epoch.

    Args:
        feat_trn, targ_trn: Training features and targets.
        feat_val, targ_val: Validation features and targets.
        hidden_shapes (list, optional): Hidden layer sizes. Defaults to
            [20, 20, 20].
        hidden_acti (str): Activation for hidden layers.
        final_shape (int): Size of the output layer.
        final_acti (str): Activation for the output layer.
        batch_size (int): Mini-batch size.
        epochs (int): Number of training epochs.
        learning_rate (float): Adam learning rate.
        device (torch.device): Device to train on. NOTE: the default is
            evaluated once at import time via get_device().
        scheduler (bool): If True, attach a ReduceLROnPlateau scheduler that
            monitors the validation loss.
        verbosity (int or None): Plot/report every `verbosity` epochs;
            falsy disables reporting.
        plot_learning (bool): If True, live-plot metrics via
            plot_network_training.

    Returns:
        The trained model (modl).
    """
    # Imports
    import numpy as np
    from src.utils import assertions as a
    from src.models.pytorch import PyTorchDataset
    from torch import nn, optim
    from src.models.pytorch import Net

    # Fixed: avoid a shared mutable default argument; None is the sentinel
    if hidden_shapes is None:
        hidden_shapes = [20, 20, 20]

    # Assertions
    assert a.all_real([feat_trn, targ_trn, feat_val, targ_val])
    assert isinstance(hidden_shapes, list)
    assert len(hidden_shapes) > 0, "Must have at least 1 hidden layer"
    # Fixed: was `a.all_in(hidden_shapes)`, which is missing its container
    # argument; the intent appears to be that every layer size is an int.
    assert a.all_int(hidden_shapes)
    assert a.all_scalar([hidden_acti, final_shape, final_acti, batch_size, epochs, learning_rate])
    assert isinstance(verbosity, (int, type(None)))
    assert a.all_int([batch_size, epochs, verbosity])
    assert a.all_str([hidden_acti, final_acti])
    assert a.all_float(learning_rate)

    # Initialise data generators
    data_trn = PyTorchDataset(feat_trn, targ_trn)
    data_val = PyTorchDataset(feat_val, targ_val)

    # Initialise classes
    modl = Net(feat_trn.shape[1], len(set(targ_trn)))
    crit = nn.CrossEntropyLoss()
    optm = optim.Adam(modl.parameters(), lr=learning_rate)
    if scheduler:
        # NOTE: the boolean flag is rebound to the scheduler object so the
        # same name is forwarded to model_train and stepped below.
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optm, mode="min", patience=3)

    # Push network to device
    modl.to(device)

    # Set dumping ground
    costs = {"epoch": [], "loss_trn": [], "accu_trn": [], "loss_val": [], "accu_val": []}
    loss_trn = 0.0
    accu_trn = 0.0

    # Loop over epochs
    for epoch in range(epochs):
        loss_trn, accu_trn = model_train(
            data_trn=data_trn,
            modl=modl,
            crit=crit,
            optm=optm,
            batch_size=batch_size,
            hidden_shapes=hidden_shapes,
            hidden_acti=hidden_acti,
            final_shape=final_shape,
            final_acti=final_acti,
            device=device,
            scheduler=scheduler,
        )
        loss_val, accu_val = model_validate(
            data_val=data_val,
            modl=modl,
            crit=crit,
            batch_size=batch_size,
            hidden_shapes=hidden_shapes,
            hidden_acti=hidden_acti,
            final_shape=final_shape,
            final_acti=final_acti,
            device=device,
        )

        # Record progress
        costs["epoch"].append(epoch + 1)
        costs["loss_trn"].append(loss_trn)
        costs["accu_trn"].append(accu_trn)
        costs["loss_val"].append(loss_val)
        costs["accu_val"].append(accu_val)

        # Adjust scheduler
        if scheduler:
            # Fixed: ReduceLROnPlateau.step() requires the monitored metric;
            # calling it with no argument raises a TypeError.
            scheduler.step(loss_val)

        # Print stats
        if verbosity:
            if epoch % verbosity == 0 or epoch + 1 == epochs:
                # Plot learning
                if plot_learning:
                    plot_network_training(costs)

    # Return
    return modl