def prepare_data \
    ( brewery_name:list=["Epic Ales"]
    , review_aroma:list=[1]
    , review_appearance:list=[1]
    , review_palate:list=[1]
    , review_taste:list=[1]
    , si_path:str="./models/encoders/si_handle_nan_brewery_name.joblib"
    , oe_path:str="./models/encoders/oe_numericify_brewery_name.joblib"
    , sc_path:str="./models/encoders/sc_scale_features.joblib"
    ):

    # Imports
    from src.utils import assertions as a
    from joblib import load
    import pandas as pd
    import numpy as np

    # Assertions
    assert a.all_list([brewery_name, review_aroma, review_appearance, review_palate, review_taste])
    assert a.all_str(brewery_name)
    assert all([a.all_float_or_int(param) for param in [review_aroma, review_appearance, review_palate, review_taste]])
    assert a.all_str([si_path, oe_path, sc_path])
    assert a.all_valid_path([si_path, oe_path, sc_path])

    # Load the fitted preprocessing artefacts
    si = load(si_path)
    oe = load(oe_path)
    sc = load(sc_path)

    # Transform brewery_name: impute missing values, then ordinal-encode
    brewery_name = np.array(brewery_name, dtype="object").reshape(-1, 1)
    brewery_name = si.transform(brewery_name)
    brewery_name = oe.transform(brewery_name)
    brewery_name = brewery_name.flatten()

    # Make pd.DataFrame
    data = pd.DataFrame \
        ( { "brewery_name": brewery_name
          , "review_aroma": review_aroma
          , "review_appearance": review_appearance
          , "review_palate": review_palate
          , "review_taste": review_taste
          }
        )

    # Scale features
    data = sc.transform(data[["brewery_name", "review_aroma", "review_appearance", "review_palate", "review_taste"]])

    # Return
    return data
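# A hedged usage sketch for prepare_data(): the review values below are made
# up, and the call assumes the fitted imputer/encoder/scaler artefacts already
# exist at the default paths under ./models/encoders/.
if __name__ == "__main__":
    row = prepare_data \
        ( brewery_name=["Epic Ales"]
        , review_aroma=[3.5]
        , review_appearance=[4.0]
        , review_palate=[3.0]
        , review_taste=[4.5]
        )
    print(row.shape)  # one scaled observation with five features: (1, 5)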
def decode_predictions(data:torch.Tensor, decoder_path:str="./models/encoders/le_numericify_beer_style.joblib"):

    # Imports
    from src.utils import assertions as a
    import torch
    import numpy as np
    from joblib import load

    # Assertions
    assert isinstance(data, torch.Tensor)
    assert a.all_str(decoder_path)

    # Make numpy (detach first, in case the tensor still tracks gradients)
    nump = data.detach().numpy()

    # Get index of predicted value
    nump = np.argmax(nump, axis=1)

    # Load decoder
    decoder = load(decoder_path)

    # Get label (LabelEncoder expects a 1D array, so no reshape is needed)
    labl = decoder.inverse_transform(nump)

    # Return
    return labl
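# A minimal sketch of decode_predictions(): the tensor of class scores below is
# fabricated, and the call assumes the fitted LabelEncoder artefact exists at
# the default path. argmax picks column 2, which is decoded back to its label.
if __name__ == "__main__":
    import torch
    scores = torch.tensor([[0.1, 0.2, 0.7]])  # one prediction over three classes
    print(decode_predictions(scores))  # e.g. array(['American IPA'], dtype=object)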
def scale_features(feat: pd.DataFrame, cols: list = None, transform: bool = True):

    # Imports
    from src.utils import assertions as a
    from sklearn.preprocessing import StandardScaler

    # Assertions
    assert a.all_dataframe(feat)
    if cols:
        if isinstance(cols, str):
            cols = [cols]
        assert a.all_str(cols)
        assert a.all_in(cols, feat.columns)
    assert a.all_bool(transform)

    # Default to scaling every column
    if not cols:
        cols = feat.columns

    # Instantiations
    sc = StandardScaler()

    # Do work
    if transform:
        feat[cols] = sc.fit_transform(feat[cols])
    else:
        sc.fit(feat[cols])

    # Return
    return feat, sc
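# A small toy sketch of scale_features(). With transform=True the columns are
# standardised in place; with transform=False only the fitted StandardScaler is
# returned, for applying to other data later.
if __name__ == "__main__":
    import pandas as pd
    toy = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [10.0, 20.0, 30.0]})
    toy, sc = scale_features(toy, cols=["x", "y"])
    print(toy["x"].mean().round(6))  # a standardised column has (near) zero mean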
def make_si(feat: pd.DataFrame, cols: list, transform: bool = True):

    # Imports
    from src.utils import assertions as a
    from sklearn.impute import SimpleImputer
    import numpy as np

    # Assertions
    assert a.all_dataframe(feat)
    if isinstance(cols, str):
        cols = [cols]
    assert a.all_str(cols)
    assert a.all_in(cols, feat.columns)
    assert a.all_bool(transform)

    # Instantiations: missing values are replaced with the constant "Other"
    si = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="Other")

    # Do work
    if transform:
        feat[cols] = si.fit_transform(feat[cols])
    else:
        si.fit(feat[cols])

    # Return
    return feat, si
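# A toy sketch of make_si(): NaN entries in the chosen column are replaced with
# the constant "Other", and the fitted SimpleImputer is returned alongside.
if __name__ == "__main__":
    import numpy as np
    import pandas as pd
    toy = pd.DataFrame({"brewery_name": ["Epic Ales", np.nan]})
    toy, si = make_si(toy, cols=["brewery_name"])
    print(toy["brewery_name"].tolist())  # ['Epic Ales', 'Other']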
def make_ohe(feat: pd.DataFrame, cols: list, transform: bool = True):

    # Imports
    from src.utils import assertions as a
    from sklearn.preprocessing import OneHotEncoder
    import pandas as pd

    # Assertions
    assert a.all_dataframe(feat)
    if isinstance(cols, str):
        cols = [cols]
    assert a.all_str(cols)
    assert a.all_in(cols, feat.columns)
    assert a.all_bool(transform)

    # Instantiations
    # NOTE: sparse= and get_feature_names() are the pre-1.0 scikit-learn API
    ohe = OneHotEncoder(sparse=False)

    # Do work
    data = feat[cols]
    if transform:
        data = ohe.fit_transform(data)
        data = pd.DataFrame(data)
        data.columns = ohe.get_feature_names(cols)
        feat.drop(cols, axis=1, inplace=True)
        feat = pd.concat([feat, data], axis=1)
    else:
        ohe.fit(data)

    # Return
    return feat, ohe
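# A toy sketch of make_ohe(): the categorical column is expanded into one
# indicator column per category, and the original column is dropped.
if __name__ == "__main__":
    import pandas as pd
    toy = pd.DataFrame({"colour": ["red", "blue", "red"]})
    toy, ohe = make_ohe(toy, cols=["colour"])
    print(list(toy.columns))  # ['colour_blue', 'colour_red']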
def get_unzip_data(source_file: str, target_dir: str, delete_source: bool = True):

    # Imports
    from src.utils import assertions as a
    import os
    from zipfile import ZipFile

    # Assertions
    assert a.all_str([source_file, target_dir])
    assert a.all_bool([delete_source])
    assert all([os.path.exists(param) for param in [source_file, target_dir]])

    # Do work
    try:
        with ZipFile(source_file, "r") as z:
            z.extractall(target_dir)
    except Exception as err:
        raise NotImplementedError("Could not extract files from Zip folder.") from err

    # Delete source
    if delete_source:
        os.remove(source_file)

    # Return
    return True
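# A hedged sketch of get_unzip_data(): the archive and directory names below
# are hypothetical. With delete_source=True the zip file is removed once its
# contents have been extracted.
if __name__ == "__main__":
    get_unzip_data \
        ( source_file="./data/external/raw_data.zip"
        , target_dir="./data/raw"
        , delete_source=False
        )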
def pop_target(data:pd.DataFrame, targ:str):
    """
    Pop the target column off the data set.

    Args:
        data (pd.DataFrame): The data set, from which the target will be removed.
        targ (str): The name of the target column to be removed. Must be a valid and existing column in `data`.

    Returns:
        feat (pd.DataFrame): The updated `data` object, having had the `targ` column removed.
        targ (pd.DataFrame): The target column that has been removed from the `data` object.
    """

    # Imports
    from src.utils import assertions as a

    # Assertions
    assert a.all_dataframe(data)
    assert a.all_str(targ)
    assert targ in data.columns

    # Do work
    feat = data.drop([targ], axis=1)
    targ = data[[targ]]

    # Return
    return feat, targ
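# A toy sketch of pop_target(): the target column is split off, leaving the
# remaining features and a single-column target frame.
if __name__ == "__main__":
    import pandas as pd
    toy = pd.DataFrame({"x": [1, 2], "beer_style": ["IPA", "Stout"]})
    feat, targ = pop_target(toy, "beer_style")
    print(list(feat.columns), list(targ.columns))  # ['x'] ['beer_style']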
def rem_features(data:pd.DataFrame, feats:list):

    # Imports
    from src.utils import assertions as a

    # Assertions
    assert a.all_dataframe(data)
    assert isinstance(feats, (str, list))
    assert a.all_str(feats)
    assert a.all_in(feats, data.columns)

    # Do work
    data = data.drop(columns=feats)

    # Return
    return data
def get_file_data(url: str, save_path: str = "./data/external", save_name: str = "raw_data.tmp", chunk_size: int = 128):
    """
    Import the data from a given URL, and save to a directory chunk by chunk.

    Inspiration for this function came from:
    https://stackoverflow.com/questions/9419162/download-returned-zip-file-from-url#answer-9419208

    Args:
        url (str): The URL from which the data will be downloaded.
        save_path (str, optional): The directory to which the data will be saved. Defaults to "./data/external".
        save_name (str, optional): The name of the file that will be saved. Defaults to "raw_data.tmp".
        chunk_size (int, optional): The chunk size of the data to be downloaded. Defaults to 128.

    Raises:
        ImportError: If there is an error with calling the API at any stage.
        AssertionError: If any of the parameters are not the correct type or attribute.

    Returns:
        bool: Returns `True` if the Import+Export was successful.
    """

    # Imports
    from src.utils import assertions as a
    import os
    import requests
    from src.utils.misc import valid_url

    # Assertions
    assert a.all_str([url, save_path, save_name])
    assert a.all_int([chunk_size])
    assert valid_url(url)
    assert os.path.exists(save_path)

    # Get data (cast each exception to str before concatenating)
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()
    except requests.exceptions.HTTPError as http_err:
        raise ImportError("Http Error: " + str(http_err))
    except requests.exceptions.ConnectionError as conn_err:
        raise ImportError("Connection Error: " + str(conn_err))
    except requests.exceptions.Timeout as time_err:
        raise ImportError("Timeout Error: " + str(time_err))
    except requests.exceptions.RequestException as excp_err:
        raise ImportError("Other Exception Error: " + str(excp_err))
    except Exception as err:
        raise ImportError("Unknown error occurred: " + str(err))

    # Set file path
    target = os.path.join(save_path, save_name)

    # Save data
    with open(target, 'wb') as fd:
        for chunk in response.iter_content(chunk_size=chunk_size):
            fd.write(chunk)

    return True
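# A hedged sketch of get_file_data(): the URL and file name are placeholders,
# and the call assumes ./data/external exists. The download is streamed to disk
# in 128-byte chunks by default.
if __name__ == "__main__":
    get_file_data \
        ( url="https://example.com/data.zip"
        , save_path="./data/external"
        , save_name="raw_data.zip"
        )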
def predict_single \
    ( brewery_name:str="Epic Ales"
    , review_aroma:float=1
    , review_appearance:float=1
    , review_palate:float=1
    , review_taste:float=1
    , modl_path:str="./models/predictors/beer_prediction.pth"
    ):

    # Imports
    from src.utils import assertions as a
    from src.models.predict import prepare_data, predict_classification, decode_predictions
    from src.models.pytorch import Modl, get_device
    import torch

    # Assertions
    assert a.all_str(brewery_name)
    assert all([a.all_float_or_int(param) for param in [review_aroma, review_appearance, review_palate, review_taste]])
    assert a.all_str(modl_path)
    assert a.all_valid_path(modl_path)

    # Loads
    modl = Modl()
    modl.load(model_path=modl_path)
    # modl.load_state_dict(torch.load(modl_path, map_location=get_device()))

    # Prepare data (wrap each scalar in a list, as prepare_data expects lists)
    data = prepare_data \
        ( brewery_name=[brewery_name]
        , review_aroma=[review_aroma]
        , review_appearance=[review_appearance]
        , review_palate=[review_palate]
        , review_taste=[review_taste]
        )

    # Predict data
    data = predict_classification(data, modl)

    # Decode data
    data = decode_predictions(data)

    # Flatten
    data = data.flatten()

    return data
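# A hedged end-to-end sketch of predict_single(): the inputs are made up, and
# the call assumes the trained model and encoder artefacts exist at their
# default paths. The return value is a flat array holding the decoded label.
if __name__ == "__main__":
    pred = predict_single \
        ( brewery_name="Epic Ales"
        , review_aroma=3.5
        , review_appearance=4.0
        , review_palate=3.0
        , review_taste=4.5
        )
    print(pred[0])  # the predicted beer style, e.g. 'American IPA'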
def encode_features(feat: pd.DataFrame, cols: list, type: str = "ordinal", transform: bool = True):

    # Imports
    from src.utils import assertions as a
    from src.data.prep_data import make_oe, make_ohe, make_le

    # Assertions
    assert a.all_dataframe(feat)
    assert isinstance(cols, (str, list))
    assert a.all_str(cols)
    assert a.all_str(type)

    # Do work & return
    if type in ["oe", "ord", "ordinal", "ordinalencoder", "ordinal encoder"]:
        return make_oe(feat=feat, cols=cols, transform=transform)
    elif type in ["ohe", "one", "onehotencoder", "one hot encoder"]:
        return make_ohe(feat=feat, cols=cols, transform=transform)
    elif type in ["le", "label", "label encoder", "labelencoder"]:
        return make_le(feat=feat, cols=cols, transform=transform)
    else:
        return feat, None
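# A toy sketch of encode_features(): the `type` string routes to the matching
# encoder, so "ordinal" dispatches to make_oe() here. An unrecognised type
# returns the frame untouched with a None encoder.
if __name__ == "__main__":
    import pandas as pd
    toy = pd.DataFrame({"colour": ["red", "blue", "red"]})
    toy, enc = encode_features(toy, cols=["colour"], type="ordinal")
    print(toy["colour"].tolist())  # e.g. [1.0, 0.0, 1.0]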
def read(path:str):

    # Imports
    from src.utils import assertions as a

    # Assertions
    assert a.all_str(path)
    assert a.all_valid_path(path)

    # Read
    with open(path, "rt") as f:
        data = f.read()

    # Return
    return data
def sel_feat_cols(data:pd.DataFrame, feats:list):

    # Imports
    from src.utils import assertions as a

    # Assertions
    assert a.all_dataframe(data)
    assert isinstance(feats, (str, list))
    if isinstance(feats, str):
        feats = [feats]
    assert a.all_str(feats)
    assert a.all_in(feats, data.columns)

    # Do work (a list selection keeps the result a DataFrame)
    data = data[feats]

    return data
def dump_data(data: any, path: str = "./data/raw", name: str = "None", suffix: str = "joblib"):
    """
    Dump the data from memory to a file.

    Args:
        data (any): The data to be dumped. Can be any type.
        path (str, optional): The directory where the data should be dumped to. Defaults to "./data/raw".
        name (str, optional): The name of the file that will be saved. Defaults to "None".
        suffix (str, optional): The suffix of the file that will be saved. Defaults to "joblib".

    Raises:
        NotImplementedError: If the dumping fails for any reason.

    Returns:
        None: If successful, then nothing should be returned.
    """

    # Imports
    from src.utils import assertions as a
    import os
    from joblib import dump

    # Assertions
    assert a.all_str([path, name, suffix])
    assert os.path.exists(path)

    # Join output name
    output = os.path.join(path, name) + "." + suffix

    # Do the dumping
    try:
        dump(data, output)
    except Exception as err:
        raise NotImplementedError("Could not dump the data") from err

    return None
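# A hedged sketch of dump_data(): the object and file name are arbitrary; the
# call writes ./data/raw/my_object.joblib and returns None on success.
if __name__ == "__main__":
    dump_data(data={"a": 1}, path="./data/raw", name="my_object", suffix="joblib")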
def make_le(feat: pd.DataFrame, cols: list, transform: bool = True):

    # Imports
    from src.utils import assertions as a
    from sklearn.preprocessing import LabelEncoder

    # Assertions
    assert a.all_dataframe(feat)
    if isinstance(cols, str):
        cols = [cols]
    assert a.all_str(cols)
    assert a.all_in(cols, feat.columns)
    assert a.all_bool(transform)
    assert len(cols) == 1, "LabelEncoder encodes a single column at a time"

    # Instantiations
    le = LabelEncoder()

    # Do work (LabelEncoder expects a 1D array, so index the single column)
    if transform:
        feat[cols[0]] = le.fit_transform(feat[cols[0]])
    else:
        le.fit(feat[cols[0]])

    # Return
    return feat, le
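# A toy sketch of make_le(): the single target-like column is mapped to integer
# codes, and the fitted LabelEncoder is returned for a later inverse_transform.
if __name__ == "__main__":
    import pandas as pd
    toy = pd.DataFrame({"beer_style": ["IPA", "Stout", "IPA"]})
    toy, le = make_le(toy, cols=["beer_style"])
    print(toy["beer_style"].tolist())  # [0, 1, 0]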
def train_overall_network \
    ( feat_trn:np.real
    , targ_trn:np.real
    , feat_val:np.real
    , targ_val:np.real
    , hidden_shapes:list=[20,20,20]
    , hidden_acti:str="relu"
    , final_shape:int=1
    , final_acti:str="sigmoid"
    , batch_size:int=100
    , epochs:int=500
    , learning_rate:float=0.001
    , device:torch.device=get_device()
    , scheduler:bool=True
    , verbosity:int=10
    , plot_learning:bool=True
    ):

    # Imports
    import numpy as np
    import torch
    from torch import nn, optim
    from src.utils import assertions as a
    from src.models.pytorch import PyTorchDataset, Net

    # Assertions
    assert a.all_real([feat_trn, targ_trn, feat_val, targ_val])
    assert isinstance(hidden_shapes, list)
    assert len(hidden_shapes)>0, "Must have at least 1 hidden layer"
    assert a.all_int(hidden_shapes)
    assert a.all_scalar([hidden_acti, final_shape, final_acti, batch_size, epochs, learning_rate])
    assert isinstance(verbosity, (int, type(None)))
    assert a.all_int([batch_size, epochs])
    assert a.all_str([hidden_acti, final_acti])
    assert a.all_float(learning_rate)

    # Initialise data generators
    data_trn = PyTorchDataset(feat_trn, targ_trn)
    data_val = PyTorchDataset(feat_val, targ_val)

    # Initialise classes
    modl = Net(feat_trn.shape[1], len(set(targ_trn)))
    crit = nn.CrossEntropyLoss()
    optm = optim.Adam(modl.parameters(), lr=learning_rate)
    if scheduler:
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optm, mode="min", patience=3)

    # Push network to device
    modl.to(device)

    # Set dumping ground
    costs = {"epoch": [], "loss_trn": [], "accu_trn": [], "loss_val": [], "accu_val": []}
    loss_trn = 0.0
    accu_trn = 0.0

    # Loop over epochs
    for epoch in range(epochs):

        # Train
        loss_trn, accu_trn = model_train \
            ( data_trn=data_trn
            , modl=modl
            , crit=crit
            , optm=optm
            , batch_size=batch_size
            , hidden_shapes=hidden_shapes
            , hidden_acti=hidden_acti
            , final_shape=final_shape
            , final_acti=final_acti
            , device=device
            , scheduler=scheduler
            )

        # Validate
        loss_val, accu_val = model_validate \
            ( data_val=data_val
            , modl=modl
            , crit=crit
            , batch_size=batch_size
            , hidden_shapes=hidden_shapes
            , hidden_acti=hidden_acti
            , final_shape=final_shape
            , final_acti=final_acti
            , device=device
            )

        # Record progress
        costs["epoch"].append(epoch+1)
        costs["loss_trn"].append(loss_trn)
        costs["accu_trn"].append(accu_trn)
        costs["loss_val"].append(loss_val)
        costs["accu_val"].append(accu_val)

        # Adjust scheduler (ReduceLROnPlateau steps on the validation loss)
        if scheduler:
            scheduler.step(loss_val)

        # Print stats
        if verbosity:
            if epoch % verbosity == 0 or epoch+1==epochs:

                # Plot learning
                if plot_learning:
                    plot_network_training(costs)

                # Print metrics
                # print("Epoch: {}/{}\tLoss: {:.5f}".format(costs["epoch"][-1], epochs, costs["loss_trn"][-1]))

    # Return
    return modl
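# A hedged sketch of train_overall_network(): the arrays below are random
# stand-ins for the real feature/target splits, sized only to show the expected
# shapes, and the run relies on model_train(), model_validate() and
# plot_network_training() from this module. A real run would use the prepared,
# scaled data from prepare_data().
if __name__ == "__main__":
    import numpy as np
    feat_trn = np.random.rand(100, 5)
    targ_trn = np.random.randint(0, 3, size=100)
    feat_val = np.random.rand(20, 5)
    targ_val = np.random.randint(0, 3, size=20)
    modl = train_overall_network \
        ( feat_trn=feat_trn
        , targ_trn=targ_trn
        , feat_val=feat_val
        , targ_val=targ_val
        , epochs=10
        , verbosity=5
        , plot_learning=False
        )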