Code Example #1
def prepare_data(
    brewery_name: list = ["Epic Ales"],
    review_aroma: list = [1],
    review_appearance: list = [1],
    review_palate: list = [1],
    review_taste: list = [1],
    si_path: str = "./models/encoders/si_handle_nan_brewery_name.joblib",
    oe_path: str = "./models/encoders/oe_numericify_brewery_name.joblib",
    sc_path: str = "./models/encoders/sc_scale_features.joblib",
):
    
    # Imports
    from src.utils import assertions as a
    from joblib import load
    import pandas as pd
    import numpy as np
    
    # Assertions
    assert a.all_list([brewery_name, review_aroma, review_appearance, review_palate, review_taste])
    assert a.all_str(brewery_name)
    assert all([a.all_float_or_int(param) for param in [review_aroma, review_appearance, review_palate, review_taste]])
    assert a.all_str([si_path, oe_path, sc_path])
    assert a.all_valid_path([si_path, oe_path, sc_path])
    
    # Loads
    si = load(si_path)
    oe = load(oe_path)
    sc = load(sc_path)
    
    # Transform brewery_name
    brewery_name = np.array(brewery_name, dtype="object").reshape(-1,1)
    brewery_name = si.transform(brewery_name)
    brewery_name = oe.transform(brewery_name)
    brewery_name = brewery_name.flatten()
    
    # Make pd.DataFrame
    data = pd.DataFrame({
        "brewery_name": brewery_name,
        "review_aroma": review_aroma,
        "review_appearance": review_appearance,
        "review_palate": review_palate,
        "review_taste": review_taste,
    })
    
    # Scale features
    data = sc.transform(data[["brewery_name", "review_aroma", "review_appearance", "review_palate", "review_taste"]])
    
    # Return
    return data
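A minimal usage sketch for prepare_data, assuming the default encoder dumps under ./models/encoders/ exist; the five inputs are parallel lists with one entry per beer:

# Hypothetical inputs; the joblib files at the default paths must exist.
features = prepare_data(
    brewery_name=["Epic Ales", "Epic Ales"],
    review_aroma=[4.0, 2.5],
    review_appearance=[3.5, 2.0],
    review_palate=[4.0, 1.5],
    review_taste=[4.5, 2.0],
)
# `features` is a scaled 2-D numpy array, ready for the model.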
Code Example #2
import torch


def decode_predictions(data: torch.Tensor, decoder_path: str = "./models/encoders/le_numericify_beer_style.joblib"):
    
    # Imports
    from src.utils import assertions as a
    import torch
    import numpy as np
    from joblib import load
    
    # Assertions
    assert isinstance(data, torch.Tensor)
    assert a.all_str(decoder_path)
    
    # Make numpy (detach first in case the tensor still tracks gradients)
    nump = data.detach().numpy()
    
    # Get index of predicted value
    nump = np.argmax(nump, axis=1)
    
    # Reshape to 2D array
    nump = nump.reshape(-1,1)
    
    # Load decoder
    decoder = load(decoder_path)
    
    # Get label
    labl = decoder.inverse_transform(nump)
    
    # Return
    return labl
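A minimal sketch of decode_predictions, assuming the default LabelEncoder dump exists and the model emits one logit column per known beer style (the values below are made up):

import torch

logits = torch.tensor([[0.1, 2.3, 0.4, 0.2],
                       [1.9, 0.2, 0.1, 0.3]])
labels = decode_predictions(logits)  # argmax per row, mapped back to style names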
Code Example #3
import pandas as pd


def scale_features(feat: pd.DataFrame,
                   cols: list = None,
                   transform: bool = True):

    # Imports
    from src.utils import assertions as a
    from sklearn.preprocessing import StandardScaler

    # Assertions
    assert a.all_dataframe(feat)
    if cols:
        if isinstance(cols, str): cols = [cols]
        assert a.all_str(cols)
        assert a.all_in(cols, feat.columns)
    assert a.all_bool(transform)

    # Get cols
    if not cols:
        cols = feat.columns

    # Instantiations
    sc = StandardScaler()

    # Do work
    if transform:
        feat[cols] = sc.fit_transform(feat[cols])
    else:
        sc.fit(feat[cols])

    # Return
    return feat, sc
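A short sketch of scale_features on a made-up frame (the assertion helpers in src.utils must be importable):

import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 30.0]})
df, sc = scale_features(df, cols=["a", "b"])  # columns replaced with scaled values
# transform=False would fit `sc` without overwriting the columns.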
Code Example #4
import pandas as pd


def make_si(feat: pd.DataFrame, cols: list, transform: bool = True):

    # Imports
    from src.utils import assertions as a
    from sklearn.impute import SimpleImputer
    import numpy as np

    # Assertions
    assert a.all_dataframe(feat)
    if isinstance(cols, str): cols = [cols]
    assert a.all_str(cols)
    assert a.all_in(cols, feat.columns)
    assert a.all_bool(transform)

    # Instantiations
    si = SimpleImputer(missing_values=np.nan,
                       strategy="constant",
                       fill_value="Other")

    # Do work
    if transform:
        feat[cols] = si.fit_transform(feat[cols])
    else:
        si.fit(feat[cols])

    # Return
    return feat, si
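A short sketch of make_si on a made-up frame; missing brewery names are imputed with the constant "Other":

import numpy as np
import pandas as pd

df = pd.DataFrame({"brewery_name": ["Epic Ales", np.nan, "Other Brew"]})
df, si = make_si(df, cols="brewery_name")  # a bare string is wrapped into a list
# The NaN entry is now the string "Other".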
Code Example #5
import pandas as pd


def make_ohe(feat: pd.DataFrame, cols: list, transform: bool = True):

    # Imports
    from src.utils import assertions as a
    from sklearn.preprocessing import OneHotEncoder

    # Assertions
    assert a.all_dataframe(feat)
    assert a.all_str(cols)
    if isinstance(cols, str): cols = [cols]
    assert a.all_in(cols, feat.columns)
    assert a.all_bool(transform)

    # Instantiations
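    # NOTE: written against an older scikit-learn; releases >= 1.2 rename
    # `sparse` to `sparse_output` and `get_feature_names` to
    # `get_feature_names_out`.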
    ohe = OneHotEncoder(sparse=False)

    # Do work
    data = feat[cols]
    if transform:
        data = ohe.fit_transform(data)
        data = pd.DataFrame(data)
        data.columns = ohe.get_feature_names(cols)
        feat.drop(cols, axis=1, inplace=True)
        feat = pd.concat([feat, data], axis=1)
    else:
        ohe.fit(data)

    # Return
    return feat, ohe
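A sketch of make_ohe on a made-up frame; the encoded columns replace the original one, with names generated by the encoder:

import pandas as pd

df = pd.DataFrame({"colour": ["red", "blue", "red"], "score": [1, 2, 3]})
df, ohe = make_ohe(df, cols=["colour"])
# df now holds `score` plus `colour_blue` / `colour_red` indicator columns.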
Code Example #6
def get_unzip_data(source_file: str,
                   target_dir: str,
                   delete_source: bool = True):

    # Imports
    from src.utils import assertions as a
    import os
    from zipfile import ZipFile

    # Assertions
    assert a.all_str([source_file, target_dir])
    assert a.all_bool([delete_source])
    assert all([os.path.exists(param) for param in [source_file, target_dir]])

    # Do work
    try:
        with ZipFile(source_file, "r") as z:
            z.extractall(target_dir)
    except Exception as err:
        raise NotImplementedError("Could not extract files from Zip folder.") from err

    # Delete source
    if delete_source:
        os.remove(source_file)

    # Return
    return True
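A sketch of get_unzip_data with hypothetical paths; both the archive and the target directory must already exist:

get_unzip_data(
    source_file="./data/external/raw_data.zip",  # hypothetical archive
    target_dir="./data/raw",
    delete_source=False,  # keep the archive after extraction
)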
Code Example #7
import pandas as pd


def pop_target(data: pd.DataFrame, targ: str):
    """
    Pop the target column off the data set.

    Args:
        data (pd.DataFrame): The data set, from which the target will be removed.
        targ (str): The name of the feature to be removed. Must be a valid and existing column in `data`.

    Returns:
        feat (pd.DataFrame): The updated `data` object, with the `targ` column removed.
        targ (pd.DataFrame): The single-column frame holding the target that was removed from `data`.
    """
    
    # Imports
    from src.utils import assertions as a
    
    # Assertions
    assert a.all_dataframe(data)
    assert a.all_str(targ)
    assert targ in data.columns
    
    # Do work
    # targ = data.pop(targ)
    feat = data.drop([targ], axis=1)
    targ = data[[targ]]
    
    # Return
    return feat, targ
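A short sketch of pop_target on a made-up frame; note the target comes back as a one-column DataFrame rather than a Series:

import pandas as pd

df = pd.DataFrame({"x1": [1, 2], "x2": [3, 4], "y": [0, 1]})
feat, targ = pop_target(df, "y")
# feat has columns ["x1", "x2"]; targ is the one-column frame df[["y"]].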
Code Example #8
import pandas as pd


def rem_features(data: pd.DataFrame, feats: list):
    
    # Imports
    from src.utils import assertions as a
    
    # Assertions
    assert a.all_dataframe(data)
    assert isinstance(feats, (str, list))
    assert a.all_str(feats)
    assert a.all_in(feats, data.columns)
    
    # Do work
    data = data.drop(columns=feats)
    
    # Return
    return data
Code Example #9
def get_file_data(url: str,
                  save_path: str = "./data/external",
                  save_name: str = "raw_data.tmp",
                  chunk_size: int = 128):
    """
    Import the data from a given URL, and save to a directory chunk by chunk.
    Inspiration for this function came from: https://stackoverflow.com/questions/9419162/download-returned-zip-file-from-url#answer-9419208

    Args:
        url (str): The URL from which the data will be downloaded.
        save_path (str, optional): The directory to which the data will be saved. Defaults to "./data/external".
        save_name (str, optional): The name of the file that will be saved. Defaults to "raw_data.tmp".
        chunk_size (int, optional): The chunk size of the data to be downloaded. Defaults to 128.

    Raises:
        ImportError: If there is an error with calling the API at any stage.
        AssertionError: If any of the parameters has the wrong type or an invalid value.

    Returns:
        bool: Returns `True` if the Import+Export was successful
    """

    # Imports
    from src.utils import assertions as a
    import os
    import requests
    from src.utils.misc import valid_url

    # Assertions
    assert a.all_str([url, save_path, save_name])
    assert a.all_int([chunk_size])
    assert valid_url(url)
    assert os.path.exists(save_path)

    # Get data
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()
    except requests.exceptions.HTTPError as http_err:
        raise ImportError("HTTP Error: " + str(http_err))
    except requests.exceptions.ConnectionError as conn_err:
        raise ImportError("Connection Error: " + str(conn_err))
    except requests.exceptions.Timeout as time_err:
        raise ImportError("Timeout Error: " + str(time_err))
    except requests.exceptions.RequestException as excp_err:
        raise ImportError("Other Exception Error: " + str(excp_err))
    except Exception as err:
        raise ImportError("Unknown error occurred: " + str(err))

    # Set file path
    target = os.path.join(save_path, save_name)

    # Save data
    with open(target, 'wb') as fd:
        for chunk in response.iter_content(chunk_size=chunk_size):
            fd.write(chunk)

    return True
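A sketch of get_file_data with a hypothetical URL; the response is streamed to save_path/save_name in chunks:

get_file_data(
    url="https://example.com/beer_reviews.zip",  # hypothetical URL
    save_path="./data/external",
    save_name="beer_reviews.zip",
)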
Code Example #10
def predict_single(
    brewery_name: str = "Epic Ales",
    review_aroma: float = 1,
    review_appearance: float = 1,
    review_palate: float = 1,
    review_taste: float = 1,
    modl_path: str = "./models/predictors/beer_prediction.pth",
):
    
    # Imports
    from src.utils import assertions as a
    from src.models.predict import prepare_data, predict_classification, decode_predictions
    from src.models.pytorch import Modl, get_device
    import torch
    
    # Assertions
    assert a.all_str(brewery_name)
    assert all([a.all_float_or_int(param) for param in [review_aroma, review_appearance, review_palate, review_taste]])
    assert a.all_str(modl_path)
    assert a.all_valid_path(modl_path)
    
    # Loads
    modl = Modl()
    modl.load(model_path=modl_path)
    # modl.load_state_dict(torch.load(modl_path, map_location=get_device()))
    
    # Prepare data (prepare_data expects each input as a list)
    data = prepare_data(
        brewery_name=[brewery_name],
        review_aroma=[review_aroma],
        review_appearance=[review_appearance],
        review_palate=[review_palate],
        review_taste=[review_taste],
    )
        
    # Predict data
    data = predict_classification(data, modl)
    
    # Decode data
    data = decode_predictions(data)
    
    # Flatten
    data = data.flatten()
        
    return data
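An end-to-end sketch of predict_single, assuming the trained model at the default modl_path and the encoder dumps used by prepare_data and decode_predictions are all in place:

label = predict_single(
    brewery_name="Epic Ales",
    review_aroma=4.0,
    review_appearance=3.5,
    review_palate=4.0,
    review_taste=4.5,
)
# `label` is a 1-element array holding the predicted beer style.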
Code Example #11
import pandas as pd


def encode_features(feat: pd.DataFrame,
                    cols: list,
                    type: str = "ordinal",
                    transform: bool = True):

    # Imports
    from src.utils import assertions as a
    from src.data.prep_data import make_oe, make_ohe, make_le

    # Assertions
    assert a.all_dataframe(feat)
    assert isinstance(cols, (str, list))
    assert a.all_str(cols)
    assert a.all_str(type)

    # Do work & return
    if type in ["oe", "ord", "ordinal", "ordinalencoder", "ordinal encoder"]:
        return make_oe(feat=feat, cols=cols, transform=transform)
    elif type in ["ohe", "one", "onehotencoder", "one hot encoder"]:
        return make_ohe(feat=feat, cols=cols, transform=transform)
    elif type in ["le", "label", "label encoder", "labelencoder"]:
        return make_le(feat=feat, cols=cols, transform=transform)
    else:
        return feat, None
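A sketch of the dispatch in encode_features on a made-up frame; any unrecognised `type` falls through to (feat, None):

import pandas as pd

df = pd.DataFrame({"colour": ["red", "blue", "red"]})
df, ohe = encode_features(df, cols=["colour"], type="ohe")  # one-hot encode
# type="ordinal" would route to make_oe, and type="label" to make_le.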
Code Example #12
def read(path:str):
    
    # Imports
    from src.utils import assertions as a
    
    # Assertions
    assert a.all_str(path)
    assert a.all_valid_path(path)
    
    # Read
    with open(path, "rt") as f:
        data = f.read()
            
    # Return
    return data
Code Example #13
import pandas as pd


def sel_feat_cols(data: pd.DataFrame, feats: list):
    
    # Imports
    from src.utils import assertions as a
    
    # Assertions
    assert a.all_dataframe(data)
    assert isinstance(feats, (str, list))
    if isinstance(feats, str): feats = [feats]
    assert a.all_str(feats)
    assert a.all_in(feats, data.columns)
    
    # Do work
    data = data[feats]
    
    return data
Code Example #14
def dump_data(data: any,
              path: str = "./data/raw",
              name: str = "None",
              suffix: str = "joblib"):
    """
    Dump the data from memory to a file.

    Args:
        data (any): The data to be dumped. Can be any type.
        path (str, optional): The directory where the data should be dumped to. Defaults to "./data/raw".
        name (str, optional): The name of the file that the data should be written to. Defaults to "None".
        suffix (str, optional): The suffix of the file that will be dumped to. Defaults to "joblib".

    Raises:
        NotImplementedError: If the dumping fails for any reason.

    Returns:
        None: If successful, then nothing should be returned.
    """

    # Imports
    from src.utils import assertions as a
    import os
    from joblib import dump

    # Assertions
    assert a.all_str([path, name, suffix])
    assert os.path.exists(path)

    # Join output name
    output = os.path.join(path, name) + "." + suffix

    # Do the dumping
    try:
        dump(data, output)
    except Exception as err:
        raise NotImplementedError("Could not dump the data") from err

    return None
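A sketch of dump_data; any picklable object works, and `path` must already exist:

# Writes ./data/raw/demo.joblib (hypothetical file name).
dump_data(data={"answer": 42}, path="./data/raw", name="demo", suffix="joblib")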
Code Example #15
import pandas as pd


def make_le(feat: pd.DataFrame, cols: list, transform: bool = True):

    # Imports
    from src.utils import assertions as a
    from sklearn.preprocessing import LabelEncoder

    # Assertions
    assert a.all_dataframe(feat)
    assert a.all_str(cols)
    if isinstance(cols, str): cols = [cols]
    assert a.all_in(cols, feat.columns)
    assert a.all_bool(transform)

    # Instantiations
    le = LabelEncoder()

    # Do work
    # (LabelEncoder expects a single 1-D column, so columns are encoded in
    # turn; the returned encoder is the one fit on the last column)
    if transform:
        for col in cols:
            feat[col] = le.fit_transform(feat[col])
    else:
        le.fit(feat[cols[-1]])

    # Return
    return feat, le
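A short sketch of make_le on a made-up single-column target, the case LabelEncoder is built for:

import pandas as pd

df = pd.DataFrame({"beer_style": ["IPA", "Stout", "IPA"]})
df, le = make_le(df, cols=["beer_style"])
# df["beer_style"] is now [0, 1, 0]; le.inverse_transform recovers the names.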
Code Example #16
import numpy as np
import torch
from src.models.pytorch import get_device


def train_overall_network(
    feat_trn: np.ndarray,
    targ_trn: np.ndarray,
    feat_val: np.ndarray,
    targ_val: np.ndarray,
    hidden_shapes: list = [20, 20, 20],
    hidden_acti: str = "relu",
    final_shape: int = 1,
    final_acti: str = "sigmoid",
    batch_size: int = 100,
    epochs: int = 500,
    learning_rate: float = 0.001,
    device: torch.device = get_device(),
    scheduler: bool = True,
    verbosity: int = 10,
    plot_learning: bool = True,
):

    # Imports
    import numpy as np
    from src.utils import assertions as a
    from src.models.pytorch import PyTorchDataset
    from torch import nn, optim
    from src.models.pytorch import Net
    
    # Assertions
    assert a.all_real([feat_trn, targ_trn, feat_val, targ_val])
    assert isinstance(hidden_shapes, list)
    assert len(hidden_shapes)>0, "Must have at least 1 hidden layer"
    assert a.all_int(hidden_shapes)
    assert a.all_scalar([hidden_acti, final_shape, final_acti, batch_size, epochs, learning_rate])
    assert isinstance(verbosity, (int, type(None)))
    assert a.all_int([batch_size, epochs])
    assert a.all_str([hidden_acti, final_acti])
    assert a.all_float(learning_rate)

    # Initialise data generators
    data_trn = PyTorchDataset(feat_trn, targ_trn)
    data_val = PyTorchDataset(feat_val, targ_val)

    # Initialise classes
    modl = Net(feat_trn.shape[1], len(set(targ_trn)))
    crit = nn.CrossEntropyLoss()
    optm = optim.Adam(modl.parameters(), lr=learning_rate)
    if scheduler:
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optm, mode="min", patience=3)

    # Push network to device
    modl.to(device)
    
    # Set dumping ground
    costs = {"epoch": [], "loss_trn": [], "accu_trn": [], "loss_val": [], "accu_val": []}
    loss_trn = 0.0
    accu_trn = 0.0

    # Loop over epochs
    for epoch in range(epochs):

        loss_trn, accu_trn = model_train(
            data_trn=data_trn,
            modl=modl,
            crit=crit,
            optm=optm,
            batch_size=batch_size,
            hidden_shapes=hidden_shapes,
            hidden_acti=hidden_acti,
            final_shape=final_shape,
            final_acti=final_acti,
            device=device,
            scheduler=scheduler,
        )
        
        loss_val, accu_val = model_validate(
            data_val=data_val,
            modl=modl,
            crit=crit,
            batch_size=batch_size,
            hidden_shapes=hidden_shapes,
            hidden_acti=hidden_acti,
            final_shape=final_shape,
            final_acti=final_acti,
            device=device,
        )

        # Record progress
        costs["epoch"].append(epoch+1)
        costs["loss_trn"].append(loss_trn)
        costs["accu_trn"].append(accu_trn)
        costs["loss_val"].append(loss_val)
        costs["accu_val"].append(accu_val)

        # Adjust scheduler (ReduceLROnPlateau steps on the metric it monitors)
        if scheduler:
            scheduler.step(loss_val)

        # Print stats
        if verbosity and (epoch % verbosity == 0 or epoch + 1 == epochs):
            # Plot learning
            if plot_learning:
                plot_network_training(costs)
            # Print metrics
            print("Epoch: {}/{}\tLoss: {:.5f}".format(costs["epoch"][-1], epochs, costs["loss_trn"][-1]))

    # Return
    return modl
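A sketch of a training call on random data, assuming PyTorchDataset, Net, model_train, model_validate and the assertion helpers behave as the function expects:

import numpy as np

rng = np.random.default_rng(0)
feat_trn = rng.normal(size=(800, 5)).astype("float32")
targ_trn = rng.integers(0, 4, size=800)   # four hypothetical classes
feat_val = rng.normal(size=(200, 5)).astype("float32")
targ_val = rng.integers(0, 4, size=200)

modl = train_overall_network(
    feat_trn, targ_trn, feat_val, targ_val,
    epochs=50,
    verbosity=10,
    plot_learning=False,
)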