def create_matrix(d1: int, d2: int):
    """Pipeline step: build a random (d1 x d2) matrix and marshal it.

    The step body is assembled from source-code strings and executed inside
    a jupyter kernel via kale's ``run_code``; the rendered HTML is written to
    /create_matrix.html and registered as a KFP UI artifact.
    """
    # Inject the pipeline parameters into the kernel namespace.
    pipeline_parameters_block = '''
d1 = {}
d2 = {}
'''.format(d1, d2)

    block1 = '''
import numpy as np
'''

    block2 = '''
rnd_matrix = np.random.rand(d1, d2)
'''

    # Persist `rnd_matrix` so downstream steps can load it from /marshal.
    data_saving_block = '''
# -----------------------DATA SAVING START---------------------------------
from kale.marshal import utils as _kale_marshal_utils
_kale_marshal_utils.set_kale_data_directory("/marshal")
_kale_marshal_utils.save(rnd_matrix, "rnd_matrix")
# -----------------------DATA SAVING END-----------------------------------
'''

    # run the code blocks inside a jupyter kernel
    from kale.utils.jupyter_utils import run_code as _kale_run_code
    from kale.utils.jupyter_utils import update_uimetadata as _kale_update_uimetadata

    step_blocks = (
        pipeline_parameters_block,
        block1,
        block2,
        data_saving_block,
    )
    rendered_html = _kale_run_code(step_blocks)
    with open("/create_matrix.html", "w") as report:
        report.write(rendered_html)
    _kale_update_uimetadata('create_matrix')
def test():
    """Pipeline step: define `v1`, print it, and marshal it.

    NOTE(review): this module defines three functions named ``test`` — at
    import time the last definition shadows the earlier ones; confirm the
    code generator intends this.
    """
    block1 = '''
v1 = "Hello"
'''

    block2 = '''
print(v1)
'''

    # NOTE(review): marshal directory is "" here, while other steps use
    # "/marshal" — presumably intentional for this fixture; verify.
    data_saving_block = '''
# -----------------------DATA SAVING START---------------------------------
from kale.marshal import utils as _kale_marshal_utils
_kale_marshal_utils.set_kale_data_directory("")
_kale_marshal_utils.save(v1, "v1")
# -----------------------DATA SAVING END-----------------------------------
'''

    # run the code blocks inside a jupyter kernel
    from kale.utils.jupyter_utils import run_code as _kale_run_code
    from kale.utils.jupyter_utils import update_uimetadata as _kale_update_uimetadata

    step_blocks = (block1, block2, data_saving_block)
    rendered_html = _kale_run_code(step_blocks)
    with open("/test.html", "w") as report:
        report.write(rendered_html)
    _kale_update_uimetadata('test')
def sum_matrix():
    """Pipeline step: load the marshalled `rnd_matrix`, sum its elements,
    and marshal the scalar `result` for the metrics step.
    """
    data_loading_block = '''
# -----------------------DATA LOADING START--------------------------------
from kale.marshal import utils as _kale_marshal_utils
_kale_marshal_utils.set_kale_data_directory("/marshal")
_kale_marshal_utils.set_kale_directory_file_names()
rnd_matrix = _kale_marshal_utils.load("rnd_matrix")
# -----------------------DATA LOADING END----------------------------------
'''

    block1 = '''
import numpy as np
'''

    block2 = '''
result = rnd_matrix.sum()
'''

    data_saving_block = '''
# -----------------------DATA SAVING START---------------------------------
from kale.marshal import utils as _kale_marshal_utils
_kale_marshal_utils.set_kale_data_directory("/marshal")
_kale_marshal_utils.save(result, "result")
# -----------------------DATA SAVING END-----------------------------------
'''

    # run the code blocks inside a jupyter kernel
    from kale.utils.jupyter_utils import run_code as _kale_run_code
    from kale.utils.jupyter_utils import update_uimetadata as _kale_update_uimetadata

    step_blocks = (
        data_loading_block,
        block1,
        block2,
        data_saving_block,
    )
    rendered_html = _kale_run_code(step_blocks)
    with open("/sum_matrix.html", "w") as report:
        report.write(rendered_html)
    _kale_update_uimetadata('sum_matrix')
def pipeline_metrics(d1: int, d2: int):
    """Pipeline step: export d1, d2 and the marshalled `result` as KFP metrics.

    Writes /mlpipeline-metrics.json in the Kubeflow Pipelines metrics format
    and renders the executed step to /pipeline_metrics.html.
    """
    pipeline_parameters_block = '''
d1 = {}
d2 = {}
'''.format(d1, d2)

    data_loading_block = '''
# -----------------------DATA LOADING START--------------------------------
from kale.marshal import utils as _kale_marshal_utils
_kale_marshal_utils.set_kale_data_directory("/marshal")
_kale_marshal_utils.set_kale_directory_file_names()
result = _kale_marshal_utils.load("result")
# -----------------------DATA LOADING END----------------------------------
'''

    # BUGFIX: the skip-message used to format `type(k)` — the type of the dict
    # *key*, which is always `str` — instead of the offending value's type; it
    # now reports `type(metrics[k])`. Also catch TypeError: `float(None)` and
    # other non-string, non-numeric values raise TypeError rather than
    # ValueError, and previously crashed the step instead of being skipped.
    block1 = '''
import json

metrics_metadata = list()
metrics = {
    "d1": d1,
    "d2": d2,
    "result": result,
}

for k in metrics:
    if isinstance(metrics[k], (int, float)):
        metric = metrics[k]
    else:
        try:
            metric = float(metrics[k])
        except (ValueError, TypeError):
            print("Variable {} with type {} not supported as pipeline"
                  " metric. Can only write `int` or `float` types as"
                  " pipeline metrics".format(k, type(metrics[k])))
            continue
    metrics_metadata.append({
        'name': k,
        'numberValue': metric,
        'format': "RAW",
    })
with open('/mlpipeline-metrics.json', 'w') as f:
    json.dump({'metrics': metrics_metadata}, f)
'''

    # run the code blocks inside a jupyter kernel
    from kale.utils.jupyter_utils import run_code as _kale_run_code
    from kale.utils.jupyter_utils import update_uimetadata as _kale_update_uimetadata

    blocks = (
        pipeline_parameters_block,
        data_loading_block,
        block1,
    )
    html_artifact = _kale_run_code(blocks)
    with open("/pipeline_metrics.html", "w") as f:
        f.write(html_artifact)
    _kale_update_uimetadata('pipeline_metrics')
def results():
    """Pipeline step: load all marshalled model accuracies and print a
    score-sorted comparison table.
    """
    data_loading_block = '''
# -----------------------DATA LOADING START--------------------------------
from kale.marshal import utils as _kale_marshal_utils
_kale_marshal_utils.set_kale_data_directory("/marshal")
_kale_marshal_utils.set_kale_directory_file_names()
acc_decision_tree = _kale_marshal_utils.load("acc_decision_tree")
acc_gaussian = _kale_marshal_utils.load("acc_gaussian")
acc_linear_svc = _kale_marshal_utils.load("acc_linear_svc")
acc_log = _kale_marshal_utils.load("acc_log")
acc_random_forest = _kale_marshal_utils.load("acc_random_forest")
# -----------------------DATA LOADING END----------------------------------
'''

    # Shared notebook import cell (captured as-is by the generator).
    block1 = '''
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import style
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
'''

    block2 = '''
results = pd.DataFrame({
    'Model': ['Support Vector Machines', 'logistic Regression',
              'Random Forest', 'Naive Bayes', 'Decision Tree'],
    'Score': [acc_linear_svc, acc_log,
              acc_random_forest, acc_gaussian, acc_decision_tree]})
result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
print(result_df)
'''

    # run the code blocks inside a jupyter kernel
    from kale.utils.jupyter_utils import run_code as _kale_run_code
    from kale.utils.jupyter_utils import update_uimetadata as _kale_update_uimetadata

    step_blocks = (
        data_loading_block,
        block1,
        block2,
    )
    rendered_html = _kale_run_code(step_blocks)
    with open("/results.html", "w") as report:
        report.write(rendered_html)
    _kale_update_uimetadata('results')
def test():
    """Pipeline step: run a single hello-world cell in the kernel.

    NOTE(review): this module defines three functions named ``test`` — at
    import time the last definition shadows the earlier ones; confirm the
    code generator intends this.
    """
    block1 = '''
print("hello")
'''

    # run the code blocks inside a jupyter kernel
    from kale.utils.jupyter_utils import run_code as _kale_run_code
    from kale.utils.jupyter_utils import update_uimetadata as _kale_update_uimetadata

    step_blocks = (block1, )
    rendered_html = _kale_run_code(step_blocks)
    with open("/test.html", "w") as report:
        report.write(rendered_html)
    _kale_update_uimetadata('test')
def randomforest():
    """Pipeline step: fit a RandomForest on the marshalled training data and
    marshal its (training-set) accuracy as `acc_random_forest`.
    """
    data_loading_block = '''
# -----------------------DATA LOADING START--------------------------------
from kale.marshal import utils as _kale_marshal_utils
_kale_marshal_utils.set_kale_data_directory("/marshal")
_kale_marshal_utils.set_kale_directory_file_names()
train_df = _kale_marshal_utils.load("train_df")
train_labels = _kale_marshal_utils.load("train_labels")
# -----------------------DATA LOADING END----------------------------------
'''

    # Shared notebook import cell (captured as-is by the generator).
    block1 = '''
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import style
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
'''

    block2 = '''
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(train_df, train_labels)
acc_random_forest = round(random_forest.score(train_df, train_labels) * 100, 2)
'''

    data_saving_block = '''
# -----------------------DATA SAVING START---------------------------------
from kale.marshal import utils as _kale_marshal_utils
_kale_marshal_utils.set_kale_data_directory("/marshal")
_kale_marshal_utils.save(acc_random_forest, "acc_random_forest")
# -----------------------DATA SAVING END-----------------------------------
'''

    # run the code blocks inside a jupyter kernel
    from kale.utils.jupyter_utils import run_code as _kale_run_code
    from kale.utils.jupyter_utils import update_uimetadata as _kale_update_uimetadata

    step_blocks = (
        data_loading_block,
        block1,
        block2,
        data_saving_block,
    )
    rendered_html = _kale_run_code(step_blocks)
    with open("/randomforest.html", "w") as report:
        report.write(rendered_html)
    _kale_update_uimetadata('randomforest')
def loaddata():
    """Pipeline step: read the Titanic train/test CSVs and marshal the frames
    plus the prediction-label name for downstream steps.
    """
    # Shared notebook import cell (captured as-is by the generator).
    block1 = '''
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import style
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
'''

    block2 = '''
path = "data/"
PREDICTION_LABEL = 'Survived'

test_df = pd.read_csv(path + "test.csv")
train_df = pd.read_csv(path + "train.csv")
'''

    data_saving_block = '''
# -----------------------DATA SAVING START---------------------------------
from kale.marshal import utils as _kale_marshal_utils
_kale_marshal_utils.set_kale_data_directory("/marshal")
_kale_marshal_utils.save(PREDICTION_LABEL, "PREDICTION_LABEL")
_kale_marshal_utils.save(test_df, "test_df")
_kale_marshal_utils.save(train_df, "train_df")
# -----------------------DATA SAVING END-----------------------------------
'''

    # run the code blocks inside a jupyter kernel
    from kale.utils.jupyter_utils import run_code as _kale_run_code
    from kale.utils.jupyter_utils import update_uimetadata as _kale_update_uimetadata

    step_blocks = (block1, block2, data_saving_block)
    rendered_html = _kale_run_code(step_blocks)
    with open("/loaddata.html", "w") as report:
        report.write(rendered_html)
    _kale_update_uimetadata('loaddata')
def test():
    """Pipeline step: load the marshalled `v1` produced by the saving step.

    NOTE(review): this module defines three functions named ``test`` — at
    import time the last definition shadows the earlier ones; confirm the
    code generator intends this. The marshal directory is "" here, matching
    the companion saving step but not the "/marshal" used elsewhere — verify.
    """
    data_loading_block = '''
# -----------------------DATA LOADING START--------------------------------
from kale.marshal import utils as _kale_marshal_utils
_kale_marshal_utils.set_kale_data_directory("")
_kale_marshal_utils.set_kale_directory_file_names()
v1 = _kale_marshal_utils.load("v1")
# -----------------------DATA LOADING END----------------------------------
'''

    # run the code blocks inside a jupyter kernel
    from kale.utils.jupyter_utils import run_code as _kale_run_code
    from kale.utils.jupyter_utils import update_uimetadata as _kale_update_uimetadata

    step_blocks = (data_loading_block, )
    rendered_html = _kale_run_code(step_blocks)
    with open("/test.html", "w") as report:
        report.write(rendered_html)
    _kale_update_uimetadata('test')
def datapreprocessing():
    """Pipeline step: clean the Titanic train/test frames — family-size
    features, numeric Deck from Cabin, Age imputation, Embarked fill — and
    re-marshal both frames.
    """
    data_loading_block = '''
# -----------------------DATA LOADING START--------------------------------
from kale.marshal import utils as _kale_marshal_utils
_kale_marshal_utils.set_kale_data_directory("/marshal")
_kale_marshal_utils.set_kale_directory_file_names()
test_df = _kale_marshal_utils.load("test_df")
train_df = _kale_marshal_utils.load("train_df")
# -----------------------DATA LOADING END----------------------------------
'''

    # Shared notebook import cell (captured as-is by the generator).
    block1 = '''
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import style
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
'''

    # Derive family-size features.
    block2 = '''
data = [train_df, test_df]
for dataset in data:
    dataset['relatives'] = dataset['SibSp'] + dataset['Parch']
    dataset.loc[dataset['relatives'] > 0, 'not_alone'] = 0
    dataset.loc[dataset['relatives'] == 0, 'not_alone'] = 1
    dataset['not_alone'] = dataset['not_alone'].astype(int)
train_df['not_alone'].value_counts()
'''

    block3 = '''
# This does not contribute to a person survival probability
train_df = train_df.drop(['PassengerId'], axis=1)
'''

    # Map Cabin letters to a numeric Deck feature, then drop Cabin.
    block4 = '''
import re
deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}
data = [train_df, test_df]

for dataset in data:
    dataset['Cabin'] = dataset['Cabin'].fillna("U0")
    dataset['Deck'] = dataset['Cabin'].map(lambda x: re.compile("([a-zA-Z]+)").search(x).group())
    dataset['Deck'] = dataset['Deck'].map(deck)
    dataset['Deck'] = dataset['Deck'].fillna(0)
    dataset['Deck'] = dataset['Deck'].astype(int)
# we can now drop the cabin feature
train_df = train_df.drop(['Cabin'], axis=1)
test_df = test_df.drop(['Cabin'], axis=1)
'''

    # Impute missing ages with random values around the mean.
    # NOTE(review): this captured notebook cell assigns
    # `train_df["Age"].astype(int)` inside the loop over BOTH frames, and mixes
    # train mean with test std — looks like a notebook bug carried over by the
    # generator; preserved as-is to keep pipeline behavior unchanged.
    block5 = '''
data = [train_df, test_df]
for dataset in data:
    mean = train_df["Age"].mean()
    std = test_df["Age"].std()
    is_null = dataset["Age"].isnull().sum()
    # compute random numbers between the mean, std and is_null
    rand_age = np.random.randint(mean - std, mean + std, size = is_null)
    # fill NaN values in Age column with random values generated
    age_slice = dataset["Age"].copy()
    age_slice[np.isnan(age_slice)] = rand_age
    dataset["Age"] = age_slice
    dataset["Age"] = train_df["Age"].astype(int)
train_df["Age"].isnull().sum()
'''

    block6 = '''
train_df['Embarked'].describe()
'''

    block7 = '''
# fill with most common value
common_value = 'S'
data = [train_df, test_df]
for dataset in data:
    dataset['Embarked'] = dataset['Embarked'].fillna(common_value)
'''

    block8 = '''
train_df.info()
'''

    data_saving_block = '''
# -----------------------DATA SAVING START---------------------------------
from kale.marshal import utils as _kale_marshal_utils
_kale_marshal_utils.set_kale_data_directory("/marshal")
_kale_marshal_utils.save(test_df, "test_df")
_kale_marshal_utils.save(train_df, "train_df")
# -----------------------DATA SAVING END-----------------------------------
'''

    # run the code blocks inside a jupyter kernel
    from kale.utils.jupyter_utils import run_code as _kale_run_code
    from kale.utils.jupyter_utils import update_uimetadata as _kale_update_uimetadata

    step_blocks = (
        data_loading_block,
        block1,
        block2,
        block3,
        block4,
        block5,
        block6,
        block7,
        block8,
        data_saving_block,
    )
    rendered_html = _kale_run_code(step_blocks)
    with open("/datapreprocessing.html", "w") as report:
        report.write(rendered_html)
    _kale_update_uimetadata('datapreprocessing')
def featureengineering():
    """Pipeline step: encode Titanic features (Fare, Title, Sex, Embarked,
    Age/Fare bins, interaction terms), split off the training labels, and
    marshal `train_df` and `train_labels`.
    """
    data_loading_block = '''
# -----------------------DATA LOADING START--------------------------------
from kale.marshal import utils as _kale_marshal_utils
_kale_marshal_utils.set_kale_data_directory("/marshal")
_kale_marshal_utils.set_kale_directory_file_names()
PREDICTION_LABEL = _kale_marshal_utils.load("PREDICTION_LABEL")
test_df = _kale_marshal_utils.load("test_df")
train_df = _kale_marshal_utils.load("train_df")
# -----------------------DATA LOADING END----------------------------------
'''

    # Shared notebook import cell (captured as-is by the generator).
    block1 = '''
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import style
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
'''

    block2 = '''
data = [train_df, test_df]
for dataset in data:
    dataset['Fare'] = dataset['Fare'].fillna(0)
    dataset['Fare'] = dataset['Fare'].astype(int)
'''

    # Extract and numerically encode a Title feature from Name, then drop Name.
    block3 = '''
data = [train_df, test_df]
titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

for dataset in data:
    # extract titles
    dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\\.', expand=False)
    # replace titles with a more common title or as Rare
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr',\\
                                            'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    # convert titles into numbers
    dataset['Title'] = dataset['Title'].map(titles)
    # filling NaN with 0, to get safe
    dataset['Title'] = dataset['Title'].fillna(0)
train_df = train_df.drop(['Name'], axis=1)
test_df = test_df.drop(['Name'], axis=1)
'''

    block4 = '''
genders = {"male": 0, "female": 1}
data = [train_df, test_df]

for dataset in data:
    dataset['Sex'] = dataset['Sex'].map(genders)
'''

    block5 = '''
train_df = train_df.drop(['Ticket'], axis=1)
test_df = test_df.drop(['Ticket'], axis=1)
'''

    block6 = '''
ports = {"S": 0, "C": 1, "Q": 2}
data = [train_df, test_df]

for dataset in data:
    dataset['Embarked'] = dataset['Embarked'].map(ports)
'''

    # Bucket ages into ordinal bins.
    block7 = '''
data = [train_df, test_df]
for dataset in data:
    dataset['Age'] = dataset['Age'].astype(int)
    dataset.loc[ dataset['Age'] <= 11, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 11) & (dataset['Age'] <= 18), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 18) & (dataset['Age'] <= 22), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 22) & (dataset['Age'] <= 27), 'Age'] = 3
    dataset.loc[(dataset['Age'] > 27) & (dataset['Age'] <= 33), 'Age'] = 4
    dataset.loc[(dataset['Age'] > 33) & (dataset['Age'] <= 40), 'Age'] = 5
    dataset.loc[(dataset['Age'] > 40) & (dataset['Age'] <= 66), 'Age'] = 6
    dataset.loc[ dataset['Age'] > 66, 'Age'] = 6
# let's see how it's distributed
train_df['Age'].value_counts()
'''

    # Bucket fares into ordinal bins.
    block8 = '''
data = [train_df, test_df]
for dataset in data:
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare'] = 2
    dataset.loc[(dataset['Fare'] > 31) & (dataset['Fare'] <= 99), 'Fare'] = 3
    dataset.loc[(dataset['Fare'] > 99) & (dataset['Fare'] <= 250), 'Fare'] = 4
    dataset.loc[ dataset['Fare'] > 250, 'Fare'] = 5
    dataset['Fare'] = dataset['Fare'].astype(int)
'''

    block9 = '''
data = [train_df, test_df]
for dataset in data:
    dataset['Age_Class']= dataset['Age']* dataset['Pclass']
'''

    block10 = '''
for dataset in data:
    dataset['Fare_Per_Person'] = dataset['Fare']/(dataset['relatives']+1)
    dataset['Fare_Per_Person'] = dataset['Fare_Per_Person'].astype(int)
# Let's take a last look at the training set, before we start training the models.
train_df.head(10)
'''

    block11 = '''
train_labels = train_df[PREDICTION_LABEL]
train_df = train_df.drop(PREDICTION_LABEL, axis=1)
'''

    data_saving_block = '''
# -----------------------DATA SAVING START---------------------------------
from kale.marshal import utils as _kale_marshal_utils
_kale_marshal_utils.set_kale_data_directory("/marshal")
_kale_marshal_utils.save(train_df, "train_df")
_kale_marshal_utils.save(train_labels, "train_labels")
# -----------------------DATA SAVING END-----------------------------------
'''

    # run the code blocks inside a jupyter kernel
    from kale.utils.jupyter_utils import run_code as _kale_run_code
    from kale.utils.jupyter_utils import update_uimetadata as _kale_update_uimetadata

    step_blocks = (
        data_loading_block,
        block1,
        block2,
        block3,
        block4,
        block5,
        block6,
        block7,
        block8,
        block9,
        block10,
        block11,
        data_saving_block,
    )
    rendered_html = _kale_run_code(step_blocks)
    with open("/featureengineering.html", "w") as report:
        report.write(rendered_html)
    _kale_update_uimetadata('featureengineering')