Example #1
def randomforest(vol_shared_volume: str):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/shared_volume/notebooks/titanic/.titanic_dataset_ml.ipynb.kale.marshal.dir"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0] for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]

    if "train_labels" not in _kale_directory_file_names:
        raise ValueError("train_labels" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "train_labels"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "train_labels" +
                         ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    train_labels = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))

    if "train_df" not in _kale_directory_file_names:
        raise ValueError("train_df" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "train_df"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "train_df" +
                         ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    train_df = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))
    # -----------------------DATA LOADING END----------------------------------

    import numpy as np
    import pandas as pd
    import seaborn as sns
    from matplotlib import pyplot as plt
    from matplotlib import style

    from sklearn import linear_model
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import Perceptron
    from sklearn.linear_model import SGDClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.naive_bayes import GaussianNB

    random_forest = RandomForestClassifier(n_estimators=100)
    random_forest.fit(train_df, train_labels)
    acc_random_forest = round(
        random_forest.score(train_df, train_labels) * 100, 2)

    # -----------------------DATA SAVING START---------------------------------
    if "acc_random_forest" in locals():
        _kale_resource_save(
            acc_random_forest,
            os.path.join(_kale_data_directory, "acc_random_forest"))
    else:
        print("_kale_resource_save: `acc_random_forest` not found.")
def data_transformation(EPOCHS: int, STEPS: int, BATCH_SIZE: int,
                        HIDDEN_LAYER_SIZE: str, LEARNING_RATE: float):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/marshal"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0] for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]

    if "column_names" not in _kale_directory_file_names:
        raise ValueError("column_names" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "column_names"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "column_names" +
                         ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    column_names = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))

    if "schema" not in _kale_directory_file_names:
        raise ValueError("schema" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "schema"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "schema" + ": " +
                         str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    schema = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))
    # -----------------------DATA LOADING END----------------------------------

    import os
    import shutil
    import logging
    import apache_beam as beam
    import tensorflow as tf
    import tensorflow_transform as tft
    import tensorflow_model_analysis as tfma
    import tensorflow_data_validation as tfdv

    from apache_beam.io import textio
    from apache_beam.io import tfrecordio

    from tensorflow_transform.beam import impl as beam_impl
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io
    from tensorflow_transform.coders.csv_coder import CsvCoder
    from tensorflow_transform.coders.example_proto_coder import ExampleProtoCoder
    from tensorflow_transform.tf_metadata import dataset_metadata
    from tensorflow_transform.tf_metadata import metadata_io
    DATA_DIR = 'data/'
    TRAIN_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/train.csv')
    EVALUATION_DATA = os.path.join(DATA_DIR,
                                   'taxi-cab-classification/eval.csv')

    # Categorical features are assumed to each have a maximum value in the dataset.
    MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]
    CATEGORICAL_FEATURE_KEYS = [
        'trip_start_hour', 'trip_start_day', 'trip_start_month'
    ]

    DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']

    # Number of buckets used by tf.transform for encoding each feature.
    FEATURE_BUCKET_COUNT = 10

    BUCKET_FEATURE_KEYS = [
        'pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
        'dropoff_longitude'
    ]

    # Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform
    VOCAB_SIZE = 1000

    # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed.
    OOV_SIZE = 10

    VOCAB_FEATURE_KEYS = [
        'pickup_census_tract', 'dropoff_census_tract', 'payment_type',
        'company', 'pickup_community_area', 'dropoff_community_area'
    ]

    # allow nan values in these features.
    OPTIONAL_FEATURES = [
        'dropoff_latitude', 'dropoff_longitude', 'pickup_census_tract',
        'dropoff_census_tract', 'company', 'trip_seconds',
        'dropoff_community_area'
    ]

    LABEL_KEY = 'tips'
    FARE_KEY = 'fare'
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

    # tf.get_logger().setLevel(logging.ERROR)

    def to_dense(tensor):
        """Takes as input a SparseTensor and return a Tensor with correct default value
        Args:
          tensor: tf.SparseTensor
        Returns:
          tf.Tensor with default value
        """
        if not isinstance(tensor, tf.sparse.SparseTensor):
            return tensor
        if tensor.dtype == tf.string:
            default_value = ''
        elif tensor.dtype == tf.float32:
            default_value = 0.0
        elif tensor.dtype == tf.int32:
            default_value = 0
        else:
            raise ValueError(f"Tensor type not recognized: {tensor.dtype}")

        return tf.squeeze(tf.sparse_to_dense(tensor.indices,
                                             [tensor.dense_shape[0], 1],
                                             tensor.values,
                                             default_value=default_value),
                          axis=1)
        # TODO: Update to below version
        # return tf.squeeze(tf.sparse.to_dense(tensor, default_value=default_value), axis=1)

    def preprocess_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.
        Args:
          inputs: map from feature keys to raw not-yet-transformed features.
        Returns:
          Map from string feature key to transformed feature operations.
        """
        outputs = {}
        for key in DENSE_FLOAT_FEATURE_KEYS:
            # Preserve this feature as a dense float, setting nan's to the mean.
            outputs[key] = tft.scale_to_z_score(to_dense(inputs[key]))

        for key in VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            if inputs[key].dtype == tf.string:
                vocab_tensor = to_dense(inputs[key])
            else:
                vocab_tensor = tf.as_string(to_dense(inputs[key]))
            outputs[key] = tft.compute_and_apply_vocabulary(
                vocab_tensor,
                vocab_filename='vocab_' + key,
                top_k=VOCAB_SIZE,
                num_oov_buckets=OOV_SIZE)

        for key in BUCKET_FEATURE_KEYS:
            outputs[key] = tft.bucketize(to_dense(inputs[key]),
                                         FEATURE_BUCKET_COUNT)

        for key in CATEGORICAL_FEATURE_KEYS:
            outputs[key] = tf.cast(to_dense(inputs[key]), tf.int64)

        taxi_fare = to_dense(inputs[FARE_KEY])
        taxi_tip = to_dense(inputs[LABEL_KEY])
        # Test if the tip was > 20% of the fare.
        tip_threshold = tf.multiply(taxi_fare, tf.constant(0.2))
        outputs[LABEL_KEY] = tf.logical_and(
            tf.logical_not(tf.math.is_nan(taxi_fare)),
            tf.greater(taxi_tip, tip_threshold))

        for key in outputs:
            if outputs[key].dtype == tf.bool:
                outputs[key] = tft.compute_and_apply_vocabulary(
                    tf.as_string(outputs[key]), vocab_filename='vocab_' + key)

        return outputs

    trns_output = os.path.join(DATA_DIR, "transformed")
    if os.path.exists(trns_output):
        shutil.rmtree(trns_output)

    tft_input_metadata = dataset_metadata.DatasetMetadata(schema)

    runner = 'DirectRunner'
    with beam.Pipeline(runner, options=None) as p:
        with beam_impl.Context(temp_dir=os.path.join(trns_output, 'tmp')):
            converter = CsvCoder(column_names, tft_input_metadata.schema)

            # READ TRAIN DATA
            train_data = (p
                          | 'ReadTrainData' >> textio.ReadFromText(
                              TRAIN_DATA, skip_header_lines=1)
                          | 'DecodeTrainData' >> beam.Map(converter.decode))

            # TRANSFORM TRAIN DATA (and get transform_fn function)
            transformed_dataset, transform_fn = (
                (train_data, tft_input_metadata)
                | beam_impl.AnalyzeAndTransformDataset(preprocess_fn))
            transformed_data, transformed_metadata = transformed_dataset

            # SAVE TRANSFORMED TRAIN DATA
            _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                os.path.join(trns_output, 'train'),
                coder=ExampleProtoCoder(transformed_metadata.schema))

            # READ EVAL DATA
            eval_data = (p
                         | 'ReadEvalData' >> textio.ReadFromText(
                             EVALUATION_DATA, skip_header_lines=1)
                         | 'DecodeEvalData' >> beam.Map(converter.decode))

            # TRANSFORM EVAL DATA (using previously created transform_fn function)
            eval_dataset = (eval_data, tft_input_metadata)
            transformed_eval_data, transformed_metadata = (
                (eval_dataset, transform_fn) | beam_impl.TransformDataset())

            # SAVE EVAL DATA
            _ = transformed_eval_data | 'WriteEvalData' >> tfrecordio.WriteToTFRecord(
                os.path.join(trns_output, 'eval'),
                coder=ExampleProtoCoder(transformed_metadata.schema))

            # SAVE transform_fn FUNCTION FOR LATER USE
            # TODO: check out what is the transform function (transform_fn) that came from previous step
            _ = (transform_fn | 'WriteTransformFn' >>
                 transform_fn_io.WriteTransformFn(trns_output))

            # SAVE TRANSFORMED METADATA
            metadata_io.write_metadata(metadata=tft_input_metadata,
                                       path=os.path.join(
                                           trns_output, 'metadata'))

    # -----------------------DATA SAVING START---------------------------------
    if "trns_output" in locals():
        _kale_resource_save(trns_output,
                            os.path.join(_kale_data_directory, "trns_output"))
    else:
        print("_kale_resource_save: `trns_output` not found.")
Example #3
def datapreprocessing(vol_shared_volume: str):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/shared_volume/notebooks/titanic/.titanic_dataset_ml.ipynb.kale.marshal.dir"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0] for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]

    if "train_df" not in _kale_directory_file_names:
        raise ValueError("train_df" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "train_df"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "train_df" +
                         ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    train_df = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))

    if "test_df" not in _kale_directory_file_names:
        raise ValueError("test_df" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "test_df"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "test_df" + ": " +
                         str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    test_df = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))
    # -----------------------DATA LOADING END----------------------------------

    import numpy as np
    import pandas as pd
    import seaborn as sns
    from matplotlib import pyplot as plt
    from matplotlib import style

    from sklearn import linear_model
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import Perceptron
    from sklearn.linear_model import SGDClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.naive_bayes import GaussianNB

    data = [train_df, test_df]
    for dataset in data:
        dataset['relatives'] = dataset['SibSp'] + dataset['Parch']
        dataset.loc[dataset['relatives'] > 0, 'not_alone'] = 0
        dataset.loc[dataset['relatives'] == 0, 'not_alone'] = 1
        dataset['not_alone'] = dataset['not_alone'].astype(int)
    train_df['not_alone'].value_counts()
    # This does not contribute to a person's survival probability
    train_df = train_df.drop(['PassengerId'], axis=1)
    import re
    deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}
    data = [train_df, test_df]

    for dataset in data:
        dataset['Cabin'] = dataset['Cabin'].fillna("U0")
        dataset['Deck'] = dataset['Cabin'].map(
            lambda x: re.compile("([a-zA-Z]+)").search(x).group())
        dataset['Deck'] = dataset['Deck'].map(deck)
        dataset['Deck'] = dataset['Deck'].fillna(0)
        dataset['Deck'] = dataset['Deck'].astype(int)
    # we can now drop the cabin feature
    train_df = train_df.drop(['Cabin'], axis=1)
    test_df = test_df.drop(['Cabin'], axis=1)
    data = [train_df, test_df]

    for dataset in data:
        mean = train_df["Age"].mean()
        std = test_df["Age"].std()
        is_null = dataset["Age"].isnull().sum()
        # draw `is_null` random ages between (mean - std) and (mean + std)
        rand_age = np.random.randint(mean - std, mean + std, size=is_null)
        # fill NaN values in Age column with random values generated
        age_slice = dataset["Age"].copy()
        age_slice[np.isnan(age_slice)] = rand_age
        dataset["Age"] = age_slice
        dataset["Age"] = train_df["Age"].astype(int)
    train_df["Age"].isnull().sum()
    train_df['Embarked'].describe()
    # fill with most common value
    common_value = 'S'
    data = [train_df, test_df]

    for dataset in data:
        dataset['Embarked'] = dataset['Embarked'].fillna(common_value)
    train_df.info()

    # -----------------------DATA SAVING START---------------------------------
    if "train_df" in locals():
        _kale_resource_save(train_df,
                            os.path.join(_kale_data_directory, "train_df"))
    else:
        print("_kale_resource_save: `train_df` not found.")
    if "test_df" in locals():
        _kale_resource_save(test_df,
                            os.path.join(_kale_data_directory, "test_df"))
    else:
        print("_kale_resource_save: `test_df` not found.")
Example #4
def featureengineering(vol_shared_volume: str):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/shared_volume/notebooks/titanic/.titanic_dataset_ml.ipynb.kale.marshal.dir"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0] for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]

    if "PREDICTION_LABEL" not in _kale_directory_file_names:
        raise ValueError("PREDICTION_LABEL" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "PREDICTION_LABEL"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "PREDICTION_LABEL" + ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    PREDICTION_LABEL = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))

    if "train_df" not in _kale_directory_file_names:
        raise ValueError("train_df" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "train_df"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "train_df" +
                         ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    train_df = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))

    if "test_df" not in _kale_directory_file_names:
        raise ValueError("test_df" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "test_df"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "test_df" + ": " +
                         str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    test_df = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))
    # -----------------------DATA LOADING END----------------------------------

    import numpy as np
    import pandas as pd
    import seaborn as sns
    from matplotlib import pyplot as plt
    from matplotlib import style

    from sklearn import linear_model
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import Perceptron
    from sklearn.linear_model import SGDClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.naive_bayes import GaussianNB

    data = [train_df, test_df]

    for dataset in data:
        dataset['Fare'] = dataset['Fare'].fillna(0)
        dataset['Fare'] = dataset['Fare'].astype(int)
    data = [train_df, test_df]
    titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

    for dataset in data:
        # extract titles
        dataset['Title'] = dataset.Name.str.extract(r' ([A-Za-z]+)\.',
                                                    expand=False)
        # replace rare titles with a more common title, or group them as Rare
        dataset['Title'] = dataset['Title'].replace([
            'Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev',
            'Sir', 'Jonkheer', 'Dona'
        ], 'Rare')
        dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
        dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
        dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
        # convert titles into numbers
        dataset['Title'] = dataset['Title'].map(titles)
        # fill NaN with 0, to be safe
        dataset['Title'] = dataset['Title'].fillna(0)
    train_df = train_df.drop(['Name'], axis=1)
    test_df = test_df.drop(['Name'], axis=1)
    genders = {"male": 0, "female": 1}
    data = [train_df, test_df]

    for dataset in data:
        dataset['Sex'] = dataset['Sex'].map(genders)
    train_df = train_df.drop(['Ticket'], axis=1)
    test_df = test_df.drop(['Ticket'], axis=1)
    ports = {"S": 0, "C": 1, "Q": 2}
    data = [train_df, test_df]

    for dataset in data:
        dataset['Embarked'] = dataset['Embarked'].map(ports)
    data = [train_df, test_df]
    for dataset in data:
        dataset['Age'] = dataset['Age'].astype(int)
        dataset.loc[dataset['Age'] <= 11, 'Age'] = 0
        dataset.loc[(dataset['Age'] > 11) & (dataset['Age'] <= 18), 'Age'] = 1
        dataset.loc[(dataset['Age'] > 18) & (dataset['Age'] <= 22), 'Age'] = 2
        dataset.loc[(dataset['Age'] > 22) & (dataset['Age'] <= 27), 'Age'] = 3
        dataset.loc[(dataset['Age'] > 27) & (dataset['Age'] <= 33), 'Age'] = 4
        dataset.loc[(dataset['Age'] > 33) & (dataset['Age'] <= 40), 'Age'] = 5
        dataset.loc[(dataset['Age'] > 40) & (dataset['Age'] <= 66), 'Age'] = 6
        dataset.loc[dataset['Age'] > 66, 'Age'] = 6

    # let's see how it's distributed: train_df['Age'].value_counts()
    data = [train_df, test_df]

    for dataset in data:
        dataset.loc[dataset['Fare'] <= 7.91, 'Fare'] = 0
        dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454),
                    'Fare'] = 1
        dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31),
                    'Fare'] = 2
        dataset.loc[(dataset['Fare'] > 31) & (dataset['Fare'] <= 99),
                    'Fare'] = 3
        dataset.loc[(dataset['Fare'] > 99) & (dataset['Fare'] <= 250),
                    'Fare'] = 4
        dataset.loc[dataset['Fare'] > 250, 'Fare'] = 5
        dataset['Fare'] = dataset['Fare'].astype(int)
    data = [train_df, test_df]
    for dataset in data:
        dataset['Age_Class'] = dataset['Age'] * dataset['Pclass']
    for dataset in data:
        dataset['Fare_Per_Person'] = dataset['Fare'] / (dataset['relatives'] +
                                                        1)
        dataset['Fare_Per_Person'] = dataset['Fare_Per_Person'].astype(int)
    # Let's take a last look at the training set, before we start training the models.
    train_df.head(10)
    train_labels = train_df[PREDICTION_LABEL]
    train_df = train_df.drop(PREDICTION_LABEL, axis=1)

    # -----------------------DATA SAVING START---------------------------------
    if "train_labels" in locals():
        _kale_resource_save(train_labels,
                            os.path.join(_kale_data_directory, "train_labels"))
    else:
        print("_kale_resource_save: `train_labels` not found.")
    if "train_df" in locals():
        _kale_resource_save(train_df,
                            os.path.join(_kale_data_directory, "train_df"))
    else:
        print("_kale_resource_save: `train_df` not found.")
def data_validation(EPOCHS: int, STEPS: int, BATCH_SIZE: int,
                    HIDDEN_LAYER_SIZE: str, LEARNING_RATE: float):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/marshal"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    import os
    import shutil
    import logging
    import apache_beam as beam
    import tensorflow as tf
    import tensorflow_transform as tft
    import tensorflow_model_analysis as tfma
    import tensorflow_data_validation as tfdv

    from apache_beam.io import textio
    from apache_beam.io import tfrecordio

    from tensorflow_transform.beam import impl as beam_impl
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io
    from tensorflow_transform.coders.csv_coder import CsvCoder
    from tensorflow_transform.coders.example_proto_coder import ExampleProtoCoder
    from tensorflow_transform.tf_metadata import dataset_metadata
    from tensorflow_transform.tf_metadata import metadata_io
    DATA_DIR = 'data/'
    TRAIN_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/train.csv')
    EVALUATION_DATA = os.path.join(DATA_DIR,
                                   'taxi-cab-classification/eval.csv')

    # Categorical features are assumed to each have a maximum value in the dataset.
    MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]
    CATEGORICAL_FEATURE_KEYS = [
        'trip_start_hour', 'trip_start_day', 'trip_start_month'
    ]

    DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']

    # Number of buckets used by tf.transform for encoding each feature.
    FEATURE_BUCKET_COUNT = 10

    BUCKET_FEATURE_KEYS = [
        'pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
        'dropoff_longitude'
    ]

    # Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform
    VOCAB_SIZE = 1000

    # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed.
    OOV_SIZE = 10

    VOCAB_FEATURE_KEYS = [
        'pickup_census_tract', 'dropoff_census_tract', 'payment_type',
        'company', 'pickup_community_area', 'dropoff_community_area'
    ]

    # allow nan values in these features.
    OPTIONAL_FEATURES = [
        'dropoff_latitude', 'dropoff_longitude', 'pickup_census_tract',
        'dropoff_census_tract', 'company', 'trip_seconds',
        'dropoff_community_area'
    ]

    LABEL_KEY = 'tips'
    FARE_KEY = 'fare'
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    # tf.get_logger().setLevel(logging.ERROR)

    vldn_output = os.path.join(DATA_DIR, 'validation')

    # TODO: Understand why this was used in the conversion to the output json
    # key columns: list of the names for columns that should be treated as unique keys.
    key_columns = ['trip_start_timestamp']

    # read the first line of the csv to get an ordered list of column names
    # (the Schema will scramble the features)
    with open(TRAIN_DATA) as f:
        column_names = f.readline().strip().split(',')

    stats = tfdv.generate_statistics_from_csv(data_location=TRAIN_DATA)
    schema = tfdv.infer_schema(stats)

    eval_stats = tfdv.generate_statistics_from_csv(
        data_location=EVALUATION_DATA)
    anomalies = tfdv.validate_statistics(eval_stats, schema)

    # Log anomalies
    for feature_name, anomaly_info in anomalies.anomaly_info.items():
        logging.getLogger().error('Anomaly in feature "{}": {}'.format(
            feature_name, anomaly_info.description))

    # show inferred schema
    tfdv.display_schema(schema=schema)
    # Resolve anomalies
    company = tfdv.get_feature(schema, 'company')
    company.distribution_constraints.min_domain_mass = 0.9

    # Add new value to the domain of feature payment_type.
    payment_type_domain = tfdv.get_domain(schema, 'payment_type')
    payment_type_domain.value.append('Prcard')

    # Validate eval stats after updating the schema
    updated_anomalies = tfdv.validate_statistics(eval_stats, schema)
    tfdv.display_anomalies(updated_anomalies)

    # -----------------------DATA SAVING START---------------------------------
    if "column_names" in locals():
        _kale_resource_save(column_names,
                            os.path.join(_kale_data_directory, "column_names"))
    else:
        print("_kale_resource_save: `column_names` not found.")
    if "schema" in locals():
        _kale_resource_save(schema, os.path.join(_kale_data_directory,
                                                 "schema"))
    else:
        print("_kale_resource_save: `schema` not found.")
def train(EPOCHS: int, STEPS: int, BATCH_SIZE: int, HIDDEN_LAYER_SIZE: str,
          LEARNING_RATE: float):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/marshal"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0] for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]

    if "trns_output" not in _kale_directory_file_names:
        raise ValueError("trns_output" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "trns_output"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "trns_output" +
                         ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    trns_output = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))
    # -----------------------DATA LOADING END----------------------------------

    import os
    import shutil
    import logging
    import apache_beam as beam
    import tensorflow as tf
    import tensorflow_transform as tft
    import tensorflow_model_analysis as tfma
    import tensorflow_data_validation as tfdv

    from apache_beam.io import textio
    from apache_beam.io import tfrecordio

    from tensorflow_transform.beam import impl as beam_impl
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io
    from tensorflow_transform.coders.csv_coder import CsvCoder
    from tensorflow_transform.coders.example_proto_coder import ExampleProtoCoder
    from tensorflow_transform.tf_metadata import dataset_metadata
    from tensorflow_transform.tf_metadata import metadata_io
    DATA_DIR = 'data/'
    TRAIN_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/train.csv')
    EVALUATION_DATA = os.path.join(DATA_DIR,
                                   'taxi-cab-classification/eval.csv')

    # Categorical features are assumed to each have a maximum value in the dataset.
    MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]
    CATEGORICAL_FEATURE_KEYS = [
        'trip_start_hour', 'trip_start_day', 'trip_start_month'
    ]

    DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']

    # Number of buckets used by tf.transform for encoding each feature.
    FEATURE_BUCKET_COUNT = 10

    BUCKET_FEATURE_KEYS = [
        'pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
        'dropoff_longitude'
    ]

    # Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform
    VOCAB_SIZE = 1000

    # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed.
    OOV_SIZE = 10

    VOCAB_FEATURE_KEYS = [
        'pickup_census_tract', 'dropoff_census_tract', 'payment_type',
        'company', 'pickup_community_area', 'dropoff_community_area'
    ]

    # allow nan values in these features.
    OPTIONAL_FEATURES = [
        'dropoff_latitude', 'dropoff_longitude', 'pickup_census_tract',
        'dropoff_census_tract', 'company', 'trip_seconds',
        'dropoff_community_area'
    ]

    LABEL_KEY = 'tips'
    FARE_KEY = 'fare'
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

    # tf.get_logger().setLevel(logging.ERROR)

    def training_input_fn(transformed_output, transformed_examples, batch_size,
                          target_name):
        """
        Args:
          transformed_output: tft.TFTransformOutput
          transformed_examples: Base filename of examples
          batch_size: Batch size.
          target_name: name of the target column.
        Returns:
          The input function for training or eval.
        """
        dataset = tf.data.experimental.make_batched_features_dataset(
            file_pattern=transformed_examples,
            batch_size=batch_size,
            features=transformed_output.transformed_feature_spec(),
            reader=tf.data.TFRecordDataset,
            shuffle=True)
        transformed_features = dataset.make_one_shot_iterator().get_next()
        transformed_labels = transformed_features.pop(target_name)
        return transformed_features, transformed_labels

    def get_feature_columns():
        """Callback that returns a list of feature columns for building a tf.estimator.
        Returns:
          A list of tf.feature_column.
        """
        return ([
            tf.feature_column.numeric_column(key, shape=())
            for key in DENSE_FLOAT_FEATURE_KEYS
        ] + [
            tf.feature_column.indicator_column(
                tf.feature_column.categorical_column_with_identity(
                    key, num_buckets=VOCAB_SIZE + OOV_SIZE))
            for key in VOCAB_FEATURE_KEYS
        ] + [
            tf.feature_column.indicator_column(
                tf.feature_column.categorical_column_with_identity(
                    key, num_buckets=FEATURE_BUCKET_COUNT, default_value=0))
            for key in BUCKET_FEATURE_KEYS
        ] + [
            tf.feature_column.indicator_column(
                tf.feature_column.categorical_column_with_identity(
                    key, num_buckets=num_buckets, default_value=0))
            for key, num_buckets in zip(CATEGORICAL_FEATURE_KEYS,
                                        MAX_CATEGORICAL_FEATURE_VALUES)
        ])

    training_output = os.path.join(DATA_DIR, "training")
    if os.path.exists(training_output):
        shutil.rmtree(training_output)

    hidden_layer_size = [int(x.strip()) for x in HIDDEN_LAYER_SIZE.split(',')]

    tf_transform_output = tft.TFTransformOutput(trns_output)

    # Set how often to run checkpointing in terms of steps.
    config = tf.estimator.RunConfig(save_checkpoints_steps=1000)
    n_classes = tf_transform_output.vocabulary_size_by_name("vocab_" +
                                                            LABEL_KEY)
    # Create estimator
    estimator = tf.estimator.DNNClassifier(
        feature_columns=get_feature_columns(),
        hidden_units=hidden_layer_size,
        n_classes=n_classes,
        config=config,
        model_dir=training_output)

    # TODO: Simplify all this: https://www.tensorflow.org/guide/premade_estimators
    estimator.train(input_fn=lambda: training_input_fn(
        tf_transform_output, os.path.join(trns_output, 'train' + '*'),
        BATCH_SIZE, "tips"),
                    steps=STEPS)

    # -----------------------DATA SAVING START---------------------------------
    if "tf_transform_output" in locals():
        _kale_resource_save(
            tf_transform_output,
            os.path.join(_kale_data_directory, "tf_transform_output"))
    else:
        print("_kale_resource_save: `tf_transform_output` not found.")
    if "training_input_fn" in locals():
        _kale_resource_save(
            training_input_fn,
            os.path.join(_kale_data_directory, "training_input_fn"))
    else:
        print("_kale_resource_save: `training_input_fn` not found.")
    if "estimator" in locals():
        _kale_resource_save(estimator,
                            os.path.join(_kale_data_directory, "estimator"))
    else:
        print("_kale_resource_save: `estimator` not found.")
    if "trns_output" in locals():
        _kale_resource_save(trns_output,
                            os.path.join(_kale_data_directory, "trns_output"))
    else:
        print("_kale_resource_save: `trns_output` not found.")
def logisticregression(rok_workspace_aidays01_2rlcyd0k8_url: str):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/home/jovyan/examples/titanic-ml-dataset/.titanic_dataset_ml.ipynb.kale.marshal.dir"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    pod_utils.snapshot_pipeline_step(
        "titanic-ml-fylgn", "logisticregression",
        "/home/jovyan/examples/titanic-ml-dataset/titanic_dataset_ml.ipynb")

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0] for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]

    if "train_labels" not in _kale_directory_file_names:
        raise ValueError("train_labels" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "train_labels"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "train_labels" +
                         ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    train_labels = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))

    if "train_df" not in _kale_directory_file_names:
        raise ValueError("train_df" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "train_df"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "train_df" +
                         ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    train_df = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))
    # -----------------------DATA LOADING END----------------------------------

    import numpy as np
    import pandas as pd
    import seaborn as sns
    from matplotlib import pyplot as plt
    from matplotlib import style

    from sklearn import linear_model
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import Perceptron
    from sklearn.linear_model import SGDClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.naive_bayes import GaussianNB

    logreg = LogisticRegression(solver='lbfgs', max_iter=110)
    logreg.fit(train_df, train_labels)
    acc_log = round(logreg.score(train_df, train_labels) * 100, 2)

    # -----------------------DATA SAVING START---------------------------------
    if "acc_log" in locals():
        _kale_resource_save(acc_log,
                            os.path.join(_kale_data_directory, "acc_log"))
    else:
        print("_kale_resource_save: `acc_log` not found.")