parser.add_argument(
    '--data-dir',
    help='location of the training dataset in the local filesystem (will be downloaded if needed)'
)

args = parser.parse_args()

# Initialize SparkSession
conf = SparkConf().setAppName('keras_spark_mnist').set(
    'spark.sql.shuffle.partitions', '16')
if args.master:
    conf.setMaster(args.master)
elif args.num_proc:
    conf.setMaster('local[{}]'.format(args.num_proc))
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Setup our store for intermediate data
store = Store.create(args.work_dir)

# Download MNIST dataset
data_url = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/mnist.bz2'
libsvm_path = os.path.join(args.data_dir, 'mnist.bz2')
if not os.path.exists(libsvm_path):
    subprocess.check_output(['wget', data_url, '-O', libsvm_path])
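    # note: check_output raises CalledProcessError if wget exits with a non-zero status,
    # so a failed download stops the script here instead of training on missing data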

# Load dataset into a Spark DataFrame
df = spark.read.format('libsvm') \
    .option('numFeatures', '784') \
    .load(libsvm_path)

# One-hot encode labels into SparseVectors
encoder = OneHotEncoderEstimator(inputCols=['label'],
                                 outputCols=['label_vec'],
                                 dropLast=False)

def train(
    max_sales: float,
    vocab: Dict[str, List[Any]],
    hp: Hyperparameters,
    work_dir: FlyteDirectory,
    train_df: pyspark.sql.DataFrame,
    working_dir: FlyteDirectory,
):
    print("==============")
    print("Model training")
    print("==============")

    # a metric that computes the root mean square percentage error on the exponentiated predictions
    def exp_rmspe(y_true, y_pred):
        """Competition evaluation metric; expects logarithmic inputs."""
        pct = tf.square((tf.exp(y_true) - tf.exp(y_pred)) / tf.exp(y_true))

        # compute mean excluding stores with zero denominator
        x = tf.reduce_sum(tf.where(y_true > 0.001, pct, tf.zeros_like(pct)))
        y = tf.reduce_sum(
            tf.where(y_true > 0.001, tf.ones_like(pct), tf.zeros_like(pct)))
        return tf.sqrt(x / y)
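    # equivalently, exp_rmspe is RMSPE on the original sales scale,
    # sqrt(mean(((sales - pred) / sales) ** 2)), since exp() undoes the log
    # transform applied to the labels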

    def act_sigmoid_scaled(x):
        """Sigmoid scaled to logarithm of maximum sales scaled by 20%."""
        return tf.nn.sigmoid(x) * tf.math.log(max_sales) * 1.2
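    # act_sigmoid_scaled bounds predictions to (0, 1.2 * log(max_sales)), i.e. up to
    # 20% above the largest log-sales value observed in training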

    # NOTE: exp_rmspe and act_sigmoid_scaled are defined inside train() rather than at module
    # level because act_sigmoid_scaled must close over max_sales (an activation function cannot
    # receive it as an explicit argument). Since both are registered as custom objects, they are
    # kept together in the same scope.

    all_cols = CATEGORICAL_COLS + CONTINUOUS_COLS
    CUSTOM_OBJECTS = {
        "exp_rmspe": exp_rmspe,
        "act_sigmoid_scaled": act_sigmoid_scaled
    }

    # disable GPUs when building the model to prevent memory leaks
    if LooseVersion(tf.__version__) >= LooseVersion("2.0.0"):
        # See https://github.com/tensorflow/tensorflow/issues/33168
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    else:
        K.set_session(
            tf.Session(config=tf.ConfigProto(device_count={"GPU": 0})))

    # build the Keras model
    inputs = {col: Input(shape=(1, ), name=col) for col in all_cols}
    embeddings = [
        Embedding(len(vocab[col]), 10, input_length=1,
                  name="emb_" + col)(inputs[col]) for col in CATEGORICAL_COLS
    ]
    continuous_bn = Concatenate()([
        Reshape((1, 1), name="reshape_" + col)(inputs[col])
        for col in CONTINUOUS_COLS
    ])
    continuous_bn = BatchNormalization()(continuous_bn)
    x = Concatenate()(embeddings + [continuous_bn])
    x = Flatten()(x)
    x = Dense(1000,
              activation="relu",
              kernel_regularizer=tf.keras.regularizers.l2(0.00005))(x)
    x = Dense(1000,
              activation="relu",
              kernel_regularizer=tf.keras.regularizers.l2(0.00005))(x)
    x = Dense(1000,
              activation="relu",
              kernel_regularizer=tf.keras.regularizers.l2(0.00005))(x)
    x = Dense(500,
              activation="relu",
              kernel_regularizer=tf.keras.regularizers.l2(0.00005))(x)
    x = Dropout(0.5)(x)
    # specify element-wise activation
    output = Dense(1, activation=act_sigmoid_scaled)(x)
    model = tf.keras.Model([inputs[f] for f in all_cols], output)
    # display the details of the Keras model
    model.summary()

    opt = tf.keras.optimizers.Adam(lr=hp.learning_rate, epsilon=1e-3)

    # checkpoint callback to specify the options for the returned Keras model
    ckpt_callback = BestModelCheckpoint(monitor="val_loss",
                                        mode="auto",
                                        save_freq="epoch")
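    # with monitor="val_loss", the estimator is expected to hand back the checkpoint with the
    # lowest validation loss rather than simply the last epoch's weights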

    # create a Store object for intermediate data, pointing at the remote work directory
    store = Store.create(work_dir.remote_source)
    # 'SparkBackend' uses `horovod.spark.run` to execute the distributed training function, and
    # returns a list of results by running 'train' on every worker in the cluster
    backend = SparkBackend(
        num_proc=hp.num_proc,
        stdout=sys.stdout,
        stderr=sys.stderr,
        prefix_output_with_timestamp=True,
    )
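    # num_proc sets how many Horovod worker processes (parallel Spark tasks) run the training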
    # define a Spark Estimator that fits Keras models to a DataFrame
    keras_estimator = hvd.KerasEstimator(
        backend=backend,
        store=store,
        model=model,
        optimizer=opt,
        loss="mae",
        metrics=[exp_rmspe],
        custom_objects=CUSTOM_OBJECTS,
        feature_cols=all_cols,
        label_cols=["Sales"],
        validation="Validation",
        batch_size=hp.batch_size,
        epochs=hp.epochs,
        verbose=2,
        checkpoint_callback=ckpt_callback,
    )
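    # validation="Validation" refers to a column in train_df whose non-zero values mark
    # validation rows (a float fraction is the other accepted form)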

    # The Estimator hides the following details:
    # 1. Binding Spark DataFrames to a deep learning training script
    # 2. Reading data into a format that can be interpreted by the training framework
    # 3. Distributed training using Horovod
    # The user provides a Keras model to the `KerasEstimator`; calling `fit` on a Spark
    # DataFrame trains it and returns a Spark Transformer wrapping the trained model
    keras_model = keras_estimator.fit(train_df).setOutputCols(["Sales_output"])
    # retrieve the model training history
    history = keras_model.getHistory()
    best_val_rmspe = min(history["val_exp_rmspe"])
    print("Best RMSPE: %f" % best_val_rmspe)

    # save the trained model
    keras_model.save(os.path.join(working_dir, hp.local_checkpoint_file))
    print("Written checkpoint to %s" %
          os.path.join(working_dir, hp.local_checkpoint_file))
    # the Estimator returns a Transformer representation of the trained model once training is complete
    return keras_model
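
# For illustration only (not part of the original example): the Transformer returned by
# train() could be applied to a held-out DataFrame with the same feature columns, e.g.
#
#     pred_df = keras_model.transform(test_df)   # test_df is a hypothetical DataFrame
#     pred_df.select("Sales", "Sales_output").show(5)
#
# where "Sales_output" matches the setOutputCols(["Sales_output"]) call above.
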
def train_model(args):
    # do not run this test for pytorch-lightning versions below the minimum supported version
    import pytorch_lightning as pl
    if LooseVersion(pl.__version__) < LooseVersion(MIN_PL_VERSION):
        print("Skip test for pytorch_ligthning=={}, min support version is {}".format(pl.__version__, MIN_PL_VERSION))
        return

    # Initialize SparkSession
    conf = SparkConf().setAppName('pytorch_spark_mnist').set('spark.sql.shuffle.partitions', '16')
    if args.master:
        conf.setMaster(args.master)
    elif args.num_proc:
        conf.setMaster('local[{}]'.format(args.num_proc))
    spark = SparkSession.builder.config(conf=conf).getOrCreate()

    # Setup our store for intermediate data
    store = Store.create(args.work_dir)

    # Download MNIST dataset
    data_url = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/mnist.bz2'
    libsvm_path = os.path.join(args.data_dir, 'mnist.bz2')
    if not os.path.exists(libsvm_path):
        subprocess.check_output(['wget', data_url, '-O', libsvm_path])

    # Load dataset into a Spark DataFrame
    df = spark.read.format('libsvm') \
        .option('numFeatures', '784') \
        .load(libsvm_path)

    # One-hot encode labels into SparseVectors
    encoder = OneHotEncoder(inputCols=['label'],
                            outputCols=['label_vec'],
                            dropLast=False)
    model = encoder.fit(df)
    train_df = model.transform(df)

    # Train/test split
    train_df, test_df = train_df.randomSplit([0.9, 0.1])

    # Define the PyTorch model without any Horovod-specific parameters
    class Net(LightningModule):
        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
            self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
            self.conv2_drop = nn.Dropout2d()
            self.fc1 = nn.Linear(320, 50)
            self.fc2 = nn.Linear(50, 10)

        def forward(self, x):
            x = x.float().reshape((-1, 1, 28, 28))
            x = F.relu(F.max_pool2d(self.conv1(x), 2))
            x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
            x = x.view(-1, 320)
            x = F.relu(self.fc1(x))
            x = F.dropout(x, training=self.training)
            x = self.fc2(x)
            return F.log_softmax(x, -1)

        def configure_optimizers(self):
            return optim.SGD(self.parameters(), lr=0.01, momentum=0.5)

        def training_step(self, batch, batch_idx):
            if batch_idx == 0:
                print(f"training data batch size: {batch['label'].shape}")
            x, y = batch['features'], batch['label']
            y_hat = self(x)
            loss = F.nll_loss(y_hat, y.long())
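            # y_hat contains log-probabilities (log_softmax in forward), so nll_loss is
            # equivalent to cross-entropy; labels are cast to long for integer class indices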
            self.log('train_loss', loss)
            return loss

        def validation_step(self, batch, batch_idx):
            if batch_idx == 0:
                print(f"validation data batch size: {batch['label'].shape}")
            x, y = batch['features'], batch['label']
            y_hat = self(x)
            loss = F.nll_loss(y_hat, y.long())
            self.log('val_loss', loss)

        def validation_epoch_end(self, outputs):
            avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() if len(outputs) > 0 else float('inf')
            self.log('avg_val_loss', avg_loss)

    model = Net()

    # Train a Horovod Spark Estimator on the DataFrame
    backend = SparkBackend(num_proc=args.num_proc,
                           stdout=sys.stdout, stderr=sys.stderr,
                           prefix_output_with_timestamp=True)

    from pytorch_lightning.callbacks import Callback

    epochs = args.epochs

    class MyDummyCallback(Callback):
        def __init__(self):
            self.epoch_end_counter = 0
            self.train_epoch_end_counter = 0
            self.validation_epoch_end_counter = 0

        def on_init_start(self, trainer):
            print('Starting to init trainer!')

        def on_init_end(self, trainer):
            print('Trainer is initialized.')

        def on_epoch_end(self, trainer, model):
            print('A train or eval epoch ended.')
            self.epoch_end_counter += 1

        def on_train_epoch_end(self, trainer, model, unused=None):
            print('A train epoch ended.')
            self.train_epoch_end_counter += 1

        def on_validation_epoch_end(self, trainer, model, unused=None):
            print('A val epoch ended.')
            self.validation_epoch_end_counter += 1

        def on_train_end(self, trainer, model):
            print("Training ends:"
                  f"epcoh_end_counter={self.epcoh_end_counter}, "
                  f"train_epcoh_end_counter={self.train_epcoh_end_counter}, "
                  f"validation_epoch_end_counter={self.validation_epoch_end_counter} \n")
            assert self.train_epcoh_end_counter <= epochs
            assert self.epcoh_end_counter == self.train_epcoh_end_counter + self.validation_epoch_end_counter

    callbacks = [MyDummyCallback()]

    # add ModelCheckpoint and EarlyStopping callbacks
    from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
    callbacks.append(ModelCheckpoint(monitor='val_loss', mode="min",
                                     save_top_k=1, verbose=True))

    from pytorch_lightning.callbacks.early_stopping import EarlyStopping
    callbacks.append(EarlyStopping(monitor='val_loss',
                                   min_delta=0.001,
                                   patience=3,
                                   verbose=True,
                                   mode='min'))

    torch_estimator = hvd.TorchEstimator(backend=backend,
                                         store=store,
                                         model=model,
                                         input_shapes=[[-1, 1, 28, 28]],
                                         feature_cols=['features'],
                                         label_cols=['label'],
                                         batch_size=args.batch_size,
                                         epochs=args.epochs,
                                         validation=0.1,
                                         verbose=1,
                                         callbacks=callbacks,
                                         profiler="simple" if args.enable_profiler else None)

    torch_model = torch_estimator.fit(train_df).setOutputCols(['label_prob'])

    # Evaluate the model on the held-out test DataFrame
    pred_df = torch_model.transform(test_df)

    argmax = udf(lambda v: float(np.argmax(v)), returnType=T.DoubleType())
    pred_df = pred_df.withColumn('label_pred', argmax(pred_df.label_prob))
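    # 'label_prob' holds the model's 10-way log-probability vector; argmax reduces it to a
    # single predicted class so the Spark evaluator can compute accuracy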
    evaluator = MulticlassClassificationEvaluator(predictionCol='label_pred', labelCol='label', metricName='accuracy')
    print('Test accuracy:', evaluator.evaluate(pred_df))

    spark.stop()
def train_model(args):
    # do not run this test for pytorch-lightning versions below the minimum supported version
    import pytorch_lightning as pl
    if LooseVersion(pl.__version__) < LooseVersion(MIN_PL_VERSION):
        print("Skip test for pytorch_ligthning=={}, min support version is {}".
              format(pl.__version__, MIN_PL_VERSION))
        return

    # Initialize SparkSession
    conf = SparkConf().setAppName('keras_spark_mnist').set(
        'spark.sql.shuffle.partitions', '16')
    if args.master:
        conf.setMaster(args.master)
    elif args.num_proc:
        conf.setMaster('local[{}]'.format(args.num_proc))
    spark = SparkSession.builder.config(conf=conf).getOrCreate()

    # Setup our store for intermediate data
    store = Store.create(args.work_dir)

    # Download MNIST dataset
    data_url = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/mnist.bz2'
    libsvm_path = os.path.join(args.data_dir, 'mnist.bz2')
    if not os.path.exists(libsvm_path):
        subprocess.check_output(['wget', data_url, '-O', libsvm_path])

    # Load dataset into a Spark DataFrame
    df = spark.read.format('libsvm') \
        .option('numFeatures', '784') \
        .load(libsvm_path)

    # One-hot encode labels into SparseVectors
    encoder = OneHotEncoderEstimator(inputCols=['label'],
                                     outputCols=['label_vec'],
                                     dropLast=False)
    model = encoder.fit(df)
    train_df = model.transform(df)

    # Train/test split
    train_df, test_df = train_df.randomSplit([0.9, 0.1])

    # Define the PyTorch model without any Horovod-specific parameters
    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
            self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
            self.conv2_drop = nn.Dropout2d()
            self.fc1 = nn.Linear(320, 50)
            self.fc2 = nn.Linear(50, 10)

        def forward(self, x):
            x = x.float()
            x = F.relu(F.max_pool2d(self.conv1(x), 2))
            x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
            x = x.view(-1, 320)
            x = F.relu(self.fc1(x))
            x = F.dropout(x, training=self.training)
            x = self.fc2(x)
            return F.log_softmax(x, -1)

    model = Net()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    loss = nn.NLLLoss()
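    # nn.NLLLoss expects integer class indices, but the DataFrame stores 'label' as a double,
    # hence the target.long() cast in the loss lambda passed to the estimator below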

    # Train a Horovod Spark Estimator on the DataFrame
    torch_estimator = hvd.TorchEstimator(
        num_proc=args.num_proc,
        store=store,
        model=model,
        optimizer=optimizer,
        loss=lambda input, target: loss(input, target.long()),
        input_shapes=[[-1, 1, 28, 28]],
        feature_cols=['features'],
        label_cols=['label'],
        batch_size=args.batch_size,
        epochs=args.epochs,
        verbose=1)

    torch_model = torch_estimator.fit(train_df).setOutputCols(['label_prob'])

    # Evaluate the model on the held-out test DataFrame
    pred_df = torch_model.transform(test_df)
    argmax = udf(lambda v: float(np.argmax(v)), returnType=T.DoubleType())
    pred_df = pred_df.withColumn('label_pred', argmax(pred_df.label_prob))
    evaluator = MulticlassClassificationEvaluator(predictionCol='label_pred',
                                                  labelCol='label',
                                                  metricName='accuracy')
    print('Test accuracy:', evaluator.evaluate(pred_df))

    spark.stop()