Example 1
    def test_serialize_deserialize_math_binary(self):
        add_transformer = self._new_add_math_binary()

        file_path = '{}{}'.format(
            'jar:file:', os.path.join(self.tmp_dir, 'math_binary.zip'))

        add_transformer.serializeToBundle(file_path, self.input)
        deserialized_math_binary = SimpleSparkSerializer(
        ).deserializeFromBundle(file_path)
        result = deserialized_math_binary.transform(
            self.input).toPandas()[['add(f1, f2)']]
        assert_frame_equal(self.expected_add, result)
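
The _new_add_math_binary helper is not shown on this page. Judging from the Multiply transformer constructed in Example 3 and the add(f1, f2) output column used above, a minimal sketch (assumed, not copied from the test suite) would be:

    def _new_add_math_binary(self):
        # Assumed counterpart of the Multiply transformer shown in Example 3.
        return MathBinary(
            operation=BinaryOperation.Add,
            inputA="f1",
            inputB="f2",
            outputCol="add(f1, f2)",
        )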
Example 2
def _serialize_to_file(path, df_for_serializing, model):
    if os.path.exists(path):
        os.remove(path)
    path_dir = os.path.dirname(path)
    if not os.path.exists(path_dir):
        os.makedirs(path_dir)
    SimpleSparkSerializer().serializeToBundle(model, _to_file_path(path), df_for_serializing)
Example 3
    def test_serialize_deserialize_pipeline(self):
        add_transformer = self._new_add_math_binary()

        mul_transformer = MathBinary(
            operation=BinaryOperation.Multiply,
            inputA="f1",
            inputB="add(f1, f2)",
            outputCol="mul(f1, add(f1, f2))",
        )

        expected = pd.DataFrame(
            [(float(i * (i + i * 2))) for i in range(1, 10)],
            columns=['mul(f1, add(f1, f2))'],
        )

        pipeline = Pipeline(stages=[add_transformer, mul_transformer])

        pipeline_model = pipeline.fit(self.input)

        file_path = '{}{}'.format(
            'jar:file:', os.path.join(self.tmp_dir,
                                      'math_binary_pipeline.zip'))

        pipeline_model.serializeToBundle(file_path, self.input)
        deserialized_pipeline = SimpleSparkSerializer().deserializeFromBundle(
            file_path)

        result = deserialized_pipeline.transform(
            self.input).toPandas()[['mul(f1, add(f1, f2))']]
        assert_frame_equal(expected, result)
Example 4
def main():
    spark = SparkSession.builder.appName("DBPediaSpark").getOrCreate()

    args = getResolvedOptions(sys.argv, ['S3_INPUT_BUCKET',
                                         'S3_INPUT_KEY_PREFIX',
                                         'S3_OUTPUT_BUCKET',
                                         'S3_OUTPUT_KEY_PREFIX',
                                         'S3_MODEL_BUCKET',
                                         'S3_MODEL_KEY_PREFIX'])

    # This setting is needed so we can save RDDs, which is the only way to write nested DataFrames to CSV
    spark.sparkContext._jsc.hadoopConfiguration().set("mapred.output.committer.class",
                                                      "org.apache.hadoop.mapred.FileOutputCommitter")
    
    # Define the schema for the input data; the input file has no header row
    schema = StructType([StructField("label", IntegerType(), True), 
                         StructField("title", StringType(), True), 
                         StructField("abstract", StringType(), True)])
    
    # Download the data from S3 into two separate Dataframes
    traindf = spark.read.csv(('s3://' + os.path.join(args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'],
                                                   'train.csv')), header=False, schema=schema, encoding='UTF-8')
    validationdf = spark.read.csv(('s3://' + os.path.join(args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'],
                                                          'test.csv')), header=False, schema=schema, encoding='UTF-8')

    # Tokenize the abstract column which contains the input text
    tokenizer = Tokenizer(inputCol="abstract", outputCol="tokenized_abstract")

    # Save transformed training data to CSV in S3 by converting to RDD.
    transformed_traindf = tokenizer.transform(traindf)
    transformed_train_rdd = transformed_traindf.rdd.map(lambda x: (x.label, x.tokenized_abstract))
    lines = transformed_train_rdd.map(csv_line)
    lines.coalesce(1).saveAsTextFile('s3://' + os.path.join(args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'], 'train'))

    # Similar data processing for validation dataset.
    transformed_validation = tokenizer.transform(validationdf)
    transformed_validation_rdd = transformed_validation.rdd.map(lambda x: (x.label, x.tokenized_abstract))
    lines = transformed_validation_rdd.map(csv_line)
    lines.coalesce(1).saveAsTextFile('s3://' + os.path.join(args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'], 'validation'))

    # Serialize the tokenizer via MLeap and upload to S3
    SimpleSparkSerializer().serializeToBundle(tokenizer, "jar:file:/tmp/model.zip", transformed_validation)

    # Unzip as SageMaker expects a .tar.gz file but MLeap produces a .zip file.
    import zipfile
    with zipfile.ZipFile("/tmp/model.zip") as zf:
        zf.extractall("/tmp/model")

    # Write back the content as a .tar.gz file
    import tarfile
    with tarfile.open("/tmp/model.tar.gz", "w:gz") as tar:
        tar.add("/tmp/model/bundle.json", arcname='bundle.json')
        tar.add("/tmp/model/root", arcname='root')

    s3 = boto3.resource('s3')
    file_name = os.path.join(args['S3_MODEL_KEY_PREFIX'], 'model.tar.gz')
    s3.Bucket(args['S3_MODEL_BUCKET']).upload_file('/tmp/model.tar.gz', file_name)
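
Examples 4, 5, 6, 8, 11 and 12 all repeat the same post-processing: extract the MLeap .zip bundle, re-pack bundle.json and root/ into a .tar.gz, and upload it to S3. A consolidated sketch of that recurring pattern (the function name and arguments are illustrative, not part of any example) could look like this:

import os
import tarfile
import zipfile

import boto3


def repackage_bundle_and_upload(zip_path, extract_dir, tar_path, bucket, key):
    # Extract the MLeap .zip bundle produced by serializeToBundle.
    with zipfile.ZipFile(zip_path) as zf:
        zf.extractall(extract_dir)
    # Re-pack bundle.json and root/ into the .tar.gz layout SageMaker expects.
    with tarfile.open(tar_path, "w:gz") as tar:
        tar.add(os.path.join(extract_dir, "bundle.json"), arcname="bundle.json")
        tar.add(os.path.join(extract_dir, "root"), arcname="root")
    # Upload the repackaged model artifact to S3.
    boto3.resource("s3").Bucket(bucket).upload_file(tar_path, key)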
Example 5
def main():
    # Initialize Spark session and variables
    spark = SparkSession.builder.appName("PySparkAbalone").getOrCreate()
    args = getResolvedOptions(sys.argv, [
        'S3_INPUT_BUCKET', 'S3_INPUT_KEY_PREFIX', 'S3_OUTPUT_BUCKET',
        'S3_OUTPUT_KEY_PREFIX', 'S3_MODEL_BUCKET', 'S3_MODEL_KEY_PREFIX'
    ])

    # Saving via RDDs is the only way to write nested DataFrames to CSV
    spark.sparkContext._jsc.hadoopConfiguration().set(
        "mapred.output.committer.class",
        "org.apache.hadoop.mapred.FileOutputCommitter")

    # Defining the schema corresponding to the input data.
    schema = StructType([
        StructField("sex", StringType(), True),
        StructField("length", DoubleType(), True),
        StructField("diameter", DoubleType(), True),
        StructField("height", DoubleType(), True),
        StructField("whole_weight", DoubleType(), True),
        StructField("shucked_weight", DoubleType(), True),
        StructField("viscera_weight", DoubleType(), True),
        StructField("shell_weight", DoubleType(), True),
        StructField("rings", DoubleType(), True)
    ])

    # Downloading the data from S3 into a Dataframe
    s3_path = 's3://' + os.path.join(
        args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'], 'abalone.csv')
    total_df = spark.read.csv(s3_path, header=False, schema=schema)

    # Build a feature preprocessing pipeline for categorical values, one-hot-encoding and vectorization
    cols = [
        "sex_vec", "length", "diameter", "height", "whole_weight",
        "shucked_weight", "viscera_weight", "shell_weight"
    ]
    pipeline = Pipeline(stages=[
        StringIndexer(inputCol='sex', outputCol='indexed_sex'),
        OneHotEncoder(inputCol="indexed_sex", outputCol="sex_vec"),
        VectorAssembler(inputCols=cols, outputCol="features")
    ])

    # Fit the pipeline to the data and split into training and validation sets
    etl = pipeline.fit(total_df)
    transformed_total_df = etl.transform(total_df)
    train_df, val_df = transformed_total_df.randomSplit([0.8, 0.2])

    # Convert train and val sets into RDD, save as CSV and upload to S3
    for df, name in [(train_df, 'train'), (val_df, 'valid')]:
        rdd = df.rdd.map(lambda x: (x.rings, x.features)).map(csv_line)
        rdd.saveAsTextFile('s3://' + os.path.join(
            args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'], name))

    # Serialize ETL pipeline, convert into tar.gz file and store binary using MLeap
    SimpleSparkSerializer().serializeToBundle(etl, "jar:file:/tmp/model.zip",
                                              val_df)
    with zipfile.ZipFile("/tmp/model.zip") as zf:
        zf.extractall("/tmp/model")

    with tarfile.open("/tmp/model.tar.gz", "w:gz") as tar:
        tar.add("/tmp/model/bundle.json", arcname='bundle.json')
        tar.add("/tmp/model/root", arcname='root')

    # Upload the ETL pipeline in tar.gz format to S3 so that it can be used with SageMaker for inference later
    s3 = boto3.resource('s3')
    file_name = os.path.join(args['S3_MODEL_KEY_PREFIX'], 'model.tar.gz')
    s3.Bucket(args['S3_MODEL_BUCKET']).upload_file('/tmp/model.tar.gz',
                                                   file_name)
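
The csv_line helper used in Examples 4, 5, 7, 8 and 12 (and the analogous toCSVLine in Example 6) is not defined on this page. It turns a (label, features) pair into a single CSV row, roughly like this sketch (assumed):

def csv_line(data):
    # data is a (label, features) tuple; append the flattened feature
    # values, comma-separated, after the label.
    features = ','.join(str(value) for value in data[1])
    return str(data[0]) + ',' + features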
Example 6
def main():
    spark = SparkSession.builder.appName("PySparkTitanic").getOrCreate()

    args = getResolvedOptions(
        sys.argv,
        [
            "s3_input_data_location",
            "s3_output_bucket",
            "s3_output_bucket_prefix",
            "s3_model_bucket",
            "s3_model_bucket_prefix",
        ],
    )

    # This setting is needed to write RDDs to files, which is the only way to write nested DataFrames to CSV.
    spark.sparkContext._jsc.hadoopConfiguration().set(
        "mapred.output.committer.class",
        "org.apache.hadoop.mapred.FileOutputCommitter")

    train = spark.read.csv(args["s3_input_data_location"], header=False)

    oldColumns = train.schema.names
    newColumns = [
        "buying", "maint", "doors", "persons", "lug_boot", "safety", "cat"
    ]

    # Rename the positional CSV columns (_c0, _c1, ...) to meaningful names
    train = reduce(
        lambda df, idx: df.withColumnRenamed(oldColumns[idx], newColumns[idx]),
        xrange(len(oldColumns)),
        train,
    )

    # dropping null values
    train = train.dropna()

    # Target label
    catIndexer = StringIndexer(inputCol="cat", outputCol="label")

    labelIndexModel = catIndexer.fit(train)
    train = labelIndexModel.transform(train)

    converter = IndexToString(inputCol="label", outputCol="cat")

    # Splitting into train and test sets. Beware: it sorts the dataset
    (traindf, validationdf) = train.randomSplit([0.8, 0.2])

    # Index labels, adding metadata to the label column.
    # Fit on whole dataset to include all labels in index.
    buyingIndexer = StringIndexer(inputCol="buying", outputCol="indexedBuying")
    maintIndexer = StringIndexer(inputCol="maint", outputCol="indexedMaint")
    doorsIndexer = StringIndexer(inputCol="doors", outputCol="indexedDoors")
    personsIndexer = StringIndexer(inputCol="persons",
                                   outputCol="indexedPersons")
    lug_bootIndexer = StringIndexer(inputCol="lug_boot",
                                    outputCol="indexedLug_boot")
    safetyIndexer = StringIndexer(inputCol="safety", outputCol="indexedSafety")

    # One Hot Encoder on indexed features
    buyingEncoder = OneHotEncoder(inputCol="indexedBuying",
                                  outputCol="buyingVec")
    maintEncoder = OneHotEncoder(inputCol="indexedMaint", outputCol="maintVec")
    doorsEncoder = OneHotEncoder(inputCol="indexedDoors", outputCol="doorsVec")
    personsEncoder = OneHotEncoder(inputCol="indexedPersons",
                                   outputCol="personsVec")
    lug_bootEncoder = OneHotEncoder(inputCol="indexedLug_boot",
                                    outputCol="lug_bootVec")
    safetyEncoder = OneHotEncoder(inputCol="indexedSafety",
                                  outputCol="safetyVec")

    # Create the vector structured data (label,features(vector))
    assembler = VectorAssembler(
        inputCols=[
            "buyingVec", "maintVec", "doorsVec", "personsVec", "lug_bootVec",
            "safetyVec"
        ],
        outputCol="features",
    )

    # Chain featurizers in a Pipeline
    pipeline = Pipeline(stages=[
        buyingIndexer,
        maintIndexer,
        doorsIndexer,
        personsIndexer,
        lug_bootIndexer,
        safetyIndexer,
        buyingEncoder,
        maintEncoder,
        doorsEncoder,
        personsEncoder,
        lug_bootEncoder,
        safetyEncoder,
        assembler,
    ])

    # Train model.  This also runs the indexers.
    model = pipeline.fit(traindf)

    # Delete previous data from output
    s3 = boto3.resource("s3")
    bucket = s3.Bucket(args["s3_output_bucket"])

    bucket.objects.filter(Prefix=args["s3_output_bucket_prefix"]).delete()

    # Save transformed training data to CSV in S3 by converting to RDD.
    transformed_traindf = model.transform(traindf)
    transformed_train_rdd = transformed_traindf.rdd.map(
        lambda x: (x.label, x.features))
    lines = transformed_train_rdd.map(toCSVLine)
    lines.saveAsTextFile("s3a://" + args["s3_output_bucket"] + "/" +
                         args["s3_output_bucket_prefix"] + "/" + "train")

    # Similar data processing for validation dataset.
    predictions = model.transform(validationdf)
    transformed_validation_rdd = predictions.rdd.map(
        lambda x: (x.label, x.features))
    lines = transformed_validation_rdd.map(toCSVLine)
    lines.saveAsTextFile("s3a://" + args["s3_output_bucket"] + "/" +
                         args["s3_output_bucket_prefix"] + "/" + "validation")

    # Serialize and store via MLeap
    SimpleSparkSerializer().serializeToBundle(model, "jar:file:/tmp/model.zip",
                                              predictions)

    # Unzipping as SageMaker expects a .tar.gz file but MLeap produces a .zip file.
    import zipfile

    with zipfile.ZipFile("/tmp/model.zip") as zf:
        zf.extractall("/tmp/model")

    # Writing back the content as a .tar.gz file
    import tarfile

    with tarfile.open("/tmp/model.tar.gz", "w:gz") as tar:
        tar.add("/tmp/model/bundle.json", arcname="bundle.json")
        tar.add("/tmp/model/root", arcname="root")

    s3 = boto3.resource("s3")
    file_name = args["s3_model_bucket_prefix"] + "/" + "model.tar.gz"
    s3.Bucket(args["s3_model_bucket"]).upload_file("/tmp/model.tar.gz",
                                                   file_name)

    os.remove("/tmp/model.zip")
    os.remove("/tmp/model.tar.gz")
    shutil.rmtree("/tmp/model")

    # Save postprocessor
    SimpleSparkSerializer().serializeToBundle(converter,
                                              "jar:file:/tmp/postprocess.zip",
                                              predictions)

    with zipfile.ZipFile("/tmp/postprocess.zip") as zf:
        zf.extractall("/tmp/postprocess")

    # Writing back the content as a .tar.gz file
    import tarfile

    with tarfile.open("/tmp/postprocess.tar.gz", "w:gz") as tar:
        tar.add("/tmp/postprocess/bundle.json", arcname="bundle.json")
        tar.add("/tmp/postprocess/root", arcname="root")

    file_name = args["s3_model_bucket_prefix"] + "/" + "postprocess.tar.gz"
    s3.Bucket(args["s3_model_bucket"]).upload_file("/tmp/postprocess.tar.gz",
                                                   file_name)

    os.remove("/tmp/postprocess.zip")
    os.remove("/tmp/postprocess.tar.gz")
    shutil.rmtree("/tmp/postprocess")
train_lines.saveAsTextFile('s3://{0}/data/preprocessed/train'.format(
    args['S3_BUCKET']))
logger.info('Save train file completed.')

logger.info('Save validation file started...')
# Convert the validation dataframe to RDD to save in CSV format and upload to S3
validation_rdd = validation_df.rdd.map(lambda x:
                                       (x.indexed_breakdown, x.features))
validation_lines = validation_rdd.map(csv_line)
validation_lines.saveAsTextFile('s3://{0}/data/preprocessed/val'.format(
    args['S3_BUCKET']))
logger.info('Save validation file completed.')

# Serialize and store the model via MLeap
timestamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model_filename = '/tmp/model-' + timestamp + '.zip'
SimpleSparkSerializer().serializeToBundle(model, 'jar:file:' + model_filename,
                                          df)

# Unzip the model as SageMaker expects a .tar.gz file but MLeap produces a .zip file
with zipfile.ZipFile(model_filename) as zf:
    zf.extractall("/tmp/model-" + timestamp)

# Write back the content as a .tar.gz file
with tarfile.open("/tmp/model-" + timestamp + ".tar.gz", "w:gz") as tar:
    tar.add("/tmp/model-" + timestamp + "/bundle.json", arcname='bundle.json')
    tar.add("/tmp/model-" + timestamp + "/root", arcname='root')

# Upload the model in tar.gz format to S3 so that it can be used with SageMaker for inference later
s3 = boto3.resource('s3')
s3.Bucket(args['S3_BUCKET']).upload_file('/tmp/model-' + timestamp + '.tar.gz',
                                         'output/sparkml/model.tar.gz')
Example 8
def main():
    spark = SparkSession.builder.appName("PySparkAbalone").getOrCreate()
    
    args = getResolvedOptions(sys.argv, ['S3_INPUT_BUCKET',
                                         'S3_INPUT_KEY_PREFIX',
                                         'S3_OUTPUT_BUCKET',
                                         'S3_OUTPUT_KEY_PREFIX', 
                                         'S3_MODEL_BUCKET',
                                         'S3_MODEL_KEY_PREFIX'])
    
    # This setting is needed so we can save RDDs, which is the only way to write nested DataFrames to CSV
    spark.sparkContext._jsc.hadoopConfiguration().set("mapred.output.committer.class",
                                                      "org.apache.hadoop.mapred.FileOutputCommitter")
    
    # Define the schema for the input data; the input file has no header row
    schema = StructType([StructField("sex", StringType(), True), 
                         StructField("length", DoubleType(), True),
                         StructField("diameter", DoubleType(), True),
                         StructField("height", DoubleType(), True),
                         StructField("whole_weight", DoubleType(), True),
                         StructField("shucked_weight", DoubleType(), True),
                         StructField("viscera_weight", DoubleType(), True), 
                         StructField("shell_weight", DoubleType(), True), 
                         StructField("rings", DoubleType(), True)])

    # Downloading the data from S3 into a Dataframe
    total_df = spark.read.csv(('s3://' + os.path.join(args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'],
                                                   'abalone.csv')), header=False, schema=schema)

    # StringIndexer on the sex column, which holds categorical values
    sex_indexer = StringIndexer(inputCol="sex", outputCol="indexed_sex")
    
    # One-hot encode the string-indexed sex column (indexed_sex)
    sex_encoder = OneHotEncoder(inputCol="indexed_sex", outputCol="sex_vec")

    # VectorAssembler combines all the features into a single vector so they can easily be saved in CSV format
    assembler = VectorAssembler(inputCols=["sex_vec", 
                                           "length", 
                                           "diameter", 
                                           "height", 
                                           "whole_weight", 
                                           "shucked_weight", 
                                           "viscera_weight", 
                                           "shell_weight"], 
                                outputCol="features")
    
    # The pipeline comprises the steps added above
    pipeline = Pipeline(stages=[sex_indexer, sex_encoder, assembler])
    
    # This step trains the feature transformers. We need to serialize this model with MLeap and save to S3
    model = pipeline.fit(total_df)
    
    # This step transforms the dataset with information obtained from the previous fit
    transformed_total_df = model.transform(total_df)
    
    # Split the overall dataset into 80-20 training and validation
    (train_df, validation_df) = transformed_total_df.randomSplit([0.8, 0.2])
    
    # Convert the train dataframe to RDD to save in CSV format and upload to S3
    train_rdd = train_df.rdd.map(lambda x: (x.rings, x.features))
    train_lines = train_rdd.map(csv_line)
    train_lines.saveAsTextFile('s3://' + os.path.join(args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'], 'train'))
    
    # Convert the validation dataframe to RDD to save in CSV format and upload to S3
    validation_rdd = validation_df.rdd.map(lambda x: (x.rings, x.features))
    validation_lines = validation_rdd.map(csv_line)
    validation_lines.saveAsTextFile('s3://' + os.path.join(args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'], 'validation'))

    # Serialize and store the model via MLeap  
    SimpleSparkSerializer().serializeToBundle(model, "jar:file:/tmp/model.zip", validation_df)

    # Unzip the model as SageMaker expects a .tar.gz file but MLeap produces a .zip file
    import zipfile
    with zipfile.ZipFile("/tmp/model.zip") as zf:
        zf.extractall("/tmp/model")

    # Write back the content as a .tar.gz file
    import tarfile
    with tarfile.open("/tmp/model.tar.gz", "w:gz") as tar:
        tar.add("/tmp/model/bundle.json", arcname='bundle.json')
        tar.add("/tmp/model/root", arcname='root')
    
    # Upload the model in tar.gz format to S3 so that it can be used with SageMaker for inference later
    s3 = boto3.resource('s3') 
    file_name = os.path.join(args['S3_MODEL_KEY_PREFIX'], 'model.tar.gz')
    s3.Bucket(args['S3_MODEL_BUCKET']).upload_file('/tmp/model.tar.gz', file_name)
Example 9
def _deserialize_from_file(path):
    return SimpleSparkSerializer().deserializeFromBundle(_to_file_path(path))
Example 10
def _serialize_to_file(model, df_for_serializing):
    jar_file_path = _to_jar_file_path(
        os.path.join(tempfile.mkdtemp(), 'test_serialize_to_bundle-pipeline.zip'))
    SimpleSparkSerializer().serializeToBundle(model, jar_file_path, df_for_serializing)
    return jar_file_path
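
The _to_jar_file_path helper above (and _to_file_path in Examples 2 and 9) is also omitted. Given that every other example addresses MLeap bundles through jar:file: URIs, a plausible sketch is simply:

def _to_jar_file_path(path):
    # Assumed: prefix a local filesystem path with the jar:file: scheme
    # expected by serializeToBundle and deserializeFromBundle.
    return 'jar:file:' + path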
Example 11
def main():
    spark = SparkSession.builder.appName("AbcHeadlinesSpark").getOrCreate()

    # getResolvedOptions(args, options=<argument names to retrieve>) gives access to the arguments passed to the SparkML script when running a job

    args = getResolvedOptions(sys.argv, [
        'S3_INPUT_BUCKET', 'S3_INPUT_KEY_PREFIX', 'S3_INPUT_FILENAME',
        'S3_OUTPUT_BUCKET', 'S3_OUTPUT_KEY_PREFIX', 'S3_MODEL_BUCKET',
        'S3_MODEL_KEY_PREFIX'
    ])

    # Read the CSV file containing the ABC news headlines (with a header row)
    abcnewsdf = spark.read.option("header", "true").csv(
        ('s3://' +
         os.path.join(args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'],
                      args['S3_INPUT_FILENAME'])))

    # The dataset contains roughly 1,103,663 headlines; keep only 10% of them
    hdl_cnt = abcnewsdf.count()
    hdl_fil_cnt = int(hdl_cnt * .1)
    abcnewsdf = abcnewsdf.limit(hdl_fil_cnt)

    #Create features from text

    #Tokenizer
    tok = Tokenizer(inputCol="headline_text", outputCol="words")

    # stop words
    swr = StopWordsRemover(inputCol="words", outputCol="filtered")

    # Term frequency
    ctv = CountVectorizer(inputCol="filtered",
                          outputCol="tf",
                          vocabSize=200,
                          minDF=2)

    # IDF down-weights terms that appear across many headlines in the corpus;
    # words unique to a headline get more weight since they characterize it
    idf = IDF(inputCol="tf", outputCol="features")

    # Build the pipeline
    news_pl = Pipeline(stages=[tok, swr, ctv, idf])

    #Transformed dataset
    news_pl_fit = news_pl.fit(abcnewsdf)
    news_ftrs_df = news_pl_fit.transform(abcnewsdf)

    gen_str_udf = F.udf(gen_str, StringType())

    # Convert the sparse feature vector into a dense string representation
    news_formatted = news_ftrs_df.withColumn(
        'result', gen_str_udf(news_ftrs_df.features))

    #Save the Dense vector to csv file
    news_save = news_formatted.select("result")
    news_save.write.option("delimiter", "\t").mode("append").csv(
        's3://' +
        os.path.join(args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX']))

    #Save the vocabulary file
    vocab_list = news_pl_fit.stages[2].vocabulary
    vocab_df = spark.createDataFrame(vocab_list, StringType())

    vocab_df = vocab_df.coalesce(1)

    vocab_df.write.option(
        "delimiter",
        "\n").format("text").mode("append").save('s3://' + os.path.join(
            args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX']))

    # Serialize the tokenizer via MLeap and upload to S3
    SimpleSparkSerializer().serializeToBundle(news_pl_fit,
                                              "jar:file:/tmp/model.zip",
                                              news_ftrs_df)

    # Unzip as SageMaker expects a .tar.gz file but MLeap produces a .zip file.
    import zipfile
    with zipfile.ZipFile("/tmp/model.zip") as zf:
        zf.extractall("/tmp/model")

    # Write back the content as a .tar.gz file
    import tarfile
    with tarfile.open("/tmp/model.tar.gz", "w:gz") as tar:
        tar.add("/tmp/model/bundle.json", arcname='bundle.json')
        tar.add("/tmp/model/root", arcname='root')

    s3 = boto3.resource('s3')
    file_name = os.path.join(args['S3_MODEL_KEY_PREFIX'], 'model.tar.gz')
    s3.Bucket(args['S3_MODEL_BUCKET']).upload_file('/tmp/model.tar.gz',
                                                   file_name)
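
gen_str is not defined on this page. Based on the comments around it (converting the sparse feature vector for the CSV output), it presumably densifies the TF-IDF vector and renders it as a delimited string, something like:

def gen_str(vector):
    # Assumed: expand the sparse TF-IDF vector to its dense form and join
    # the values into a single comma-separated string for the CSV output.
    return ','.join(str(value) for value in vector.toArray())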
Example 12
def main():
    spark = SparkSession.builder.appName("churn-analytics").getOrCreate()
    
    args = getResolvedOptions(sys.argv, ['S3_INPUT_BUCKET',
                                         'S3_INPUT_KEY_PREFIX',
                                         'S3_OUTPUT_BUCKET',
                                         'S3_OUTPUT_KEY_PREFIX', 
                                         'S3_MODEL_BUCKET',
                                         'S3_MODEL_KEY_PREFIX'])
    
    # This setting is needed so we can save RDDs, which is the only way to write nested DataFrames to CSV
    spark.sparkContext._jsc.hadoopConfiguration().set("mapred.output.committer.class",
                                                      "org.apache.hadoop.mapred.FileOutputCommitter")

    # Define the schema corresponding to the input data
    callstats_schema = StructType([StructField('State', StringType(), True),
                                   StructField('AccountLength', IntegerType(), True),
                                   StructField('AreaCode', IntegerType(), True),
                                   StructField('Phone', StringType(), True),
                                   StructField('IntlPlan', StringType(), True),
                                   StructField('VMailPlan', StringType(), True),
                                   StructField('VMailMessage', IntegerType(), True),
                                   StructField('DayMins', FloatType(), True),
                                   StructField('DayCalls', IntegerType(), True),
                                   StructField('DayCharge', FloatType(), True),
                                   StructField('EveMins', FloatType(), True),
                                   StructField('EveCalls', IntegerType(), True),
                                   StructField('EveCharge', FloatType(), True),           
                                   StructField('NightMins', FloatType(), True),
                                   StructField('NightCalls', IntegerType(), True),
                                   StructField('NightCharge', FloatType(), True),           
                                   StructField('IntlMins', FloatType(), True), 
                                   StructField('IntlCalls', IntegerType(), True),  
                                   StructField('IntlCharge', FloatType(), True),  
                                   StructField('CustServCalls', IntegerType(), True),
                                   StructField('Churn?', StringType(), True)])

    # Downloading the data from S3 into a Dataframe
    raw_df = spark.read.csv(('s3://' + os.path.join(args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'],
                                   'churn.csv')), header=True, schema=callstats_schema)
    
    categoricalColumns = ["State", "AreaCode", "IntlPlan", "VMailPlan"]
    stages = [] # stages in our Pipeline

    for categoricalCol in categoricalColumns:
        idxName = categoricalCol+"Idx"
        stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=idxName)
        catVec = categoricalCol+"Vec"
        encoder = OneHotEncoder(inputCol=idxName, outputCol=catVec, dropLast=False)
        stages += [stringIndexer, encoder]
        
    numericCols = ["AccountLength","VMailMessage","DayMins","DayCalls","EveMins","EveCalls","NightMins",
                   "NightCalls","IntlMins","IntlCalls", "CustServCalls"]

    assemblerInputs = numericCols+[c + "Vec" for c in categoricalColumns]
    assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
    stages += [assembler]
    
    mlPipeline = Pipeline().setStages(stages)
    pipelineModel = mlPipeline.fit(raw_df)
    dataset = pipelineModel.transform(raw_df).select(
        '*', col('Churn?').contains('True').cast('integer').alias('labels'))
    
    # Split the overall dataset into 80-20 training and validation
    (train_df, test_df) = dataset.randomSplit([0.8, 0.2])

    # Convert the train dataframe to RDD to save in CSV format and upload to S3
    train_rdd = train_df.rdd.map(lambda r: (r.labels, r.features))
    train_lines = train_rdd.map(csv_line)
    train_lines.saveAsTextFile('s3://' + os.path.join(args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'], 'train'))
    
    # Convert the validation dataframe to RDD to save in CSV format and upload to S3
    test_rdd = test_df.rdd.map(lambda r: (r.labels, r.features))
    test_lines = test_rdd.map(csv_line)
    test_lines.saveAsTextFile('s3://' + os.path.join(args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'], 'test'))
    
    # Serialize and store the model via MLeap  
    SimpleSparkSerializer().serializeToBundle(pipelineModel, "jar:file:/tmp/model.zip", test_df)
    
    # Unzip the model as SageMaker expects a .tar.gz file but MLeap produces a .zip file
    import zipfile
    with zipfile.ZipFile("/tmp/model.zip") as zf:
        zf.extractall("/tmp/model")

    # Write back the content as a .tar.gz file
    import tarfile
    with tarfile.open("/tmp/model.tar.gz", "w:gz") as tar:
        tar.add("/tmp/model/bundle.json", arcname='bundle.json')
        tar.add("/tmp/model/root", arcname='root')
    
    # Upload the model in tar.gz format to S3 so that it can be used with SageMaker for inference later
    s3 = boto3.resource('s3') 
    file_name = os.path.join(args['S3_MODEL_KEY_PREFIX'], 'model.tar.gz')
    s3.Bucket(args['S3_MODEL_BUCKET']).upload_file('/tmp/model.tar.gz', file_name)
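
For completeness, the serialized churn pipeline can later be restored the same way Examples 1, 3 and 9 do it. A minimal sketch, assuming the bundle written above is still at /tmp/model.zip and a DataFrame such as test_df is in scope:

deserialized_pipeline = SimpleSparkSerializer().deserializeFromBundle(
    "jar:file:/tmp/model.zip")
scored_df = deserialized_pipeline.transform(test_df)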