Example 1
    def test_model_onehot_encoder(self):
        encoder = OneHotEncoderEstimator(inputCols=['index'],
                                         outputCols=['indexVec'])
        data = self.spark.createDataFrame([(0.0, ), (1.0, ), (2.0, ), (2.0, ),
                                           (0.0, ), (2.0, )], ['index'])
        model = encoder.fit(data)
        model_onnx = convert_sparkml(model, 'Sparkml OneHotEncoder',
                                     [('index', FloatTensorType([1, 1]))])
        self.assertTrue(model_onnx is not None)
        self.assertTrue(model_onnx.graph.node is not None)
        # run the model
        predicted = model.transform(data)
        data_np = data.select("index").toPandas().values.astype(numpy.float32)
        predicted_np = predicted.select("indexVec").toPandas().indexVec.apply(
            lambda x: x.toArray().tolist()).values
        expected = numpy.asarray(
            [x + [0] if numpy.amax(x) == 1 else x + [1] for x in predicted_np])

        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlOneHotEncoder")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['indexVec'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example 2
def encoding2(df, incol, outcol):

    encoder = OneHotEncoderEstimator(inputCols=[incol], outputCols=[outcol])
    encoder = encoder.fit(df)
    df = encoder.transform(df)

    return df  #, encoder
def preprocess(data_frame):
    category_columns = [
        "workclass", "education", "marital_status", "occupation",
        "relationship", "race", "sex", "native_country", "income"
    ]
    index_columns = [col + "_index" for col in category_columns]
    vec_columns = [col + "_vec" for col in category_columns]
    for col in category_columns:
        stringIndexer = StringIndexer(inputCol=col,
                                      outputCol=col + "_index",
                                      handleInvalid='error')
        model = stringIndexer.fit(data_frame)
        data_frame = model.transform(data_frame)
        data_frame = data_frame.drop(col)
    index_columns.pop(-1)
    vec_columns.pop(-1)

    ohe = OneHotEncoderEstimator(inputCols=index_columns,
                                 outputCols=vec_columns)
    ohe_model = ohe.fit(data_frame)
    ohe_df = ohe_model.transform(data_frame)
    ohe_df = ohe_df.drop(*index_columns)
    # ohe_df.show()
    cols = ohe_df.columns
    cols.remove("income_index")
    vector_assembler = VectorAssembler(inputCols=cols, outputCol="features")
    vdata_frame = vector_assembler.transform(ohe_df)
    vdata_frame = vdata_frame.drop(*cols)
    # vdata_frame.show()
    print("Preprocess input data correctly.")
    return vdata_frame
Example 4
def encoding(i, df, col):

    encoder = OneHotEncoderEstimator(inputCols=[col],
                                     outputCols=["p" + str(i)])
    encoder = encoder.fit(df)
    df = encoder.transform(df)

    return df  #, encoder
Example 5
def OneHotEncode(df, columns):
    from pyspark.ml.feature import OneHotEncoderEstimator
    df1 = df
    for col in columns:
        encoder = OneHotEncoderEstimator(inputCols=[col + '_Index'], outputCols=[col+'_Vector'])
        df1 = encoder.fit(df1).transform(df1).drop(col + '_Index')
    print('---> ', type(df1[col+'_Vector']), df1.columns, df1.take(1))
    return df1
Example 6
def oneHotEncoderExample(movieSamples):
    samplesWithIdNumber = movieSamples.withColumn(
        "movieIdNumber",
        F.col("movieId").cast(IntegerType()))
    encoder = OneHotEncoderEstimator(inputCols=["movieIdNumber"],
                                     outputCols=['movieIdVector'],
                                     dropLast=False)
    oneHotEncoderSamples = encoder.fit(samplesWithIdNumber).transform(
        samplesWithIdNumber)
    oneHotEncoderSamples.printSchema()
    oneHotEncoderSamples.show(10)
    def onehot(self):
        from pyspark.ml.feature import OneHotEncoderEstimator

        df = self.session.createDataFrame([(0.0, 1.0), (1.0, 0.0), (2.0, 1.0),
                                           (0.0, 2.0), (0.0, 1.0), (2.0, 0.0)],
                                          ["categoryIndex1", "categoryIndex2"])

        encoder = OneHotEncoderEstimator(
            inputCols=["categoryIndex1", "categoryIndex2"],
            outputCols=["categoryVec1", "categoryVec2"])
        model = encoder.fit(df)
        encoded = model.transform(df)
        encoded.show()
def main(train_data_folder, model_path, onehot_path):

    #Starting session
    spark = SparkSession.builder.appName('BigDataML').getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")

    #Loading data
    data = spark.read.parquet(train_data_folder)
    data = data.dropna(how='any')

    encoder = OneHotEncoderEstimator(inputCols=["station_index"],
                                     outputCols=["station_vector"])
    encoder_model = encoder.fit(data)
    data = encoder_model.transform(data)
    encoder_model.write().overwrite().save(onehot_path)
def get_df(reps=REPS):
    def make_rows(i):
        f = float(i)
        return [
            ("assert", Vectors.sparse(3, [0, 1, 2], [1, 2, 3]), 0,
             Vectors.dense([f + 0.1, f + 1.1])),
            ("require", Vectors.sparse(3, {1: 2}), 1,
             Vectors.dense([f + 2.2, f + 3.2])),
        ]

    rows = []
    for i in range(reps):
        rows.extend(make_rows(i))

    df = sc.parallelize(rows).toDF(["word", "vector", "more", "vorpal"])
    ohe = OneHotEncoderEstimator(inputCols=['more'], outputCols=['more__ohe'])
    return ohe.fit(df).transform(df)
def encode_using_one_hot(df, column_name):
    '''
    Transforms a df at a particular column_name by converting each unique category into a one-hot vector.
    i.e. if the values in the column are {a, b, b, c, d} => {<1.0,0,0>, <0,1.0,0>, ...} (good for non-ordinal categories)
    '''
    indexed_name = 'index_'+column_name
    vectored_name = 'vec_'+column_name

    df = StringIndexer(inputCol=column_name, outputCol=indexed_name,
                       handleInvalid="skip").fit(df).transform(df)

    encoder = OneHotEncoderEstimator(
        inputCols=[indexed_name], outputCols=[vectored_name])
    model = encoder.fit(df)
    df = model.transform(df)
    df = df.drop(indexed_name)
    df = df.drop(column_name)
    df = df.withColumnRenamed(vectored_name, column_name)
    return df
    def test_model_onehot_encoder(self):
        import numpy
        encoder = OneHotEncoderEstimator(inputCols=['index'],
                                         outputCols=['indexVec'])
        data = self.spark.createDataFrame([(0.0, ), (1.0, ), (2.0, ), (2.0, ),
                                           (0.0, ), (2.0, )], ['index'])
        model = encoder.fit(data)
        model_onnx = convert_sparkml(model, 'Sparkml OneHotEncoder',
                                     [('index', FloatTensorType([1, 1]))])
        self.assertTrue(model_onnx is not None)
        self.assertTrue(model_onnx.graph.node is not None)
        # run the model
        predicted = model.transform(data)
        data_np = data.select("index").toPandas().values.astype(numpy.float32)
        predicted_np = predicted.select("indexVec").toPandas().indexVec.apply(
            lambda x: x.toArray().tolist()).values
        predicted_shifted = numpy.asarray(
            [x + [0] if numpy.amax(x) == 1 else x + [1] for x in predicted_np])

        dump_data_and_sparkml_model(data_np,
                                    predicted_shifted,
                                    model,
                                    model_onnx,
                                    basename="SparkmlOneHotEncoder")
def train_model(args):
    # do not run this test for pytorch lightning below the min supported version
    import pytorch_lightning as pl
    if LooseVersion(pl.__version__) < LooseVersion(MIN_PL_VERSION):
        print("Skip test for pytorch_ligthning=={}, min support version is {}".format(pl.__version__, MIN_PL_VERSION))
        return

    # Initialize SparkSession
    conf = SparkConf().setAppName('pytorch_spark_mnist').set('spark.sql.shuffle.partitions', '16')
    if args.master:
        conf.setMaster(args.master)
    elif args.num_proc:
        conf.setMaster('local[{}]'.format(args.num_proc))
    spark = SparkSession.builder.config(conf=conf).getOrCreate()

    # Setup our store for intermediate data
    store = Store.create(args.work_dir)

    # Download MNIST dataset
    data_url = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/mnist.bz2'
    libsvm_path = os.path.join(args.data_dir, 'mnist.bz2')
    if not os.path.exists(libsvm_path):
        subprocess.check_output(['wget', data_url, '-O', libsvm_path])

    # Load dataset into a Spark DataFrame
    df = spark.read.format('libsvm') \
        .option('numFeatures', '784') \
        .load(libsvm_path)

    # One-hot encode labels into SparseVectors
    encoder = OneHotEncoder(inputCols=['label'],
                            outputCols=['label_vec'],
                            dropLast=False)
    model = encoder.fit(df)
    train_df = model.transform(df)

    # Train/test split
    train_df, test_df = train_df.randomSplit([0.9, 0.1])

    # Define the PyTorch model without any Horovod-specific parameters
    class Net(LightningModule):
        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
            self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
            self.conv2_drop = nn.Dropout2d()
            self.fc1 = nn.Linear(320, 50)
            self.fc2 = nn.Linear(50, 10)

        def forward(self, x):
            x = x.float().reshape((-1, 1, 28, 28))
            x = F.relu(F.max_pool2d(self.conv1(x), 2))
            x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
            x = x.view(-1, 320)
            x = F.relu(self.fc1(x))
            x = F.dropout(x, training=self.training)
            x = self.fc2(x)
            return F.log_softmax(x, -1)

        def configure_optimizers(self):
            return optim.SGD(self.parameters(), lr=0.01, momentum=0.5)

        def training_step(self, batch, batch_idx):
            if batch_idx == 0:
                print(f"training data batch size: {batch['label'].shape}")
            x, y = batch['features'], batch['label']
            y_hat = self(x)
            loss = F.nll_loss(y_hat, y.long())
            self.log('train_loss', loss)
            return loss

        def validation_step(self, batch, batch_idx):
            if batch_idx == 0:
                print(f"validation data batch size: {batch['label'].shape}")
            x, y = batch['features'], batch['label']
            y_hat = self(x)
            loss = F.nll_loss(y_hat, y.long())
            self.log('val_loss', loss)

        def validation_epoch_end(self, outputs):
            avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() if len(outputs) > 0 else float('inf')
            self.log('avg_val_loss', avg_loss)

    model = Net()

    # Train a Horovod Spark Estimator on the DataFrame
    backend = SparkBackend(num_proc=args.num_proc,
                           stdout=sys.stdout, stderr=sys.stderr,
                           prefix_output_with_timestamp=True)

    from pytorch_lightning.callbacks import Callback

    epochs = args.epochs

    class MyDummyCallback(Callback):
        def __init__(self):
            self.epcoh_end_counter = 0
            self.train_epcoh_end_counter = 0
            self.validation_epoch_end_counter = 0

        def on_init_start(self, trainer):
            print('Starting to init trainer!')

        def on_init_end(self, trainer):
            print('Trainer is initialized.')

        def on_epoch_end(self, trainer, model):
            print('A train or eval epoch ended.')
            self.epcoh_end_counter += 1

        def on_train_epoch_end(self, trainer, model, unused=None):
            print('A train epoch ended.')
            self.train_epcoh_end_counter += 1

        def on_validation_epoch_end(self, trainer, model, unused=None):
            print('A val epoch ended.')
            self.validation_epoch_end_counter += 1

        def on_train_end(self, trainer, model):
            print("Training ends:"
                  f"epcoh_end_counter={self.epcoh_end_counter}, "
                  f"train_epcoh_end_counter={self.train_epcoh_end_counter}, "
                  f"validation_epoch_end_counter={self.validation_epoch_end_counter} \n")
            assert self.train_epcoh_end_counter <= epochs
            assert self.epcoh_end_counter == self.train_epcoh_end_counter + self.validation_epoch_end_counter

    callbacks = [MyDummyCallback()]

    # added EarlyStopping and ModelCheckpoint
    from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
    callbacks.append(ModelCheckpoint(monitor='val_loss', mode="min",
                                     save_top_k=1, verbose=True))

    from pytorch_lightning.callbacks.early_stopping import EarlyStopping
    callbacks.append(EarlyStopping(monitor='val_loss',
                                   min_delta=0.001,
                                   patience=3,
                                   verbose=True,
                                   mode='min'))

    torch_estimator = hvd.TorchEstimator(backend=backend,
                                         store=store,
                                         model=model,
                                         input_shapes=[[-1, 1, 28, 28]],
                                         feature_cols=['features'],
                                         label_cols=['label'],
                                         batch_size=args.batch_size,
                                         epochs=args.epochs,
                                         validation=0.1,
                                         verbose=1,
                                         callbacks=callbacks,
                                         profiler="simple" if args.enable_profiler else None)

    torch_model = torch_estimator.fit(train_df).setOutputCols(['label_prob'])

    # Evaluate the model on the held-out test DataFrame
    pred_df = torch_model.transform(test_df)

    argmax = udf(lambda v: float(np.argmax(v)), returnType=T.DoubleType())
    pred_df = pred_df.withColumn('label_pred', argmax(pred_df.label_prob))
    evaluator = MulticlassClassificationEvaluator(predictionCol='label_pred', labelCol='label', metricName='accuracy')
    print('Test accuracy:', evaluator.evaluate(pred_df))

    spark.stop()
data = data.select('Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Var6', 'Var7',
                   'Var8', 'NVVar1', 'NVVar2', 'NVVar3', 'NVVar4',
                   'Cat1_index', 'Cat3_index', 'Cat6_index', 'Cat8_index',
                   'Cat9_index', 'Cat10_index', 'Cat11_index', 'Cat12_index',
                   'Calendar_Year', 'Model_Year', 'Claim_Amount')
category = [
    'Cat1_index', 'Cat3_index', 'Cat6_index', 'Cat8_index', 'Cat9_index',
    'Cat10_index', 'Cat11_index', 'Cat12_index'
]
new_cat = []
for col in category:
    name = col.replace('_index', '_vec')
    new_cat.append(name)
    num_column.append(name)
encoder = OneHotEncoderEstimator(inputCols=category, outputCols=new_cat)
one_model = encoder.fit(data)
data = one_model.transform(data)

#question 1.3 handle the unbalanced data
data = data.withColumn('weight',
                       when((data['Claim_Amount'] != 0), 0.99).otherwise(0.01))
#allocate label for each row
data = data.withColumn('type',
                       when((data['Claim_Amount'] != 0), 1).otherwise(0))

#feature choose and PCA
data = data.select('Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Var6', 'Var7',
                   'Var8', 'NVVar1', 'NVVar2', 'NVVar3', 'NVVar4', 'Cat1_vec',
                   'Cat3_vec', 'Cat6_vec', 'Cat8_vec', 'Cat9_vec', 'Cat10_vec',
                   'Cat11_vec', 'Cat12_vec', 'Calendar_Year', 'Model_Year',
                   'Claim_Amount', 'weight', 'type')
Example 14
# LGA — La Guardia Airport (New York)
# SMF — Sacramento
# SJC — San Jose
# TUS — Tucson International Airport
# OGG — Kahului (Hawaii)
# Obviously this is only a small subset of airports. Nevertheless, since this is a categorical variable, it needs to be one-hot encoded before it can be used in a regression model.

# The data are in a variable called flights. You have already used a string indexer to create a column of indexed values corresponding to the strings in org.

# Note: You might find it useful to revise the slides from the lessons in the Slides panel next to the IPython Shell.

# Instructions
# Import the one-hot encoder class.
# Create a one-hot encoder instance, naming the output column 'org_dummy'.
# Apply the one-hot encoder to the flights data.
# Generate a summary of the mapping from categorical values to binary encoded dummy variables. Include only unique values and order by org_idx.


# Import the one hot encoder class
from pyspark.ml.feature import OneHotEncoderEstimator

# Create an instance of the one hot encoder
onehot = OneHotEncoderEstimator(inputCols=['org_idx'], outputCols=['org_dummy'])

# Apply the one hot encoder to the flights data
onehot = onehot.fit(flights)
flights_onehot = onehot.transform(flights)

# Check the results
flights_onehot.select('org', 'org_idx', 'org_dummy').distinct().sort('org_idx').show()
Example 15
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol="color", outputCol="color_indexed")

#Next we call the fit() method to initiate the learning process.
indexer_model = indexer.fit(data)

indexed_data = indexer_model.transform(data)
# to view the data
indexed_data.show()

###########################################One Hot Encoding#######################################

from pyspark.ml.feature import OneHotEncoderEstimator
ohe = OneHotEncoderEstimator(inputCols=["color_indexed"],
                             outputCols=["color_ohe"])
ohe_model = ohe.fit(indexed_data)
encoded_data = ohe_model.transform(indexed_data)
encoded_data.show()

############################ Feature Scaling #####################################
'''
This diversity in scale can cause problems in some machine learning algorithms, e.g. KMeans,
because the algorithm may treat some variables as more dominant simply due to their value range.
For example, consider a dataset about employees: a years-of-experience column may range between 0 and 30,
while a salary column holds values in the thousands, but that does not mean the salary column is more important.
To solve this problem we transform the values onto the same scale. There are many transformation methods;
we will look at two of them.
Note that scalers are applied to vector data types, which is why we first need to collect the features with a VectorAssembler:
'''
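A minimal sketch of that assemble-then-scale step, assuming a DataFrame `df` with hypothetical numeric columns `experience` and `salary`:

from pyspark.ml.feature import VectorAssembler, StandardScaler

# Collect the numeric columns into a single vector column first
assembler = VectorAssembler(inputCols=["experience", "salary"],
                            outputCol="features_raw")
assembled = assembler.transform(df)

# Standardize each feature to zero mean and unit variance
scaler = StandardScaler(inputCol="features_raw", outputCol="features_scaled",
                        withMean=True, withStd=True)
scaled = scaler.fit(assembled).transform(assembled)
scaled.select("features_raw", "features_scaled").show(truncate=False)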
from pyspark.ml.feature import VectorAssembler
# $example on$
from pyspark.ml.feature import OneHotEncoderEstimator
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("OneHotEncoderEstimatorExample")\
        .getOrCreate()

    # Note: categorical features are usually first encoded with StringIndexer
    # $example on$
    df = spark.createDataFrame([
        (0.0, 1.0),
        (1.0, 0.0),
        (2.0, 1.0),
        (0.0, 2.0),
        (0.0, 1.0),
        (2.0, 0.0)
    ], ["categoryIndex1", "categoryIndex2"])

    encoder = OneHotEncoderEstimator(inputCols=["categoryIndex1", "categoryIndex2"],
                                     outputCols=["categoryVec1", "categoryVec2"])
    model = encoder.fit(df)
    encoded = model.transform(df)
    encoded.show()
    # $example off$

    spark.stop()
Example 17
print("Cars with null cyl", cars.filter('cyl IS NULL').count(), '\n')

indexer = StringIndexer(inputCol='type',
                        outputCol='type_idx')

# Assign index values to strings
indexer = indexer.fit(cars)
# Create column with index values
cars = indexer.transform(cars)

cars = cars.withColumn('density', round(cars.weight_kg / cars.length_meters, 2))
cars = cars.withColumn('density_area', round(cars.weight_kg / cars.length_meters**2, 2))
cars = cars.withColumn('density_volume', round(cars.weight_kg / cars.length_meters**3, 2))

onehot = OneHotEncoderEstimator(inputCols=['type_idx'], outputCols=['type_dummy'])
onehot = onehot.fit(cars)
cars = onehot.transform(cars)

pd.set_option('display.max_columns', None) # all cols
pd.set_option('display.width', 161)
pd.set_option('display.max_colwidth', 199)
#print(cars.toPandas().sample(12))

# Check column data types
print('\n', cars.dtypes, '\n')

assembler = VectorAssembler(inputCols=['weight_kg', 'cyl', 'type_dummy', 'density', 'density_area', 'density_volume'],
                            outputCol='features')
cars = assembler.transform(cars)

kars = cars.select('consumption', 'features')
Example 18
]

cat_ohe = []
for col in category_id:
    cat_ = col.replace('_id', '_ohe')
    cat_ohe.append(cat_)
    input_features.append(cat_)

data = raw_df.select('Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Var6', 'Var7',
                     'Var8', 'NVVar1', 'NVVar2', 'NVVar3', 'NVVar4', 'Cat1_id',
                     'Cat2_id', 'Cat3_id', 'Cat4_id', 'Cat5_id', 'Cat6_id',
                     'Cat7_id', 'Cat8_id', 'Cat9_id', 'Cat10_id', 'Cat11_id',
                     'Cat12_id', 'Calendar_Year', 'Model_Year', 'Claim_Amount')

encoder = OneHotEncoderEstimator(inputCols=category_id, outputCols=cat_ohe)
encoder_data = encoder.fit(data)
data = encoder_data.transform(data)

#assign weight on data
data = data.withColumn('weight',
                       when((data['Claim_Amount'] != 0), 0.98).otherwise(0.02))
data = data.withColumn('not_zero',
                       when((data['Claim_Amount'] != 0), 1).otherwise(0))

data = data.select('Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Var6', 'Var7',
                   'Var8', 'NVVar1', 'NVVar2', 'NVVar3', 'NVVar4', 'Cat1_ohe',
                   'Cat2_ohe', 'Cat3_ohe', 'Cat4_ohe', 'Cat5_ohe', 'Cat6_ohe',
                   'Cat7_ohe', 'Cat8_ohe', 'Cat9_ohe', 'Cat10_ohe',
                   'Cat11_ohe', 'Cat12_ohe', 'Calendar_Year', 'Model_Year',
                   'Claim_Amount', 'weight', 'not_zero')
    def dataTranform(self, dataInfo):
        featuresColm = dataInfo.get(
            PredictiveConstants.FEATURESCOLM)  # featureColmList -replaced
        labelColm = dataInfo.get(PredictiveConstants.LABELCOLM)
        modelSheetName = dataInfo.get(PredictiveConstants.MODELSHEETNAME)
        modelId = dataInfo.get(PredictiveConstants.MODELID)
        storageLocation = dataInfo.get(PredictiveConstants.LOCATIONADDRESS)

        indexerPathMapping = {}
        oneHotEncoderPathMapping = {}

        self.labelColm = None if labelColm == None else labelColm
        self.featuresColm = None if featuresColm == None else featuresColm
        dataset = self.dataset
        vectorizedFeaturescolmName = modelSheetName + PredictiveConstants.DMXFEATURE
        dataset = dataset.drop(vectorizedFeaturescolmName)

        schemaData = dataset.schema

        if self.labelColm is not None:
            for labelName in self.labelColm:
                label = labelName
        else:
            label = self.labelColm

        nonNumericData = self.nonNumericToString(schemaData=schemaData,
                                                 dataset=dataset)
        categoricalFeatures = nonNumericData.get(
            PredictiveConstants.CATEGORICALFEATURES)
        numericalFeatures = nonNumericData.get(
            PredictiveConstants.NUMERICALFEATURES)
        dataset = nonNumericData.get(PredictiveConstants.DATASET)
        schemaData = dataset.schema

        # indexing of label column
        isLabelIndexed = "no"
        if self.labelColm is not None:
            labelIndexedInfo = self.isLabelIndexed(schemaData=schemaData,
                                                   label=label,
                                                   dataset=dataset)
            dataset = labelIndexedInfo.get(PredictiveConstants.DATASET)
            isLabelIndexed = labelIndexedInfo.get(
                PredictiveConstants.ISLABELINDEXED)
            labelIndexer = labelIndexedInfo.get(
                PredictiveConstants.LABELINDEXER)
            # store the label indexer here.
            if labelIndexer is not None:
                labelIndexerStoragepath = storageLocation + modelId.upper(
                ) + label.upper() + PredictiveConstants.INDEXER.upper(
                ) + PredictiveConstants.PARQUETEXTENSION
                labelIndexer.save(labelIndexerStoragepath)  #correct this
                indexerPathMapping.update({label: labelIndexerStoragepath})

        oneHotEncodedFeaturesList = []
        indexedFeatures = []
        nonOneHotEncoded = []
        for colm in categoricalFeatures:
            indexedColmName = PredictiveConstants.INDEXED_ + colm
            oneHotEncodedColName = PredictiveConstants.ONEHOTENCODED_ + colm
            indexer = StringIndexer(inputCol=colm,
                                    outputCol=indexedColmName,
                                    handleInvalid="skip").fit(dataset)
            dataset = indexer.transform(dataset)
            '''Store the indexer here; the saving scheme is modelId + colmName + indexer.parquet.
            The feature indexers are not used downstream for now, but are kept for future use.'''
            featuresIndexerPath = storageLocation + modelId.upper(
            ) + colm.upper() + PredictiveConstants.INDEXER.upper(
            ) + PredictiveConstants.PARQUETEXTENSION
            indexer.write().overwrite().save(featuresIndexerPath)
            indexerPathMapping.update({colm: featuresIndexerPath})
            rowNo = dataset.select(indexedColmName).distinct().count()
            '''A column may have only one category, or more than one between training and prediction;
            naming it with the one-hot-encoded column name avoids that uncertainty.'''
            if (rowNo == 1):
                nonOneHotEncoded.append(
                    oneHotEncodedColName
                )  # avoids the problem when a single-valued column is passed at prediction time.
            else:
                indexedFeatures.append(indexedColmName)
                oneHotEncodedFeaturesList.append(oneHotEncodedColName)

        oneHotEncoder = OneHotEncoderEstimator(
            inputCols=indexedFeatures,
            outputCols=oneHotEncodedFeaturesList,
            handleInvalid="error")
        oneHotEncoderPath = storageLocation + modelId.upper(
        ) + PredictiveConstants.ONEHOTENCODED.upper(
        ) + PredictiveConstants.PARQUETEXTENSION
        oneHotEncoder.write().overwrite().save(oneHotEncoderPath)
        oneHotEncoderPathMapping.update(
            {PredictiveConstants.ONEHOTENCODED: oneHotEncoderPath})
        oneHotEncoderFit = oneHotEncoder.fit(dataset)
        dataset = oneHotEncoderFit.transform(dataset)

        combinedFeatures = oneHotEncodedFeaturesList + numericalFeatures + nonOneHotEncoded
        categoryColmListDict = {}
        countOfCategoricalColmList = []
        for value in categoricalFeatures:
            listValue = []
            categoryColm = dataset.groupby(value).count()
            countOfCategoricalColmList.append(categoryColm.count())
            categoryColmJson = categoryColm.toJSON()
            for row in categoryColmJson.collect():
                categoryColmSummary = json.loads(row)
                listValue.append(categoryColmSummary)
            categoryColmListDict[value] = listValue

        self.numericalFeatures = numericalFeatures
        self.categoricalFeatures = categoricalFeatures
        if not categoricalFeatures:
            maxCategories = 5
        else:
            maxCategories = max(countOfCategoricalColmList)
        dataset = dataset.drop(vectorizedFeaturescolmName)
        featureassembler = VectorAssembler(
            inputCols=combinedFeatures,
            outputCol=vectorizedFeaturescolmName,
            handleInvalid="skip")
        dataset = featureassembler.transform(dataset)

        # retrieve the feature column names after one-hot encoding
        indexOfFeatures = dataset.schema.names.index(
            vectorizedFeaturescolmName)
        oneHotEncodedFeaturesDict = dataset.schema.fields[
            indexOfFeatures].metadata['ml_attr']['attrs']
        idNameFeatures = {}

        if not oneHotEncodedFeaturesDict:
            idNameFeaturesOrderedTemp = None
        else:
            for type, value in oneHotEncodedFeaturesDict.items():
                for subKey in value:
                    idNameFeatures[subKey.get("idx")] = subKey.get("name")
                    idNameFeaturesOrderedTemp = {}
                    for key in sorted(idNameFeatures):
                        idNameFeaturesOrderedTemp[key] = idNameFeatures[
                            key].replace(PredictiveConstants.ONEHOTENCODED_,
                                         "")

        idNameFeaturesOrdered = None if idNameFeaturesOrderedTemp == None else idNameFeaturesOrderedTemp

        # retrieve the original label values only if the label was indexed
        indexedLabelNameDict = {}
        if isLabelIndexed == "yes":
            indexOfLabel = dataset.schema.names.index(label)
            indexedLabel = dataset.schema.fields[indexOfLabel].metadata[
                "ml_attr"]["vals"]

            for value in indexedLabel:
                indexedLabelNameDict[indexedLabel.index(value)] = value

        # This code was for VectorIndexer; it is not stable on the Spark side for now,
        # so it is kept here for future use if needed.
        '''
        vec_indexer = VectorIndexer(inputCol='features', outputCol='vec_indexed_features',
        maxCategories=maxCategories,
                                    handleInvalid="skip").fit(dataset)
        categorical_features = vec_indexer.categoryMaps
        print("Choose %d categorical features: %s" %
              (len(categorical_features), ", ".join(str(k) for k in categorical_features.keys())))
        dataset= vec_indexer.transform(dataset)
        '''

        result = {
            PredictiveConstants.DATASET: dataset,
            PredictiveConstants.CATEGORICALFEATURES: categoricalFeatures,
            PredictiveConstants.NUMERICALFEATURES: numericalFeatures,
            PredictiveConstants.MAXCATEGORIES: maxCategories,
            PredictiveConstants.CATEGORYCOLMSTATS: categoryColmListDict,
            PredictiveConstants.INDEXEDFEATURES: indexedFeatures,
            PredictiveConstants.LABEL: label,
            PredictiveConstants.ONEHOTENCODEDFEATURESLIST:
            oneHotEncodedFeaturesList,
            PredictiveConstants.INDEXEDLABELNAMEDICT: indexedLabelNameDict,
            PredictiveConstants.ISLABELINDEXED: isLabelIndexed,
            PredictiveConstants.VECTORFEATURES: vectorizedFeaturescolmName,
            PredictiveConstants.IDNAMEFEATURESORDERED: idNameFeaturesOrdered,
            PredictiveConstants.INDEXERPATHMAPPING: indexerPathMapping,
            PredictiveConstants.ONEHOTENCODERPATHMAPPING:
            oneHotEncoderPathMapping
        }
        return result
Example 20
# Exercise_1 
# Import the one hot encoder class
from pyspark.ml.feature import OneHotEncoderEstimator

# Create an instance of the one hot encoder
onehot = OneHotEncoderEstimator(inputCols=['org_idx'], outputCols=['org_dummy'])

# Apply the one hot encoder to the flights data
onehot = onehot.fit(flights)
flights_onehot = onehot.transform(flights)

# Check the results
flights_onehot.select('org', 'org_idx', 'org_dummy').distinct().sort('org_idx').show()

--------------------------------------------------
# Exercise_2 
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Create a regression object and train on training data
regression = LinearRegression(labelCol='duration').fit(flights_train)

# Create predictions for the testing data and take a look at the predictions
predictions = regression.transform(flights_test)
predictions.select('duration', 'prediction').show(5, False)

# Calculate the RMSE
RegressionEvaluator(labelCol='duration').evaluate(predictions)

--------------------------------------------------
# Exercise_3 
Example 21
def train_model(args):
    # do not run this test for pytorch lightning below the min supported version
    import pytorch_lightning as pl
    if LooseVersion(pl.__version__) < LooseVersion(MIN_PL_VERSION):
        print("Skip test for pytorch_ligthning=={}, min support version is {}".
              format(pl.__version__, MIN_PL_VERSION))
        return

    # Initialize SparkSession
    conf = SparkConf().setAppName('keras_spark_mnist').set(
        'spark.sql.shuffle.partitions', '16')
    if args.master:
        conf.setMaster(args.master)
    elif args.num_proc:
        conf.setMaster('local[{}]'.format(args.num_proc))
    spark = SparkSession.builder.config(conf=conf).getOrCreate()

    # Setup our store for intermediate data
    store = Store.create(args.work_dir)

    # Download MNIST dataset
    data_url = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/mnist.bz2'
    libsvm_path = os.path.join(args.data_dir, 'mnist.bz2')
    if not os.path.exists(libsvm_path):
        subprocess.check_output(['wget', data_url, '-O', libsvm_path])

    # Load dataset into a Spark DataFrame
    df = spark.read.format('libsvm') \
        .option('numFeatures', '784') \
        .load(libsvm_path)

    # One-hot encode labels into SparseVectors
    encoder = OneHotEncoderEstimator(inputCols=['label'],
                                     outputCols=['label_vec'],
                                     dropLast=False)
    model = encoder.fit(df)
    train_df = model.transform(df)

    # Train/test split
    train_df, test_df = train_df.randomSplit([0.9, 0.1])

    # Define the PyTorch model without any Horovod-specific parameters
    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
            self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
            self.conv2_drop = nn.Dropout2d()
            self.fc1 = nn.Linear(320, 50)
            self.fc2 = nn.Linear(50, 10)

        def forward(self, x):
            x = x.float()
            x = F.relu(F.max_pool2d(self.conv1(x), 2))
            x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
            x = x.view(-1, 320)
            x = F.relu(self.fc1(x))
            x = F.dropout(x, training=self.training)
            x = self.fc2(x)
            return F.log_softmax(x, -1)

    model = Net()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    loss = nn.NLLLoss()

    # Train a Horovod Spark Estimator on the DataFrame
    torch_estimator = hvd.TorchEstimator(
        num_proc=args.num_proc,
        store=store,
        model=model,
        optimizer=optimizer,
        loss=lambda input, target: loss(input, target.long()),
        input_shapes=[[-1, 1, 28, 28]],
        feature_cols=['features'],
        label_cols=['label'],
        batch_size=args.batch_size,
        epochs=args.epochs,
        verbose=1)

    torch_model = torch_estimator.fit(train_df).setOutputCols(['label_prob'])

    # Evaluate the model on the held-out test DataFrame
    pred_df = torch_model.transform(test_df)
    argmax = udf(lambda v: float(np.argmax(v)), returnType=T.DoubleType())
    pred_df = pred_df.withColumn('label_pred', argmax(pred_df.label_prob))
    evaluator = MulticlassClassificationEvaluator(predictionCol='label_pred',
                                                  labelCol='label',
                                                  metricName='accuracy')
    print('Test accuracy:', evaluator.evaluate(pred_df))

    spark.stop()
Example 22
flights = flights.dropna()
print("\nThe data contains %d records after dropping records with na values." % flights.count())

# Create an indexer for carrier categorical feature
indexer = StringIndexer(inputCol="carrier", outputCol='carrier_idx')

# Indexer identifies categories in the data
indexer_model = indexer.fit(flights)

# Indexer creates a new column with numeric index values
flights_indexed = indexer_model.transform(flights)

# Repeat the process for the org categorical feature
flights_indexed = StringIndexer(inputCol="org", outputCol='org_idx').fit(flights_indexed).transform(flights_indexed)
# Check first five records
flights_indexed.show(5)

flites = flights_indexed.select('carrier', 'org', 'org_idx')

# Create an instance of the one hot encoder
onehot = OneHotEncoderEstimator(inputCols=["org_idx"], outputCols=["org_dummy"])

# Apply the one hot encoder to the flights data
onehot = onehot.fit(flites)
flights_onehot = onehot.transform(flites)

# Check the results
flights_onehot.select('org', 'org_idx', 'org_dummy').distinct().sort('org_idx').show()

spark.stop()
# COMMAND ----------

# MAGIC %md-sandbox
# MAGIC Now each room has a unique numerical value assigned.  While we could pass the new `room_type_index` into a machine learning model, it would assume that `Shared room` is twice as much as `Entire home/apt`, which is not the case.  Instead, we need to change these values to a binary yes/no value if a listing is for a shared room, entire home, or private room.
# MAGIC
# MAGIC Do this by training and fitting the `OneHotEncoderEstimator`, which only operates on numerical values (this is why we needed to use `StringIndexer` first).
# MAGIC
# MAGIC <img alt="Side Note" title="Side Note" style="vertical-align: text-bottom; position: relative; height:1.75em; top:0.05em; transform:rotate(15deg)" src="https://files.training.databricks.com/static/images/icon-note.webp"/> Certain models, such as random forest, do not need one-hot encoding (and can actually be negatively affected by the process).  The models we'll explore in this course, however, do need this process.

# COMMAND ----------

from pyspark.ml.feature import OneHotEncoderEstimator

encoder = OneHotEncoderEstimator(inputCols=["room_type_index"],
                                 outputCols=["encoded_room_type"])
encoderModel = encoder.fit(indexedDF)
encodedDF = encoderModel.transform(indexedDF)

display(encodedDF)

# COMMAND ----------

# MAGIC %md
# MAGIC The new column `encoded_room_type` is a vector.  The difference between a sparse and dense vector is whether Spark records all of the empty values.  In a sparse vector, like we see here, Spark saves space by only recording the places where the vector has a non-zero value.  The value of 0 in the first position indicates that it's a sparse vector.  The second value indicates the length of the vector.
# MAGIC
# MAGIC Here's how to read the mapping above:<br><br>
# MAGIC
# MAGIC * `Shared room` maps to the vector `[0, 0]`
# MAGIC * `Entire home/apt` maps to the vector `[0, 1]`
# MAGIC * `Private room` maps to the vector `[1, 0]`
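For reference, a minimal standalone sketch of how the same two-element encoding reads as sparse vs. dense vectors (independent of the listing data above):

from pyspark.ml.linalg import DenseVector, SparseVector

# `Entire home/apt` above: a 2-element vector with a 1.0 at position 1
dense = DenseVector([0.0, 1.0])
sparse = SparseVector(2, {1: 1.0})   # (size, {index: value}) - only non-zeros are stored

print(dense)             # [0.0,1.0]
print(sparse)            # (2,[1],[1.0])
print(sparse.toArray())  # [0. 1.]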
Example 24
indexer = StringIndexer(inputCol='type', outputCol='type_idx')

# Assign index values to strings
indexer = indexer.fit(cars)
# Create column with index values
cars = indexer.transform(cars)

pd.set_option('display.max_columns', None)  # all cols
pd.set_option('display.width', 161)
#print(cars.toPandas().sample(12))

# Check column data types
print('\n', cars.dtypes, '\n')

kars = cars.select('name', 'type', 'type_idx')

print(kars.toPandas().sample(12))

onehot = OneHotEncoderEstimator(inputCols=['type_idx'],
                                outputCols=['type_dummy'])
onehot = onehot.fit(kars)
kars = onehot.transform(kars)
kars.select('type', 'type_idx',
            'type_dummy').distinct().sort('type_idx').show()

print("DenseVector:", DenseVector([1, 0, 0, 0, 0, 7, 0, 0]))
print("SparseVector:", SparseVector(8, {0: 1.0, 5: 7.0}))

spark.stop()
Example 25
# Download MNIST dataset
data_url = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/mnist.bz2'
libsvm_path = os.path.join(args.data_dir, 'mnist.bz2')
if not os.path.exists(libsvm_path):
    subprocess.check_output(['wget', data_url, '-O', libsvm_path])

# Load dataset into a Spark DataFrame
df = spark.read.format('libsvm') \
    .option('numFeatures', '784') \
    .load(libsvm_path)

# One-hot encode labels into SparseVectors
encoder = OneHotEncoderEstimator(inputCols=['label'],
                                 outputCols=['label_vec'],
                                 dropLast=False)
model = encoder.fit(df)
train_df = model.transform(df)

# Train/test split
train_df, test_df = train_df.randomSplit([0.9, 0.1])


# Define the PyTorch model without any Horovod-specific parameters
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)
Example 26
    "Cat7", "Cat8", "Cat9", "Cat10", "Cat11", "Cat12", "OrdCat", "NVCat"
]

cat_output = [column + "_index" for column in cat_input]

cat_hot = [column + "_hot" for column in cat_output]

indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index").fit(clean)
    for column in cat_input
]
pipeline = Pipeline(stages=indexers)
df_r = pipeline.fit(clean).transform(clean)

encoder = OneHotEncoderEstimator(inputCols=cat_output, outputCols=cat_hot)
hot_data = encoder.fit(df_r).transform(df_r)
df = hot_data.select('Blind_Make_index_hot', 'Model_Year_index_hot', 'Var1',
                     'Var2', 'Var3', 'Var4', 'Var5', 'Var6', 'Var7', 'Var8',
                     'NVVar1', 'NVVar2', 'NVVar3', 'NVVar4', 'Cat11_index_hot',
                     'Cat9_index_hot', 'Cat7_index_hot', 'NVCat_index_hot',
                     'Cat4_index_hot', 'Cat12_index_hot', 'Cat1_index_hot',
                     'Cat5_index_hot', 'Cat3_index_hot', 'Cat8_index_hot',
                     'Cat2_index_hot', 'Cat6_index_hot', 'Cat10_index_hot',
                     'OrdCat_index_hot', 'label', 'Calendar_Year')

assembler = VectorAssembler(inputCols=[
    'Blind_Make_index_hot', 'Model_Year_index_hot', 'Var1', 'Var2', 'Var3',
    'Var4', 'Var5', 'Var6', 'Var7', 'Var8', 'NVVar1', 'NVVar2', 'NVVar3',
    'NVVar4', 'Cat11_index_hot', 'Cat9_index_hot', 'Cat7_index_hot',
    'NVCat_index_hot', 'Cat4_index_hot', 'Cat12_index_hot', 'Cat1_index_hot',
    'Cat5_index_hot', 'Cat3_index_hot', 'Cat8_index_hot', 'Cat2_index_hot',
Example 27
    mc = BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                       labelCol="label")
    cv = CrossValidator(estimator=estimator,
                        estimatorParamMaps=paramGrid,
                        evaluator=mc,
                        numFolds=2)

    # for row in train_df.rdd.collect():
    #     print("row: ", row.uri)
    #     load_image_from_uri(row.uri)

    # cvModel = cv.fit(train_df)
    # mc.evaluate(cvModel.transform(test_df))
    #

    stringIndexer = StringIndexer(inputCol="label_name",
                                  outputCol="categoryIndex")
    indexed_dataset = stringIndexer.fit(train_df).transform(train_df)

    # encoder = OneHotEncoder(inputCol="categoryIndex", outputCol="categoryVec")

    encoder = OneHotEncoderEstimator(inputCols=["categoryIndex"],
                                     outputCols=["categoryVec"])

    encoder_model = encoder.fit(indexed_dataset)

    image_dataset = encoder_model.transform(indexed_dataset)

    image_dataset.show()

    transformers = estimator.fit(image_dataset)
Example 28
stringIndNames = []
encodedNames = []

for v in categFilt:

    encodedNames.append(v+"_indE")
    stringIndNames.append(v+"_ind")

print(categFilt)
print(stringIndNames)
print("\n", encodedNames)

encoder = OneHotEncoderEstimator(inputCols= stringIndNames, outputCols=encodedNames)

insuranceRaw = encoder.fit(insuranceRaw).transform(insuranceRaw)


insuranceRaw.select('Cat1_indE', 'Cat3_indE').show(30)


#vector assemble the categorical features
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=encodedNames, outputCol="catfeatures")

insuranceRaw = assembler.transform(insuranceRaw)

for x in stringIndNames:
    insuranceRaw = insuranceRaw.drop(x)
Example 29
indexer = StringIndexer(inputCol="carrier", outputCol='carrier_idx')

# Indexer identifies categories in the data
indexer_model = indexer.fit(flights)

# Indexer creates a new column with numeric index values
flights_indexed = indexer_model.transform(flights)

# Repeat the process for the org categorical feature
flites = StringIndexer(inputCol="org", outputCol='org_idx').fit(flights_indexed).transform(flights_indexed)

# Create an instance of the one hot encoder
onehot = OneHotEncoderEstimator(inputCols=["org_idx"], outputCols=["org_dummy"])

# Apply the one hot encoder to the flights data
onehot = onehot.fit(flites)
flites = onehot.transform(flites)

pd.set_option('display.max_columns', None) # all cols
pd.set_option('display.width', 199)
pd.set_option('display.max_colwidth', 199)

# Create buckets at 3 hour intervals through the day
buckets = Bucketizer(splits=[0, 3, 6, 9, 12, 15, 18, 21, 24],
                     inputCol="depart", outputCol="depart_bucket")

# Bucket the departure times
bucketed = buckets.transform(flites)
#bucketed.select("depart", "depart_bucket").show(5)

# Create a one-hot encoder for departure
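A plausible completion of this truncated step, mirroring the `org` encoding above; the output column name 'depart_dummy' is an assumption, not the original author's code:

onehot = OneHotEncoderEstimator(inputCols=['depart_bucket'],
                                outputCols=['depart_dummy'])
flites = onehot.fit(bucketed).transform(bucketed)
flites.select('depart', 'depart_bucket', 'depart_dummy').show(5)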
# COMMAND ----------

# MAGIC %md ### Features added
# MAGIC - day - day of month of click
# MAGIC - hour - hour of day of click

# COMMAND ----------

#Encode categorical features (app, device, os, channel)
from pyspark.ml.feature import OneHotEncoderEstimator

encoder = OneHotEncoderEstimator(
    inputCols=["ip", "app", "device", "os", "channel"],
    outputCols=["ipVec", "appVec", "deviceVec", "osVec", "channelVec"])

model = encoder.fit(t0_df)
t0_df = model.transform(t0_df)
t0_df.printSchema()

# COMMAND ----------

from pyspark.ml.feature import VectorAssembler

#Select features to actually use in training
inputCols = [
    "ipVec", "appVec", "deviceVec", "osVec", "channelVec", "day", "hour"
]
vectorAssembler = VectorAssembler(inputCols=inputCols, outputCol="features")

v_t0_df = vectorAssembler.transform(t0_df)
v_t0_df.printSchema()
Example 31
# COMMAND ----------

indexer = StringIndexer(inputCol="ItemFamily", outputCol="ItemFamilyNum").fit(store_num_data)
store_num_data_indexed = indexer.transform(store_num_data)
#store_num_data_indexed.show()

indexer = StringIndexer(inputCol="type", outputCol="typeNum").fit(store_num_data_indexed)
store_num_data_indexed = indexer.transform(store_num_data_indexed)
#store_num_data_indexed.show()


# COMMAND ----------

encoder = OneHotEncoderEstimator(inputCols=["typeNum","Cluster","StoreNum","ItemFamilyNum","Month","Day","dow_number"],
                                 outputCols=["typeNumVec","ClusterVec","StoreNumVec","ItemFamilyNumVec", "MonthVec","DayVec","DOWNum"])
model = encoder.fit(store_num_data_indexed)
store_num_data_ind_enc = model.transform(store_num_data_indexed)
#store_num_data_ind_enc.show(5)

# COMMAND ----------

store_num_data_ind_enc = store_num_data_ind_enc.withColumnRenamed("store_num_data_ind_enc.Date", "Date_Date")

# COMMAND ----------

# MAGIC %md ***Using Linear Regression to create model***

# COMMAND ----------

trainingdf = store_num_data_ind_enc.filter(store_num_data_ind_enc.Date_Date <'2014-04-28')