Example #1
0
# Feature scaling for the numeric features: every column whose name contains
# 'Lag' is packed into one vector column and standardized.
featuresForScale = [name for name in mlSourceDFCat.columns if 'Lag' in name]
print(len(featuresForScale))

assembler = VectorAssembler(inputCols=featuresForScale, outputCol="features")

# Keep only the join key and the assembled vector for fitting/transforming.
assembled = assembler.transform(mlSourceDFCat).select(col('key'), col('features'))

# withStd=True / withMean=False: scale by the standard deviation without
# centering on the mean.
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withStd=True,
    withMean=False,
).fit(assembled)

# Persist the fitted scaler model, overwriting anything already at that path.
scaler.write().overwrite().save(featureScaleModelFile)

scaledData = scaler.transform(assembled).select('key', 'scaledFeatures')
def extract(row):
    """Flatten a (key, scaledFeatures) row into one plain tuple.

    Args:
        row: Object exposing ``key`` and ``scaledFeatures``; the latter must
            provide ``toArray()`` (as Spark ML dense and sparse vectors do).

    Returns:
        ``(row.key, v_1, ..., v_n)`` with each ``v_i`` converted to a Python
        ``float``, one value per vector slot.
    """
    # toArray() always yields one entry per vector slot. ``.values`` on a
    # sparse vector holds only the explicitly stored entries, which would
    # shift the remaining values into the wrong columns.
    return (row.key,) + tuple(float(v) for v in row.scaledFeatures.toArray())

# Convert each scaled vector to a DenseVector so extract() sees one value per
# feature, then flatten every row into (key, value_1, ..., value_n).
rdd = scaledData.rdd.map(
    lambda rec: Row(key=rec[0], scaledFeatures=DenseVector(rec[1].toArray()))
)
# Only the first column is named here; the value columns are renamed below.
scaledDf = rdd.map(extract).toDF(["key"])

# rename columns: the key becomes 'scaledKey', each value column becomes
# 'scaled' + the name of the feature at the same position.
oldColumns = scaledDf.columns
scaledColumns = ['scaledKey'] + ['scaled' + str(name) for name in featuresForScale]
scaledOutcome = scaledDf.select(
    [col(old).alias(scaledColumns[pos]) for pos, old in enumerate(oldColumns)]
)

# Source frame without the unscaled feature columns.
noScaledMLSourceDF = mlSourceDFCat.select(
    [name for name in mlSourceDFCat.columns if name not in featuresForScale]
)

# Outer-join the untouched columns with their scaled counterparts on the key.
newDF = noScaledMLSourceDF.join(
    scaledOutcome,
    noScaledMLSourceDF.key == scaledOutcome.scaledKey,
    'outer',
)
newDF.cache()