Example #1
def vectorize_data(training_data, test_data):
    # Assemble the vectors
    input_columns = training_data.columns
    input_columns.remove(TARGET)
    print("Using these features: {}".format(input_columns))
    vector_assembler = VectorAssembler(inputCols=input_columns, outputCol='features')
    train_df = vector_assembler.transform(training_data)

    # Standardize the features with StandardScaler
    scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures', withStd=True, withMean=True).fit(train_df)
    train_df = scaler.transform(train_df)

    # Select the rows needed
    train_df = train_df.select(['scaledFeatures', TARGET])

    new_test_data = dict()
    for company in test_data:
        company_data = test_data[company]
        test_df = vector_assembler.transform(company_data)
        test_df = scaler.transform(test_df)

        test_df = test_df.select(['scaledFeatures', TARGET])
        new_test_data[company] = test_df

    return train_df, new_test_data
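A minimal usage sketch for the function above; the TARGET constant, the SparkSession, and the input DataFrames are placeholders, not part of the original example.

# Hypothetical call: TARGET, train_raw and the per-company test DataFrames
# are assumed to exist and to share the same numeric feature columns.
TARGET = 'label'
train_raw = spark.read.parquet('train.parquet')
test_raw = {'acme': spark.read.parquet('acme.parquet')}
train_df, test_dfs = vectorize_data(train_raw, test_raw)
train_df.show(5)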
Example #2
def preprocess(df, should_undersample, scaler=None):
    """ Escala los datos y balancea usando Random Undersample (RUS) """
    # Agrupar las caracteristicas para poder usarlas en la MLlib:
    assembler = VectorAssembler(inputCols=[
        "PSSM_r1_1_K", "PSSM_r2_-1_R", "PSSM_central_2_D", "PSSM_central_0_A",
        "PSSM_r1_1_W", "PSSM_central_-1_V"
    ],
                                outputCol="features")

    out = assembler.transform(df).select("features", "class")

    # Random Undersample (RUS)
    # Before: POS = 550,140, NEG = 1,100,591
    # After:  POS = 550,140, NEG = 549,668
    if should_undersample:
        positive = out.filter(out["class"] == 1.0)
        negative = out.filter(out["class"] == 0.0)
        fraction = float(positive.count()) / float(negative.count())
        negative = negative.sample(withReplacement=False,
                                   fraction=fraction,
                                   seed=89)
        out = negative.union(positive)

    # Scale:
    if scaler is None:
        scaler = StandardScaler(withMean=True,
                                withStd=True,
                                inputCol="features",
                                outputCol="scaled_features")
        scaler = scaler.fit(out)
        out = scaler.transform(out)
    else:
        out = scaler.transform(out)

    return out, scaler
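A short usage sketch for the function above; train_df and test_df are assumed, already-loaded DataFrames containing the listed PSSM columns and a numeric class column. The scaler is fit on the training split and then passed back in so the test split is scaled with the training statistics.

# Hypothetical usage: undersample and fit the scaler on training data only,
# then reuse the fitted scaler on the test data.
train_out, scaler = preprocess(train_df, should_undersample=True)
test_out, _ = preprocess(test_df, should_undersample=False, scaler=scaler)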
Example #3
def standard_scale(dataFrame,
                   inputColNames,
                   usr_withStd=True,
                   usr_withMean=False):

    assembledDF = getAssembledDataFrame(dataFrame, inputColNames)
    scaler = StandardScaler(inputCol="features",
                            outputCol="scaled features",
                            withStd=usr_withStd,
                            withMean=usr_withMean).fit(assembledDF)
    scaledDF = scaler.transform(assembledDF).drop("features")
    return scaledDF
Example #4
def scaling(dataFrame, inputColName, usr_withStd, usr_withMean):
    outputColName = "scaled " + inputColName
    assembler = VectorAssembler(inputCols=[inputColName],
                                outputCol="features")
    assembledDF = assembler.transform(dataFrame)
    scaler = StandardScaler(inputCol="features",
                            outputCol=outputColName,
                            withStd=usr_withStd,
                            withMean=usr_withMean).fit(assembledDF)
    scaledDF = scaler.transform(assembledDF).drop("features")
    castVectorToFloat = udf(lambda v: float(v[0]), FloatType())
    scaledDF = scaledDF.withColumn(outputColName, castVectorToFloat(outputColName))
    print("Successfully scaled the column '{0:s}' and created the new column '{1:s}'.".format(inputColName, outputColName))
    return scaledDF
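A quick usage sketch of the per-column helper above; the DataFrame and the column name are placeholders.

# Hypothetical call: standardize a single numeric column into "scaled age".
scaled = scaling(df, 'age', usr_withStd=True, usr_withMean=True)
scaled.select('age', 'scaled age').show(3)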
Example #5
def preprocess(df):
    df = under_sampling(df)
    indexer = StringIndexer(inputCol="PredSS_central_1",
                            outputCol="PredSS_central_1_indexed")

    assembler = VectorAssembler(inputCols=[
        "PSSM_r1_1_N", "PredSS_central_1_indexed", "AA_freq_central_A",
        "AA_freq_global_H", "PSSM_r1_1_S", "PSSM_r2_-3_Y"
    ],
                                outputCol='features')

    pipeline = Pipeline(stages=[indexer, assembler])
    df_1 = pipeline.fit(df).transform(df).select('features', 'class')
    #df = assembler.transform(df).select('features', 'class')
    #df = df.select('features', 'labels')
    scale = StandardScaler(withMean=True,
                           withStd=True,
                           inputCol='features',
                           outputCol='scaled_features')
    scale = scale.fit(df_1)
    df_1 = scale.transform(df_1)
    return df_1
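A minimal usage sketch for the function above, with df_train as a placeholder DataFrame holding the listed columns plus PredSS_central_1 and class. Note that, as written, the indexer, assembler, and scaler are all refit on whatever DataFrame is passed in, so a held-out set would be scaled with its own statistics rather than the training statistics.

# Hypothetical call on an assumed training DataFrame.
train_ready = preprocess(df_train)
train_ready.select('scaled_features', 'class').show(3)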
Example #6
featuresForScale = [x for x in mlSourceDFCat.columns if 'Lag' in x]
print(len(featuresForScale))
assembler = VectorAssembler(
  inputCols=featuresForScale, outputCol="features"
)

assembled = assembler.transform(mlSourceDFCat).select(col('key'), col('features'))

scaler = StandardScaler(
  inputCol="features", outputCol="scaledFeatures",
  withStd=True, withMean=False
).fit(assembled)

scaler.write().overwrite().save(featureScaleModelFile)

scaledData = scaler.transform(assembled).select('key','scaledFeatures')
# Expand each scaled feature vector into one float per feature, keyed by 'key'
def extract(row):
    return (row.key, ) + tuple(float(x) for x in row.scaledFeatures.values)

# Force the scaled vector to be dense so .values is always available
rdd = scaledData.rdd.map(lambda x: Row(key=x[0], scaledFeatures=DenseVector(x[1].toArray())))
scaledDf = rdd.map(extract).toDF(["key"])
# rename columns
oldColumns = scaledDf.columns
scaledColumns = ['scaledKey']
scaledColumns.extend(['scaled'+str(i) for i in featuresForScale])
scaledOutcome = scaledDf.select([col(oldColumns[index]).alias(scaledColumns[index]) for index in range(0,len(oldColumns))])
noScaledMLSourceDF = mlSourceDFCat.select([column for column in mlSourceDFCat.columns if column not in featuresForScale])
newDF = noScaledMLSourceDF.join(scaledOutcome, noScaledMLSourceDF.key==scaledOutcome.scaledKey, 'outer')
newDF.cache()
mlSourceDFCat = newDF
mlSourceDFCat=mlSourceDFCat.fillna(0, subset= [x for x in mlSourceDFCat.columns if 'Lag' in x])
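Because the fitted scaler is persisted above, it can be reloaded later and applied to newly assembled data; a minimal sketch, assuming featureScaleModelFile points at the saved model and newAssembled is a DataFrame with the same 'features' column and a 'key' column.

# Reload the persisted scaler and reuse it on new data (newAssembled is assumed).
from pyspark.ml.feature import StandardScalerModel
reloadedScaler = StandardScalerModel.load(featureScaleModelFile)
rescaled = reloadedScaler.transform(newAssembled).select('key', 'scaledFeatures')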
Example #7
# create vector test_df
assembled_test = assembler.transform(test_df).drop(
    "CRS_DEP_TIME", "DISTANCE", 'vis_distance', 'tmp', 'dew', 'elevation',
    'dest_wnd_speed', 'pagerank', 'pagerank_dest', 'wnd_speed', 'cig_height',
    'dest_vis_distance', 'dest_tmp', 'dest_dew', 'dest_elevation',
    'dest_cig_height')

# COMMAND ----------

# DBTITLE 1,Scale Continuous Features
# scale train
scaler = StandardScaler(inputCol="features",
                        outputCol="scaledFeatures",
                        withStd=True,
                        withMean=True).fit(assembled_train)
assembled_train = scaler.transform(assembled_train).drop('features')
assembled_train = _convert_vector(assembled_train, 'float32')

# scale val
assembled_val = scaler.transform(assembled_val).drop('features')
assembled_val = _convert_vector(assembled_val, 'float32')

# scale test
assembled_test = scaler.transform(assembled_test).drop('features')
assembled_test = _convert_vector(assembled_test, 'float32')

# COMMAND ----------

# check partition size
assembled_val.rdd.getNumPartitions()
Example #8
### NEED SCALING NOW

#### Scaling the continuous variables
# Features need to be brought to a comparable scale (here standardized with
# StandardScaler) for the neural network to train well.

vectorAssembler = VectorAssembler(inputCols=selected_covariates_names_updated,
                                  outputCol='features')
vtraining_df = vectorAssembler.transform(training_spark_df)

from pyspark.ml.feature import StandardScaler
standardscaler = StandardScaler().setInputCol("features").setOutputCol(
    "Scaled_features")
standardscaler = standardscaler.fit(vtraining_df)
training_df = standardscaler.transform(vtraining_df)
# testing_df is assumed to be the test set already assembled into 'features'
testing_df = standardscaler.transform(testing_df)

#raw_data.select("features","Scaled_features").show(5)

###########################################
### PART IV: Run model and perform assessment ###########################

training_spark_df = sqlContext.createDataFrame(X_y_training_df)

#https://www.guru99.com/pyspark-tutorial.html
#https://towardsdatascience.com/building-a-linear-regression-with-pyspark-and-mllib-d065c3ba246a

vectorAssembler = VectorAssembler(inputCols=selected_covariates_names_updated,
                                  outputCol='features')
vtraining_df = vectorAssembler.transform(training_spark_df)