# Real examples



users_noscaled = users_addedmonths

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import MinMaxScaler

# call the vector assembler
assembler = VectorAssembler(
  inputCols=users_noscaled.columns[7:], outputCol='assembled_col'
)

# call the scaler
scaler = MinMaxScaler(
  inputCol="assembled_col", outputCol="assembled_col_norm"
)

# build an assembled vector column in the dataframe
assembled = assembler.transform(users_noscaled)

# build the scaler model
scaler_model = scaler.fit(assembled)

# apply the model to the transformed dataframe
users_wscaled = scaler_model.transform(assembled)
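
A quick sanity check of the result (a sketch using the column names above; the exact output depends on your data):

# original vs. scaled vectors -- the scaled values fall in [0, 1]
users_wscaled.select("assembled_col", "assembled_col_norm").show(5, truncate=False)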

## Example #2
    def scaleVecCol(self, columns, nameOutputCol):
        """
        This function groups the specified columns into a single vector column and then scales
        that vector. The scaling procedure is Spark's default MinMax scaling (see the example
        below).

        +---------+----------+
        |Price    |AreaLiving|
        +---------+----------+
        |1261706.9|16        |
        |1263607.9|16        |
        |1109960.0|19        |
        |978277.0 |19        |
        |885000.0 |19        |
        +---------+----------+

                    |
                    |
                    |
                    V
        +----------------------------------------+
        |['Price', 'AreaLiving']                 |
        +----------------------------------------+
        |[0.1673858972637624,0.5]                |
        |[0.08966137157852398,0.3611111111111111]|
        |[0.11587093205757598,0.3888888888888889]|
        |[0.1139820728616421,0.3888888888888889] |
        |[0.12260126542983639,0.4722222222222222]|
        +----------------------------------------+
        only showing top 5 rows

        """

        # Check that the columns argument is a string or a list:
        self.__assertTypeStrOrList(columns, "columns")

        # Check that the columns to be processed exist in the dataframe:
        self.__assertColsInDF(columnsProvided=columns, columnsDF=self.__df.columns)

        # Check that the nameOutputCol argument is a string:
        self.__assertTypeStr(nameOutputCol, "nameOutputCol")

        # VectorAssembler model to combine the columns into a single vector column:
        vecAssembler = VectorAssembler(inputCols=columns, outputCol="features_assembler")
        # Model for scaling the assembled feature column:
        mmScaler = MinMaxScaler(inputCol="features_assembler", outputCol=nameOutputCol)
        # Dataframe with the features_assembler column
        tempDF = vecAssembler.transform(self.__df)
        # Fitting scaler model with transformed dataframe
        model = mmScaler.fit(tempDF)

        exprs = list(filter(lambda x: x not in columns, self.__df.columns))

        exprs.extend([nameOutputCol])

        self.__df = model.transform(tempDF).select(*exprs)
        self.__addTransformation()  # checkpoint in case

        return self
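
A minimal usage sketch, assuming the method lives on a DataFrame-wrapper class; the `transformer` instance and its `getDataFrame()` accessor below are hypothetical:

# group Price and AreaLiving into one vector column and MinMax-scale it
transformer.scaleVecCol(["Price", "AreaLiving"], "price_area_scaled")
transformer.getDataFrame().select("price_area_scaled").show(5, False)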

###columns
sparkDF.columns

# UDF for converting a column from vector type back to double type
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

unlist = udf(lambda x: round(float(list(x)[0]), 3), DoubleType())



assembler = VectorAssembler(inputCols=['_c78'],outputCol = "hell_Vect")

newDf = assembler.transform(sparkDF)
    
scaler = MinMaxScaler(inputCol="hell_Vect", outputCol="_Scaled")
scalerModel = scaler.fit(newDf)

# rescale each feature to range [min, max].
scaledData = scalerModel.transform(newDf)
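
The `unlist` UDF defined above can then pull the scaled value back out of the one-element vector; a sketch (the `_c78_scaled` name is just illustrative):

# convert the scaled vector column back to a plain double column
scaledData = scaledData.withColumn("_c78_scaled", unlist("_Scaled")).drop("hell_Vect")
scaledData.select("_c78", "_c78_scaled").show(5)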


from pyspark.ml import Pipeline

def normaliseEntireDf(sparkDf):
    origColumns = sparkDf.columns

    for i in origColumns:
        # VectorAssembler Transformation - Converting column to vector type
        assembler = VectorAssembler(inputCols=[i], outputCol=i+"_Vect")
        # MinMaxScaler Transformation
        scaler = MinMaxScaler(inputCol=i+"_Vect", outputCol=i+"_Scaled")
        # Fit/transform, convert the scaled vector back to a double with the
        # unlist UDF, and drop the temporary vector column
        pipeline = Pipeline(stages=[assembler, scaler])
        sparkDf = (pipeline.fit(sparkDf).transform(sparkDf)
                   .withColumn(i+"_Scaled", unlist(i+"_Scaled"))
                   .drop(i+"_Vect"))

    return sparkDf
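
Hypothetical call on the dataframe from above:

sparkDF_scaled = normaliseEntireDf(sparkDF)  # adds a <column>_Scaled copy of every column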
## Example #4
from pyspark.ml.feature import QuantileDiscretizer
bucketer = QuantileDiscretizer().setNumBuckets(5).setInputCol("id")
fittedBucketer = bucketer.fit(contDF)
fittedBucketer.transform(contDF).show()

# COMMAND ----------

from pyspark.ml.feature import StandardScaler
sScaler = StandardScaler().setInputCol("features")
sScaler.fit(scaleDF).transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import MinMaxScaler
minMax = MinMaxScaler().setMin(5).setMax(10).setInputCol("features")
fittedminMax = minMax.fit(scaleDF)
fittedminMax.transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import MaxAbsScaler
maScaler = MaxAbsScaler().setInputCol("features")
fittedmaScaler = maScaler.fit(scaleDF)
fittedmaScaler.transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors
scaleUpVec = Vectors.dense(10.0, 15.0, 20.0)
scalingUp = ElementwiseProduct()\
  .setScalingVec(scaleUpVec)\
  .setInputCol("features")
scalingUp.transform(scaleDF).show()
## Example #5
def train_scaler(df, inputCol, outputCol):
    scaler = MinMaxScaler(inputCol=inputCol, outputCol=outputCol)
    return scaler.fit(df)
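
Usage sketch, assuming a dataframe `df` that already has an assembled vector column named "features":

scaler_model = train_scaler(df, inputCol="features", outputCol="features_scaled")
df_scaled = scaler_model.transform(df)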
## Example #6
    dataFrame = spark.createDataFrame([(
        0,
        Vectors.dense([1.0, 0.1, -8.0]),
    ), (
        1,
        Vectors.dense([2.0, 1.0, -4.0]),
    ), (
        2,
        Vectors.dense([4.0, 10.0, 8.0]),
    )], ["id", "features"])
    dataFrame.show()

    scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

    # Compute summary statistics and generate MinMaxScalerModel
    scalerModel = scaler.fit(dataFrame)

    # rescale each feature to range [min, max].
    scaledData = scalerModel.transform(dataFrame)
    print("Features scaled to range: [%f, %f]" %
          (scaler.getMin(), scaler.getMax()))
    scaledData.select("features", "scaledFeatures").show(10, False)
    # $example off$

    scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")

    # Compute summary statistics and generate MaxAbsScalerModel
    scalerModel = scaler.fit(dataFrame)

    # Rescale each feature to range [-1, 1]
    scaledData = scalerModel.transform(dataFrame)

#####################
## NEURAL NETWORKS ##
#####################

########################
## RESCALING DATA SET ##
########################
# Neural networks typically perform better when the data
# gets a lot of preprocessing, so I scaled the feature
# space to have min = 0 and max = 1

scaler = MinMaxScaler(inputCol='features', outputCol='scaledFeatures')

scalerModel = scaler.fit(df)

scaledData = scalerModel.transform(df)

print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))

scaledData.select("features", "scaledFeatures").show()

new_df = scaledData.selectExpr("label", "radius_mean", "texture_mean", 
	"perimeter_mean", "area_mean", "smoothness_mean", "compactness_mean",
	 "concavity_mean", "concave_points_mean", "symmetry_mean", 
	 "fractal_dimension_mean", "radius_se", "texture_se", "perimeter_se", 
	 "area_se", "smoothness_se", "compactness_se", "concavity_se", 
	 "concave_points_se", "symmetry_se", "fractal_dimension_se", 
	 "radius_worst", "texture_worst", "perimeter_worst", 
	 "area_worst", "smoothness_worst", "compactness_worst", 
## Example #8
from pyspark.sql.types import StringType
from pyspark.sql.functions import (skewness, log, exp, when, min, array,
                                   array_min)
from pyspark.ml.feature import StringIndexer, VectorAssembler, MinMaxScaler


def MLClassifierDFPrep(df,
                       input_columns,
                       dependent_var,
                       treat_outliers=True,
                       treat_neg_values=True):

    # change label (class variable) to string type to prep for reindexing
    # PySpark expects a zero-indexed integer for the label column.
    # Just in case our data is not in that format, we treat it with the built-in StringIndexer
    renamed = df.withColumn("label_str", df[dependent_var].cast(
        StringType()))  # Rename and change to string type
    indexer = StringIndexer(
        inputCol="label_str",
        outputCol="label")  # PySpark expects this naming convention
    indexed = indexer.fit(renamed).transform(renamed)
    indexed.groupBy(dependent_var, "label").count().show(100)

    # Convert all string type data in the input column list to numeric
    # Otherwise the Algorithm will not be able to process it
    numeric_inputs = []
    string_inputs = []
    for column in input_columns:
        if str(indexed.schema[column].dataType) == 'StringType':
            indexer = StringIndexer(inputCol=column, outputCol=column + "_num")
            indexed = indexer.fit(indexed).transform(indexed)
            new_col_name = column + "_num"
            string_inputs.append(new_col_name)
        else:
            numeric_inputs.append(column)

    if treat_outliers == True:
        print("We are correcting for non normality now!")
        # empty dictionary d
        d = {}
        # Create a dictionary of quantiles
        for col in numeric_inputs:
            d[col] = indexed.approxQuantile(
                col, [0.01, 0.99], 0.25
            )  #if you want to make it go faster increase the last number
        #Now fill in the values
        for col in numeric_inputs:
            skew = indexed.agg(skewness(
                indexed[col])).collect()  #check for skewness
            skew = skew[0][0]
            # This function will floor, cap and then log+1 (just in case there are 0 values)
            if skew > 1:
                indexed = indexed.withColumn(
                    col,
                    log(
                        when(df[col] < d[col][0], d[col][0]).when(
                            indexed[col] > d[col][1], d[col][1]).otherwise(
                                indexed[col]) + 1).alias(col))
                print(
                    col +
                    " has been treated for positive (right) skewness. (skew =",
                    skew, ")")
            elif skew < -1:
                indexed = indexed.withColumn(
                    col,
                    exp(
                        when(df[col] < d[col][0], d[col][0]).when(
                            indexed[col] > d[col][1],
                            d[col][1]).otherwise(indexed[col])).alias(col))
                print(
                    col +
                    " has been treated for negative (left) skewness. (skew =",
                    skew, ")")

    # Warn that Naive Bayes cannot be used if there are negative values in the dataframe.
    # Note: we only need to check the numeric inputs, since anything that is indexed won't have negative values
    minimums = df.select([
        min(c).alias(c) for c in df.columns if c in numeric_inputs
    ])  # Calculate the mins for all columns in the df
    min_array = minimums.select(array(numeric_inputs).alias(
        "mins"))  # Create an array for all mins and select only the input cols
    df_minimum = min_array.select(array_min(
        min_array.mins)).collect()  # Collect global min as Python object
    df_minimum = df_minimum[0][0]  # Slice to get the number itself

    features_list = numeric_inputs + string_inputs
    assembler = VectorAssembler(inputCols=features_list, outputCol='features')
    output = assembler.transform(indexed).select('features', 'label')

    #     final_data = output.select('features','label') #drop everything else

    # Now check for negative values and let the user decide whether to correct that
    if df_minimum < 0:
        print(" ")
        print(
            "WARNING: The Naive Bayes Classifier will not be able to process your dataframe as it contains negative values"
        )
        print(" ")

    if treat_neg_values == True:
        print(
            "You have opted to correct that by rescaling all your features to a range of 0 to 1"
        )
        print(" ")
        print("We are rescaling you dataframe....")
        scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

        # Compute summary statistics and generate MinMaxScalerModel
        scalerModel = scaler.fit(output)

        # rescale each feature to range [min, max].
        scaled_data = scalerModel.transform(output)
        final_data = scaled_data.select(
            'label', 'scaledFeatures')  # added class to the selection
        final_data = final_data.withColumnRenamed('scaledFeatures', 'features')
        print("Done!")

    else:
        print(
            "You have opted not to correct that therefore you will not be able to use to Naive Bayes classifier"
        )
        print("We will return the dataframe unscaled.")
        final_data = output

    return final_data
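
A hedged usage sketch; the `raw_df` dataframe and the "churn" label column are purely illustrative:

# use every column except the hypothetical label as an input feature
input_cols = [c for c in raw_df.columns if c != "churn"]
prepared = MLClassifierDFPrep(raw_df, input_cols, "churn",
                              treat_outliers=True, treat_neg_values=True)
prepared.show(5, truncate=False)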
## Example #9
from pyspark.ml.linalg import DenseVector

input_data = df.rdd.map(lambda x: (x[0], DenseVector(x[1:])))

df_input = spark.createDataFrame(input_data, ["label", "features"])
df_input

# COMMAND ----------

# MAGIC %md ## split data into training and test data

# COMMAND ----------

from pyspark.ml.feature import MinMaxScaler
# Initialize the `MinMaxScaler`
scaler = MinMaxScaler(inputCol="features", outputCol="features_scaled")
# Fit the scaler to the DataFrame
scaler = scaler.fit(df_input)

# COMMAND ----------

# Transform the data in `df` with the scaler
scaled_df = scaler.transform(df_input)
scaled_df.first()

# COMMAND ----------

train_data, test_data = scaled_df.randomSplit([.8, .2], seed=7)

from pyspark.ml.classification import RandomForestClassifier, LogisticRegression

# Initialize `lr`
lr = LogisticRegression(labelCol="label",
## Example #10
data.printSchema()

data.head()

closeDF = data.select("close")
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=["close"], outputCol="features")
closeAss = assembler.transform(closeDF)
closeAss.show(40)

from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import MinMaxScaler

MinMaxScalerizer = MinMaxScaler().setMin(0).setMax(100).setInputCol(
    "features").setOutputCol("MinMax_Scaled_features")
input_data = MinMaxScalerizer.fit(closeAss).transform(closeAss).select(
    "MinMax_Scaled_features").collect()
l = len(input_data)

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

batch_Size, window_Size, hidden_layer, learning_rate, epochs, clip_margin = 50, 50, 256, 0.001, 200, 4
inputs = tf.placeholder(tf.float32, [batch_Size, window_Size, 1])
targets = tf.placeholder(tf.float32, [batch_Size, 1])


def create_input():
    X = []
    Y = []
    i = 0
    while (i + window_Size) <= len(input_data) - 1:
## Example #11
spark = SparkSession \
    .builder \
    .appName("KMeans") \
    .config("spark.some.config.option", "Angadpreet-KMeans") \
    .getOrCreate()
today = dt.datetime.today()
spark_df = sc.parallelize(
    spark.read.json("Data/yelp_academic_dataset_user.json").select(
        "review_count", "average_stars", "yelping_since").rdd.map(lambda x: (x[
            0], x[1], (today - par.parse(x[2])).days)).collect()[:1700])
scaler = MinMaxScaler(inputCol="_1",\
         outputCol="scaled_1")
# Getting the input data
trial_df = spark_df.map(lambda x: pyspark.ml.linalg.Vectors.dense(x)).map(
    lambda x: (x, )).toDF()
scalerModel = scaler.fit(trial_df)
vector_df = scalerModel.transform(trial_df).select("scaled_1").rdd.map(
    lambda x: Vectors.dense(x))

# Initialize GMM
gmm = GaussianMixture.train(vector_df, k=4, maxIterations=20, seed=2018)

df = pandas.DataFrame({'features': [], 'cluster': []})
i = 0
for v in vector_df.collect():
    df.loc[i] = [[float(v[0]), float(v[1]), float(v[2])], int(gmm.predict(v))]
    i += 1

print(df)

df_with = spark.createDataFrame(
# Filtering
emp_mgr_df = emp_df.filter("salary >= 100000")
# print(emp_mgr_df.count())

# Choosing one column
emp_mgr_df.select("salary").show()

# Data transformations

# Normalization
from pyspark.ml.feature import MinMaxScaler

feature_scaler = MinMaxScaler(inputCol="features",
                              outputCol="normalized_features")
normalized_model = feature_scaler.fit(dataset=features_df)
normalized_features_df = normalized_model.transform(features_df)
print(normalized_features_df.take(1))

# Standardization
from pyspark.ml.feature import StandardScaler

feature_scaler = StandardScaler(inputCol="features",
                                outputCol="scaled_features",
                                withStd=True,
                                withMean=True)
std_model = feature_scaler.fit(features_df)
scaled_feature_df = std_model.transform(features_df)
print(scaled_feature_df.take(1))

# Bucketing
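
The snippet breaks off here; a minimal bucketing sketch with Bucketizer (the salary split points are assumptions, not from the source):

from pyspark.ml.feature import Bucketizer

# bucket salaries into low / medium / high bands (split values are illustrative)
bucketizer = Bucketizer(splits=[0.0, 50000.0, 100000.0, float("inf")],
                        inputCol="salary", outputCol="salary_bucket")
bucketed_df = bucketizer.transform(emp_df)
bucketed_df.select("salary", "salary_bucket").show(5)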
## Example #13
def scale(df, feature_name):
    scaler = MinMaxScaler(inputCol="FeatureVector_unscaled_" + feature_name,
                          outputCol="FeatureVector_" + feature_name)
    df = scaler.fit(df).transform(df)
    return df
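
A usage sketch showing the column-naming convention the function expects; the `house_df` dataframe and `price` column are illustrative:

from pyspark.ml.feature import VectorAssembler

# build the unscaled one-element vector column the function looks for
assembler = VectorAssembler(inputCols=["price"],
                            outputCol="FeatureVector_unscaled_price")
house_df = assembler.transform(house_df)
house_df = scale(house_df, "price")  # adds FeatureVector_price scaled to [0, 1]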
## Example #14
print('\nHere are the first 40 instances:\n\n')
print(data.show(40))

####################################################################################
## part 2
print('*' * 100)
print('Part 2 - Normalize features between 0 and 1\n')

# assemble feature values into a vector and create a 'features' column containing those vectors
assembler = VectorAssembler().setInputCols(
    data.columns[1:]).setOutputCol('features')
transformed = assembler.transform(data)

# create scaler object, transform feature vectors and add scaledFeatures column
scaler = MinMaxScaler(inputCol='features', outputCol='scaledFeatures')
scalerModel = scaler.fit(transformed.select('features'))
scaledData = scalerModel.transform(transformed)

print('Features scaled to range: {} to {}'.format(scaler.getMin(),
                                                  scaler.getMax()))
# print(scaledData.select('_c0','features','scaledFeatures').show(10))

# limit dataset to label and scaled vectors
scaledData = scaledData.select('_c0', 'scaledFeatures')

# rename columns
scaledData = scaledData.withColumnRenamed('_c0', 'label').withColumnRenamed(
    'scaledFeatures', 'features')
print(scaledData.show(5))

####################################################################################

# ### Scaling

# In[60]:


#Applying Min-Max scaling
from pyspark.ml.feature import MinMaxScaler
mm_scaler = MinMaxScaler(inputCol="features", outputCol="minmax_scaled_features")


# In[61]:


mm = mm_scaler.fit(df_vect)
df_scale = mm.transform(df_vect)
df_scale.select("minmax_scaled_features", "success_failure").limit(5).toPandas()


# ### Dividing the dataset

# In[62]:


df_train, df_test = df_scale.randomSplit(weights=[0.7, 0.3], seed=1)
print("Number of observation in train-",df_train.count())
print("Number of observation in test-",df_test.count())
#df_train.count(), df_test.count()

## Example #16
               row["sflow_bpackets"], row["sflow_bbytes"], row["fpsh_cnt"],
               row["bpsh_cnt"], row["furg_cnt"], row["burg_cnt"],
               row["total_fhlen"], row["total_bhlen"], row["dscp"]
           ]))
    return obj


fluxoRDD4 = fluxoDF.rdd.map(transformaVar)

fluxoDF = spSession.createDataFrame(fluxoRDD4, ["rotulo", "atributos"])

scaler = MinMaxScaler(inputCol="atributos",
                      outputCol="scaledFeatures",
                      min=0.0,
                      max=1.0)
scalerModel = scaler.fit(fluxoDF)
scaledData = scalerModel.transform(fluxoDF)

# Indexing is a prerequisite for Decision Trees
stringIndexer = StringIndexer(inputCol="rotulo", outputCol="indexed")
si_model = stringIndexer.fit(scaledData)
obj_final = si_model.transform(scaledData)

X = np.array(obj_final.select("scaledFeatures").collect())
y = np.array(obj_final.select("indexed").collect())

# reshape the attribute matrix to 2D
nsamples, nx, ny = X.shape
d2_X = X.reshape((nsamples, nx * ny))

# Creating the model
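
The example is cut off before the model is actually created; a hedged sketch of one possible next step with scikit-learn's DecisionTreeClassifier (an assumption based on the Decision Tree comment above):

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# split the reshaped attribute matrix and labels, then fit a tree (illustrative only)
X_train, X_test, y_train, y_test = train_test_split(d2_X, y.ravel(), test_size=0.3)
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
print("accuracy:", clf.score(X_test, y_test))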
## Example #17
print("# Correlation matrix:")
print("###########################################")

print(df_covid_count.corr())

###########################################
# Prepare spark model:
###########################################

df_clean = spark.createDataFrame(df_covid_count)
assembler = VectorAssembler().setInputCols(
    ["COVID_COUNT", "FAVS_PER_TWEET", "RT_PER_TWEET",
     "TWEETS_PER_HOUR"]).setOutputCol("IND_VARS")
df_clean_assmbl = assembler.transform(df_clean)
scaler = MinMaxScaler(inputCol="IND_VARS", outputCol="SCALED_IND_VARS")
scaler_model = scaler.fit(df_clean_assmbl.select("IND_VARS"))
scaled_data = scaler_model.transform(df_clean_assmbl)
#scaled_data.show(3)

# split data
splits = scaled_data.randomSplit([0.7, 0.3], 1)
df_train = splits[0]
df_test = splits[1]

# LR model:
lr = LinearRegression(featuresCol="SCALED_IND_VARS",
                      labelCol="MEAN_SENT_POLARITY",
                      maxIter=10,
                      regParam=0.3,
                      elasticNetParam=0.8)
lr_model = lr.fit(df_train)
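
A follow-up sketch for evaluating the fitted model on the held-out split; the RMSE metric choice is mine, not the source's:

from pyspark.ml.evaluation import RegressionEvaluator

predictions = lr_model.transform(df_test)
evaluator = RegressionEvaluator(labelCol="MEAN_SENT_POLARITY",
                                predictionCol="prediction", metricName="rmse")
print("RMSE on test split:", evaluator.evaluate(predictions))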
    nb_classify(training, testing, training.schema.names[0],
                training.schema.names[1])
    rf_classify(training, testing, training.schema.names[0],
                training.schema.names[1])

    knn_classify(training, testing, training.schema.names[0],
                 training.schema.names[1])
    end_time = datetime.datetime.now()
    time_take = int((end_time - start_time).total_seconds())
    print("time taken: ", time_take, " seconds")
    print()
    # training PCA features
    print("------ result of PCA features ------")
    scaler = MinMaxScaler(inputCol=training.schema.names[2],
                          outputCol="scaledPCAFeatures")
    scalerModel = scaler.fit(training)
    training = scalerModel.transform(training)
    testing = scalerModel.transform(testing)

    # Naive Bayes can't deal with negative input features, so we skip the raw PCA features here
    start_time = datetime.datetime.now()
    nb_classify(training, testing, training.schema.names[0],
                "scaledPCAFeatures")
    rf_classify(training, testing, training.schema.names[0],
                training.schema.names[2])
    knn_classify(training, testing, training.schema.names[0],
                 training.schema.names[2])
    end_time = datetime.datetime.now()
    time_take = int((end_time - start_time).total_seconds())
    print("time taken: ", time_take, " seconds")
## Example #19
rfm_seg = rfm_seg.withColumn("m_seg", M_udf("Monetary"))
rfm_seg.show(5)

rfm_seg = rfm_seg.withColumn('RFMScore',
                             F.concat(F.col('r_seg'), F.col('f_seg'), F.col('m_seg')))
rfm_seg.sort(F.col('RFMScore')).show(5)

# statistical summary
simple_summary = rfm_seg.groupby('RFMScore').agg({"Recency": "mean", "Frequency": "mean", "Monetary": "mean"}).sort(
    F.col('RFMScore'))

# Extension: apply k-means clustering section to do the segmentation
from pyspark.ml.linalg import Vectors


def transData(df):
    return df.rdd.map(lambda r: [r[0], Vectors.dense(r[1:])]).toDF(['CustomerID', 'rfm'])


transformed_df = transData(rfm)

# scale the feature matrix
from pyspark.ml.feature import MinMaxScaler

scaler = MinMaxScaler(inputCol='rfm', outputCol="features")
scalerModel = scaler.fit(transformed_df)
scaledData = scalerModel.transform(transformed_df)
scaledData.show(5, False)

# K-means clustering
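
The example stops here; a minimal K-means sketch on the scaled RFM features (k=3 is an assumption):

from pyspark.ml.clustering import KMeans

kmeans = KMeans(featuresCol="features", k=3, seed=1)
kmeans_model = kmeans.fit(scaledData)
kmeans_model.transform(scaledData).groupBy("prediction").count().show()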
print('labeled_data : \n', labeled_data.take(10))

#Part 3
'''
Choose two features and generate a heat map for each feature on a grey scale, showing the variation of each feature across 40 sample instances.
Normalize features between 0 and 1 with 1 representing darkest shade in heat map.
Experiment with minmaxscaler : https://spark.apache.org/docs/latest/ml-features.html#minmaxscaler
'''

lines = data.map(lambda line: line.split(','))
data_transformed = lines.map(lambda line: (line[0], Vectors.dense(line[1:])))
data_labeled_df = sqlContext.createDataFrame(data_transformed,
                                             ["label", "features"])
scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")
scaler_model = scaler.fit(data_labeled_df.limit(40))
scaled_data = scaler_model.transform(data_labeled_df)

print('Labeled DF : \n', data_labeled_df.show(4))
scaled_data.select("features", "scaled_features").show(4)
scaled_data.show(1, False)

#Select any two features and plot heat map
heatmap1 = np.asarray(
    data_labeled_df.rdd.map(
        lambda r: (float(r.features[1]), float(r.features[1]))).take(40))
plt.imshow(heatmap1, cmap='gray')
plt.show()

heatmap2 = np.asarray(
    scaled_data.rdd.map(lambda r:
    timesSvd = []
    vocabSizes = []
    #change the min count of words in order to change the vocabulary size
    for minCnt in range(3,19,3): 
        ##########     WORD2VEC     ###########
        word2Vec = Word2Vec(vectorSize=vecSize, minCount=minCnt, windowSize=10, inputCol="hashtags", outputCol="result")
        trainDf = dfW2v.select("hashtags")
        modelW2v = word2Vec.fit(trainDf)
        resultW2v = modelW2v.transform(dfW2v)
        vocabularySize = modelW2v.getVectors().count()
        print("\n" + str(count) + ".vector size " + str(vecSize) + ", minCount " + str(minCnt) + ", vocabulary_size " + str(vocabularySize))
        ###### MINMAXSCALE #########
        print("scaling the data.....")
        data = resultW2v.withColumnRenamed("result","w2vVector")
        scaler = MinMaxScaler(inputCol="w2vVector", outputCol="scaledFeatures")
        scalerModel = scaler.fit(data)
        scaledData = scalerModel.transform(data)

        tokens = []
        for user in scaledData.select("screen_name","scaledFeatures").collect():
            tokens.append(user[1])
        
        #PCA 
        print("running PCA from Sklearn....")
        start = time.time()
        pcaModel = sklearnPCA(n_components=2)
        pcaValues = pcaModel.fit_transform(tokens)
        end = time.time()
        timePcaSklearn = end - start
        print("ended PCA from Sklearn....")
            
# _*_ coding:utf-8 _*_
'''
MinMaxScaler
'''

from pyspark.sql import SparkSession
from pyspark.ml.feature import MinMaxScaler

spark = SparkSession.builder.appName("MinMaxScaler").getOrCreate()

paths = "/export/home/ry/spark-2.2.1-bin-hadoop2.7/data/mllib/"

dataframe = spark.read.format("libsvm").load(
    paths + "sample_isotonic_regression_libsvm_data.txt")

scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

scalerModel = scaler.fit(dataframe)

scaledData = scalerModel.transform(dataframe)

scaledData.show()
        outputCol="features")

    assembled_train = assembler.transform(train_data)
    assembled_train.select("features", "Class").show(truncate=False)
    training_set = assembled_train.select("features", "Class")

    # Split the data
    train_final, test_final = training_set.randomSplit([0.80, 0.20], seed = 13)
    train_final.describe().show() 
    test_final.describe().show()
	
    train_final = train_final.selectExpr("Class as label", "features as features")
    test_final = test_final.selectExpr("Class as label", "features as features")
    
    scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
    scalerModel = scaler.fit(train_final)
    scaledTData = scalerModel.transform(train_final)
    scaledTData = scaledTData.select("label", "scaledFeatures")
    scaledTData = scaledTData.selectExpr("label as label", "scaledFeatures as features")

    scalerModel = scaler.fit(test_final)
    scaledFData = scalerModel.transform(test_final)
    scaledFData = scaledFData.select("label", "scaledFeatures")
    scaledFData = scaledFData.selectExpr("label as label", "scaledFeatures as features")

    # Classifier 2
    nb = NaiveBayes(smoothing=1.3, modelType="multinomial")

    # train the model
    model = nb.fit(scaledTData)
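
The snippet stops after training; a hedged evaluation sketch on the scaled test split prepared above:

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

predictions = model.transform(scaledFData)
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
print("Naive Bayes accuracy:", evaluator.evaluate(predictions))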
selectedCols = ['label', 'features'] + final_data.columns
df = df.select(selectedCols)
#df.printSchema()

# ## Random Forest Classification

from pyspark.ml.classification import RandomForestClassifier

### MinMax Scaling
from pyspark.ml.feature import MinMaxScaler

scaler = MinMaxScaler(inputCol='features', outputCol='scaledfeatures')

start_time = time.time()

scalermodel = scaler.fit(df)
scalerdata = scalermodel.transform(df)

end_time = time.time()
print("total time taken for Scaling loop in seconds: ", end_time - start_time)

train, test = scalerdata.randomSplit([0.8, 0.2])
start_time = time.time()
rf = RandomForestClassifier(featuresCol="scaledfeatures",
                            labelCol="label",
                            predictionCol="prediction",
                            probabilityCol="probability",
                            rawPredictionCol="rawPrediction")
rfModel = rf.fit(train)
end_time = time.time()
print("total time taken to run rf in seconds: ", end_time - start_time)
## Example #25
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors

features_df = spark.createDataFrame([(
    1,
    Vectors.dense([10.0, 10000.0, 1.0]),
), (
    2,
    Vectors.dense([20.0, 30000.0, 2.0]),
), (
    3,
    Vectors.dense([30.0, 40000.0, 3.0]),
)], ['id', 'features'])
features_df.take(1)
feature_scaler = MinMaxScaler(inputCol='features', outputCol='sfeatures')
smodel = feature_scaler.fit(features_df)
sfeatures_df = smodel.transform(features_df)
sfeatures_df.take(1)
sfeatures_df.select("features", "sfeatures").show()

# 2.3 Standardize numeric data

# In[31]:

from pyspark.ml.feature import StandardScaler
from pyspark.ml.linalg import Vectors

features_df = spark.createDataFrame([(
    1,
    Vectors.dense([10.0, 10000.00, 1.0]),
), (
## Example #27
df_features
# DataFrame[id: bigint, features: vector]
df_features.printSchema()
# root
#  |-- id: long (nullable = true)
#  |-- features: vector (nullable = true)
df_features.count()
# 3
df_features.show()
# +---+------------------+
# | id|          features|
# +---+------------------+
# |  1|[10.0,10000.0,1.0]|
# |  2|[20.0,30000.0,2.0]|
# |  3|[30.0,40000.0,3.0]|
# +---+------------------+
df_features.take(1)
# [Row(id=1, features=DenseVector([10.0, 10000.0, 1.0]))]
df_features.take(2)
# [Row(id=1, features=DenseVector([10.0, 10000.0, 1.0])), Row(id=2, features=DenseVector([20.0, 30000.0, 2.0]))]
featureScaler = MinMaxScaler(inputCol="features", outputCol="sfeatures")
smodel = featureScaler.fit(df_features)
dfSfeatures = smodel.transform(df_features)
dfSfeatures.show(10, False)
# +---+------------------+----------------------------+
# |id |features          |sfeatures                   |
# +---+------------------+----------------------------+
# |1  |[10.0,10000.0,1.0]|[0.0,0.0,0.0]               |
# |2  |[20.0,30000.0,2.0]|[0.5,0.6666666666666666,0.5]|
# |3  |[30.0,40000.0,3.0]|[1.0,1.0,1.0]               |
# +---+------------------+----------------------------+
## Example #28
labeled_data = msd.map(transform_to_labeled_point)

print('labeled_data : \n', labeled_data.take(5))

#Part 3
#Choose two features and generate a heat map for each feature on a grey scale, showing the variation of each feature across 40 sample instances.
#Normalize features between 0 and 1 with 1 representing darkest shade in heat map.
#Experiment with minmaxscaler : https://spark.apache.org/docs/latest/ml-features.html#minmaxscaler

lines = msd.map(lambda line: line.split(','))
msd_transformed = lines.map(lambda line: (line[0], Vectors.dense(line[1:])))
msd_labeled_df = sqlContext.createDataFrame(msd_transformed,
                                            ["label", "features"])
scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")
# Compute summary statistics and generate MinMaxScalerModel for 40 samples
scaler_model = scaler.fit(msd_labeled_df.limit(40))
# rescale each feature to range [min, max].
scaled_data = scaler_model.transform(msd_labeled_df)

print('msd_labeled_df : \n', msd_labeled_df.show(2))
print("Features scaled to range: [%f, %f]" %
      (scaler.getMin(), scaler.getMax()))
scaled_data.select("features", "scaled_features").show(2)
scaled_data.show(1, False)

#Select any two features and plot heat map
heatmap1 = np.asarray(
    msd_labeled_df.rdd.map(
        lambda r: (float(r.features[1]), float(r.features[1]))).take(40))
plt.imshow(heatmap1, cmap='gray')
plt.show()
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("MinMaxScalerExample")\
        .getOrCreate()

    # $example on$
    dataFrame = spark.createDataFrame([
        (0, Vectors.dense([1.0, 0.1, -1.0]),),
        (1, Vectors.dense([2.0, 1.1, 1.0]),),
        (2, Vectors.dense([3.0, 10.1, 3.0]),)
    ], ["id", "features"])

    scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

    # Compute summary statistics and generate MinMaxScalerModel
    scalerModel = scaler.fit(dataFrame)

    # rescale each feature to range [min, max].
    scaledData = scalerModel.transform(dataFrame)
    print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))
    scaledData.select("features", "scaledFeatures").show()
    # $example off$

    spark.stop()
    for column in list(dataset.columns)
]

pipeline = Pipeline(stages=indexers)
dataset_r = pipeline.fit(dataset).transform(dataset)
columnList = [
    item[0] for item in dataset_r.dtypes if item[1].startswith('double')
]
dataset_numeric = dataset_r.select(columnList)

vecAssembler = VectorAssembler(inputCols=list(dataset_numeric.columns),
                               outputCol="features")
transformed = vecAssembler.transform(dataset_numeric)
scaler = MinMaxScaler(inputCol="features",\
         outputCol="scaledFeatures")
scalerModel = scaler.fit(transformed.select("features"))
df_kmeans = scalerModel.transform(transformed)
df_kmeans.show()
df_kmeans = df_kmeans.select('scaledFeatures')

cost = np.zeros(10)
for k in range(2, 10):
    kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("scaledFeatures")
    model = kmeans.fit(df_kmeans.sample(False, 0.1, seed=42))
    cost[k] = model.computeCost(df_kmeans)

fig, ax = plt.subplots(1, 1, figsize=(8, 6))
ax.plot(range(2, 10), cost[2:10])
ax.set_xlabel('k')
ax.set_ylabel('cost')
plt.show()