Example #1
data = tf_idf_features_quora(data)
# Get the text features
data = text_features(data)

# combine all the features
feature_assembler = VectorAssembler(
    inputCols=["tf_idf_features", "text_features"],
    outputCol="combined_features")
data = feature_assembler.transform(data)

# Normalizing each feature to have unit standard deviation
scaler = StandardScaler(inputCol="combined_features",
                        outputCol="features",
                        withStd=True,
                        withMean=False)
scalerModel = scaler.fit(data)
# Normalize each feature to have unit standard deviation.
data = scalerModel.transform(data)

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
label_indexer = StringIndexer(inputCol="label",
                              outputCol="indexedLabel").fit(data)
# Automatically identify categorical features, and index them.
feature_indexer = VectorIndexer(inputCol="features",
                                outputCol="indexedFeatures",
                                maxCategories=2).fit(data)

training_df, test_df = data.randomSplit([0.8, 0.2])
training_df.cache()
test_df.cache()

from __future__ import print_function

from pyspark import SparkContext
from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import StandardScaler
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="StandardScalerExample")
    sqlContext = SQLContext(sc)

    # $example on$
    dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                            withStd=True, withMean=False)

    # Compute summary statistics by fitting the StandardScaler
    scalerModel = scaler.fit(dataFrame)

    # Normalize each feature to have unit standard deviation.
    scaledData = scalerModel.transform(dataFrame)
    scaledData.show()
    # $example off$

    sc.stop()
Example #3
from pyspark.sql.functions import *
from pyspark.ml.linalg import DenseVector

training_dense = training.rdd.map(lambda x: (x[0], DenseVector(x[1:])))
training_dense = spark.createDataFrame(training_dense, ["label", "features"])

test_dense = test.rdd.map(lambda x: (x[0], DenseVector(x[1:])))
test_dense = spark.createDataFrame(test_dense, ["label", "features"])

from pyspark.ml.feature import StandardScaler
standardScaler = StandardScaler(inputCol="features",
                                outputCol="features_scaled",
                                withMean=True)

scaler = standardScaler.fit(training_dense)
scaled_training = scaler.transform(training_dense)
print(scaled_training.head(2))

scaled_test = scaler.transform(test_dense)
print(scaled_test.head(2))

from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib import linalg as mllib_linalg
from pyspark.ml import linalg as ml_linalg


def as_old(v):
    if isinstance(v, ml_linalg.SparseVector):
        return mllib_linalg.SparseVector(v.size, v.indices, v.values)
    if isinstance(v, ml_linalg.DenseVector):
        return mllib_linalg.DenseVector(v.values)
    raise ValueError("Unsupported vector type: {}".format(type(v)))
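
# A hedged usage sketch (not part of the original snippet): the imports above set up a
# conversion from the scaled DataFrames to RDD[LabeledPoint] so the RDD-based
# GradientBoostedTrees API can be trained. "label" and "features_scaled" come from the
# StandardScaler step above; the hyperparameters are illustrative assumptions, and
# trainClassifier expects binary 0/1 labels.
train_lp = scaled_training.rdd.map(
    lambda row: LabeledPoint(row["label"], as_old(row["features_scaled"])))
test_lp = scaled_test.rdd.map(
    lambda row: LabeledPoint(row["label"], as_old(row["features_scaled"])))

gbt_model = GradientBoostedTrees.trainClassifier(
    train_lp, categoricalFeaturesInfo={}, numIterations=20)
predictions = gbt_model.predict(test_lp.map(lambda lp: lp.features))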
Example #4
FeatureData_LocationCoded.toPandas().to_csv('HackDataFeatures_LocationCoded.csv')
FeatureData_LocationCoded.show(5)



# In[3]:

#Scaling the data without Location

from pyspark.ml.feature import StandardScaler

scaler_NoLocation = StandardScaler(inputCol="features_NoLocation", outputCol="scaledFeatures_NoLocation", withStd=True, withMean=False)

# Compute summary statistics by fitting the StandardScaler

scalerModel_NoLocation = scaler_NoLocation.fit(FeatureData_NoLocation)

# Normalize each feature to have unit standard deviation.

FinalData_NoLocation = scalerModel_NoLocation.transform(FeatureData_NoLocation)

FinalData_NoLocation.toPandas().to_csv('HackDataFinal_NoLocation.csv')

###################################################################################################################


#Scaling the data with Location after String Indexer


scaler_LocationIndex = StandardScaler(inputCol="features_LocationIndex", outputCol="scaledFeatures_LocationIndex", withStd=True, withMean=False)
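
# Hedged sketch (not in the original fragment) mirroring the NoLocation block above;
# FeatureData_LocationIndex is an assumed name for the string-indexed DataFrame that
# carries the "features_LocationIndex" column.
scalerModel_LocationIndex = scaler_LocationIndex.fit(FeatureData_LocationIndex)
FinalData_LocationIndex = scalerModel_LocationIndex.transform(FeatureData_LocationIndex)
FinalData_LocationIndex.toPandas().to_csv('HackDataFinal_LocationIndex.csv')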
Example #5
    x for x in c if x != "date" and x != "longitude" and x != "latitude"
    and x != "cumLag" and "lag-" not in x
]

# In[ ]:

from pyspark.ml.feature import VectorAssembler, StandardScaler
assembler = VectorAssembler(inputCols=c, outputCol="features")
scaler = StandardScaler(inputCol="features",
                        outputCol="scaledFeatures",
                        withStd=True,
                        withMean=True)
df1 = assembler.setHandleInvalid("skip").transform(df)
df1.printSchema()
print("df1 count at this point is ", df1.count())
scalarModel = scaler.fit(df1)
df1 = scalarModel.transform(df1)
from pyspark.ml.feature import PCA
pca = PCA(k=40, inputCol="scaledFeatures", outputCol="pcaFeatures")
model = pca.fit(df1)
result = model.transform(df1).select('date', 'latitude', 'longitude',
                                     'pcaFeatures')

# In[ ]:

result = result.coalesce(200)
result.write.parquet(
    "s3a://dse-cohort5-group5/wildfire_capstone/integratedData/completePCA",
    mode="overwrite",
    compression='gzip')
#result =(
#pipeline_sel
#.fit(clustering_df)
#.transform(clustering_df)

#)
#print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
#result.show()
# standardization
standard_scaler = StandardScaler(
    inputCol="initial_features", 
    outputCol="features", 
    withStd=True, 
    withMean=True)
vectorized_df = vector_assembler.transform(clustering_df)
model_scaler = standard_scaler.fit(vectorized_df)
featurized_clustering_df = model_scaler.transform(vectorized_df)
featurization_pipeline = Pipeline(stages=[vector_assembler, standard_scaler])
featurization_pipeline_model = featurization_pipeline.fit(clustering_df)
model_scaler = featurization_pipeline_model.stages[-1]
featurized_clustering_df = featurization_pipeline_model.transform(clustering_df)
sse_cost = np.zeros(20)
#path_metrics_kmeans_sse = "../data/metrics_kmeans_see.jsonl"
# Start the cluster-count selection: save the metrics to a JSON file and also create an image file
# showing the curve that helps us decide how many clusters are needed.
for k in range(2,10):
    kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
    model = kmeans.fit(featurized_clustering_df.sample(False,0.1, seed=42))
    sse_cost[k] = model.computeCost(featurized_clustering_df)
    metrics_row = {"k": k, "sse": sse_cost[k]}
    # write the metrics to JSON
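    # Hedged sketch (the original loop body is cut off here) of the step described above:
    # append this iteration's metrics as one JSON line. Assumes `import json` at the top
    # of the script and reuses the path from the commented-out variable above.
    with open("../data/metrics_kmeans_see.jsonl", "a") as metrics_file:
        metrics_file.write(json.dumps({"k": k, "sse": float(sse_cost[k])}) + "\n")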
Example #7
cols = raw_data.columns
cols.remove("Outcome")

#let us import the vector assembler
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=cols, outputCol="features")

#now let us use the transform method
raw_data = assembler.transform(raw_data)
# print(raw_data.select("features").show(truncate=False))

#standard scaler
from pyspark.ml.feature import StandardScaler
standardscaler = StandardScaler().setInputCol("features").setOutputCol(
    "scaled_features")
raw_data = standardscaler.fit(raw_data).transform(raw_data)
# print(raw_data.select("features", "scaled_features").show())

#train test split
train, test = raw_data.randomSplit([0.8, 0.2], seed=12345)

#let us check whether there is imbalance in the dataset
dataset_size = float(train.select("Outcome").count())
numPositives = train.select("Outcome").where('Outcome == 1').count()
per_ones = (float(numPositives) / float(dataset_size)) * 100
numNegatives = float(dataset_size - numPositives)
# print('The number of ones are {}'.format(numPositives))
# print('Percentage of ones are {}'.format(per_ones))

#Imbalanced dataset
# In our training dataset we have 34.27% positives and 65.73% negatives. Since negatives are in the majority, the logistic loss objective function should treat the positive class (Outcome == 1) with a higher weight. For this purpose we calculate the BalancingRatio as follows:
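# Hedged sketch of the step the comment describes (the original is cut off here):
# BalancingRatio = negatives / total, used to weight the positive class so the
# logistic loss treats Outcome == 1 more heavily.
from pyspark.sql.functions import when

balancingRatio = numNegatives / dataset_size
train = train.withColumn(
    "classWeights",
    when(train["Outcome"] == 1, balancingRatio).otherwise(1 - balancingRatio))
# classWeights can then be passed to LogisticRegression(weightCol="classWeights").
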
bucketer.transform(contDF).show()


# COMMAND ----------

from pyspark.ml.feature import QuantileDiscretizer
bucketer = QuantileDiscretizer().setNumBuckets(5).setInputCol("id")
fittedBucketer = bucketer.fit(contDF)
fittedBucketer.transform(contDF).show()


# COMMAND ----------

from pyspark.ml.feature import StandardScaler
sScaler = StandardScaler().setInputCol("features")
sScaler.fit(scaleDF).transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import MinMaxScaler
minMax = MinMaxScaler().setMin(5).setMax(10).setInputCol("features")
fittedminMax = minMax.fit(scaleDF)
fittedminMax.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import MaxAbsScaler
maScaler = MaxAbsScaler().setInputCol("features")
fittedmaScaler = maScaler.fit(scaleDF)
Example #9
    trained_parameters = load_trained_parameters(trained_parameters, {"input_column": input_column,})

    scaler_model, scaler_model_loaded = load_pyspark_model_from_trained_parameters(
        trained_parameters, MinMaxScalerModel, "scaler_model"
    )

    if scaler_model is None:
        scaler = MinMaxScaler(inputCol=temp_vector_col, outputCol=temp_normalized_vector_col)
        scaler_model = fit_and_save_model(trained_parameters, "scaler_model", scaler, assembled_wo_nans)

    output_df = transform_using_trained_model(scaler_model, assembled, scaler_model_loaded)

    scaler = MaxAbsScaler(inputCol=temp_vector_col, outputCol=temp_normalized_vector_col)

    output_df = scaler.fit(assembled_wo_nans).transform(assembled)

    # convert the resulting vector back to numeric
    temp_flattened_vector_col = temp_col_name(output_df)
    output_df = output_df.withColumn(temp_flattened_vector_col, vector_to_array(temp_normalized_vector_col))

    # keep only the final scaled column.
    output_column = input_column if output_column is None or not output_column else output_column
    output_column_value = sf.col(temp_flattened_vector_col)[0].alias(output_column)
    output_df = output_df.withColumn(output_column, output_column_value)
    final_columns = list(dict.fromkeys((list(df.columns) + [output_column])))
    output_df = output_df.select(final_columns)

    return default_spark_with_trained_parameters(output_df, trained_parameters)

Example #10
cols = [
    'Session_Connection_Time', 'Bytes_Transferred', 'Kali_Trace_Used',
    'Servers_Corrupted', 'Pages_Corrupted', 'WPM_Typing_Speed'
]

#Assembling The Features
assembler = VectorAssembler(inputCols=cols, outputCol='features')

#Creating the new Dataframe with Features
assembled_data = assembler.transform(data)

#Scaling the Features
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')

scaler_model = scaler.fit(assembled_data)

scaled_data = scaler_model.transform(assembled_data)

#Creating the Model
k_means = KMeans(featuresCol='scaledFeatures', k=n)

#Training The Model
model = k_means.fit(scaled_data)

#Prediction
model_data = model.transform(scaled_data)

#Grouping and Displaying By Cluster
model_data.groupBy('prediction').count().show()
Example #12
        pickle.dump(dummy_info,
                    open(os.path.expanduser(dummy_info_path["path"]), 'wb'))
        print("dummy_info saved in:\t" + dummy_info_path["path"])

    # Feature columns
    features_x_name = list(set(usecols_x) - set(Y_name) - set(dummy_columns))
    assembler_x = VectorAssembler(inputCols=features_x_name,
                                  outputCol="features_x_raw")
    data_sdf_i = assembler_x.transform(data_sdf_i)

    # Standardize the non-categorical data.
    scaler = StandardScaler(inputCol="features_x_raw",
                            outputCol="features_x_std",
                            withStd=True,
                            withMean=True)
    scalerModel = scaler.fit(data_sdf_i)
    data_sdf_i = scalerModel.transform(data_sdf_i)

    # Assemble all vectors
    assembler_all = VectorAssembler(
        inputCols=["features_x_std", "features_ONEHOT"], outputCol="features")
    data_sdf_i = assembler_all.transform(data_sdf_i)

    # Model specification
    lr = SLogisticRegression(
        labelCol=Y_name,
        featuresCol="features",
        fitIntercept=
        fit_intercept,  # Already standardized with non-dummy columns
        standardization=False
    )  # , maxIter=100, regParam=0.3, elasticNetParam=0.8)
Example #13
    training_random
    .withColumn(
        "Weight",F.when(F.col("Class") == 0, 1.0)
          .otherwise(10.0)   # Class == 1 gets weight 10.0
    )
)
test_weighted=test_random
 

#======= Justify the choice of sampling method, focusing on logistic regression
 
 
#logistic regression
#choose the proper k for PCA (using the training data only, e.g. training_random)
standard_scaler = StandardScaler(inputCol="raw_Features", outputCol="scaled_features")
standard_fit = standard_scaler.fit(training_random)
standard_train=standard_fit.transform(training_random)
pca = PCA(k=10, inputCol="scaled_features", outputCol="pca_features") 
model_pca = pca.fit(standard_train)
#model_pca = pca.fit(standard_train) 
#model_pca.explainedVariance   #Returns a vector of proportions of variance explained by each principal component.
tt=[round(num,3) for num in model_pca.explainedVariance]
print(tt) 
#[0.422, 0.306, 0.149, 0.071, 0.024, 0.015, 0.007, 0.004, 0.002, 0.0]
plt.figure()
plt.plot(range(1,11),model_pca.explainedVariance)
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)') 
plt.title('Explained Variance')
plt.savefig('PCA.png')
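
# Hedged sketch (not in the original) of actually "choosing the proper k": pick the
# smallest number of components whose cumulative explained variance reaches ~95%.
import numpy as np

cum_var = np.cumsum(model_pca.explainedVariance.toArray())
chosen_k = int(np.argmax(cum_var >= 0.95)) + 1
print("Smallest k explaining >= 95% of the variance:", chosen_k)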
 
Example #14
        print "Loading RAW data..."
        raw_data = sc.textFile(data_file)

        print "Parsing dataset..."
        parsed_labelpoint = raw_data.map(kup.parse_multiClass)
        parsed_labelpoint_df = spark.createDataFrame(parsed_labelpoint,
                                                     ["label", "features"])

        print "Standardizing data..."
        scaler = StandardScaler(inputCol="features",
                                outputCol="scaledFeatures",
                                withStd=True,
                                withMean=True)

        # train a scaler to perform feature scaling
        scalerModel = scaler.fit(parsed_labelpoint_df)
        shutil.rmtree(scalerPath, ignore_errors=True)
        scalerModel.save(scalerPath)

        # Normalize each feature to have unit standard deviation.
        train_df_tmp = scalerModel.transform(parsed_labelpoint_df)
        train_df = train_df_tmp.drop("features").withColumnRenamed(
            "scaledFeatures", "features")

        # show the frequency of each label
        tmp_df = train_df.groupBy("label").count()
        tmp_df.show(10)

        lr = LogisticRegression(maxIter=10, regParam=0.01, elasticNetParam=0.2)

        # instantiate the One Vs Rest Classifier.
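        # Hedged completion sketch (the original is cut off here): wrap the binary
        # LogisticRegression in a One-vs-Rest meta-classifier for the multi-class
        # labels; assumes `from pyspark.ml.classification import OneVsRest` at the top.
        ovr = OneVsRest(classifier=lr)
        ovrModel = ovr.fit(train_df)
        predictions = ovrModel.transform(train_df)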
Example #15
  percentile(t_tempo, 0.5) a_tempo
from track, track_artists ta, artist
where ta.track_id = track.t_id and artist.a_id = ta.artist_id
group by a_id
    """.format(args.agg_table_name))

    agg_table = spark.table(args.agg_table_name)

    existing_tables = [table.name for table in spark.catalog.listTables()]
    # K-means on artist features
    if args.feature_kmeans_table_name not in existing_tables:
        # normalize features
        va = VectorAssembler(inputCols=[column for column in agg_table.columns if column != "a_id"], outputCol="raw_features")
        feature_table = va.transform(agg_table)
        standard_scaler = StandardScaler(inputCol="raw_features", outputCol="features")
        feature_table = standard_scaler.fit(feature_table).transform(feature_table).select("a_id", "raw_features", "features")
        feature_table.show()

        # k-means
        kmeans = KMeans(k=100)
        model = kmeans.fit(feature_table)
        clustered = model.transform(feature_table).select("a_id", "prediction")
        #clustered.show()
        clustered.write.saveAsTable(args.feature_kmeans_table_name, format="orc", mode="error")

    if args.smoothed_kmeans_table_name not in existing_tables:
        # Compute artist collaboration graph as edge list with self-loop
        collaboration = spark.sql("select a.artist_id node, b.artist_id neighbor from track_artists a, track_artists b where a.track_id = b.track_id") # and a.artist_id != b.artist_id
        collaboration.registerTempTable("collaboration")
        # Smooth the features of artists by averaging over their neighbors. For artist with no collaborator, its features should remain unchanged.
        artist_features = spark.sql("""select node, avg(am.a_track_number) track_number, avg(am.a_mode) modality, avg(am.a_acousticness) acousticness, avg(am.a_danceability) danceability, avg(am.a_energy) energy,
def rescale_df(data):
    """Rescale the data."""
    standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled")
    scaler = standardScaler.fit(data)
    scaled_df = scaler.transform(data)
    return scaled_df
Example #17
# Save the query output into a bucket
query.write.save(sys.argv[2] + '/Query_Iris_' + str(date), format="json")

# ML
# Pre-process the data
assembler = VectorAssembler(
    inputCols=['sepal_length', 'sepal_width','petal_length','petal_width'],
    outputCol="raw_features")
vector_df = assembler.transform(df)

# Scale features to have zero mean and unit standard deviation
standarizer = StandardScaler(withMean=True, withStd=True,
                              inputCol='raw_features',
                              outputCol='features')
model = standarizer.fit(vector_df)
vector_df = model.transform(vector_df)

# Convert label to number
indexer = StringIndexer(inputCol="variety", outputCol="label")
indexed = indexer.fit(vector_df).transform(vector_df)
indexed.show(10)

# Select features
iris = indexed.select(['features', 'label'])

# LR
train, test = iris.randomSplit([0.7, 0.3])
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)
LRmodel = lr.fit(train)
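
# Hedged evaluation sketch (not in the original fragment): score the held-out split
# with a multi-class accuracy metric.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

predictions = LRmodel.transform(test)
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
print("Test accuracy: {:.3f}".format(evaluator.evaluate(predictions)))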
# In[26]:

#scaler = Normalizer(inputCol="TitleAndBodyLengthVector", outputCol="ScaledNumFeatures")
#df = scaler.transform(df)

df.select(["id", "ScaledNumFeatures"]).where(df.Id == "512").collect()

# # Question 4
# Using the StandardScaler method (scaling both the mean and the standard deviation) what's the normalized value for question Id = 512?

# In[27]:

scaler2 = StandardScaler(inputCol="TitleAndBodyLengthVector",
                         outputCol="ScaledNumFeatures2",
                         withStd=True)
scalerModel = scaler2.fit(df)
df = scalerModel.transform(df)
df.select(["id", "ScaledNumFeatures2"]).where(df.Id == "512").collect()

# # Question 5
# Using the MinMaxScaler method what's the normalized value for question Id = 512?

# In[29]:

from pyspark.ml.feature import MinMaxScaler
scaler3 = MinMaxScaler(inputCol="TitleAndBodyLengthVector",
                       outputCol="ScaledNumFeatures3")
scalerModel3 = scaler3.fit(df)
df = scalerModel3.transform(df)

df.select(["id", "ScaledNumFeatures3"]).where(df.Id == "512").collect()
Example #19
|[10.0,139.0,80.0,29.153419593345657,155.5482233502538,27.1,1.441,57.0]             |
|[1.0,189.0,60.0,23.0,846.0,30.1,0.398,59.0]                                        |
|[5.0,166.0,72.0,19.0,175.0,25.8,0.587,51.0]                                        |
|[7.0,100.0,72.40518417462484,29.153419593345657,155.5482233502538,30.0,0.484,32.0] |
|[0.0,118.0,84.0,47.0,230.0,45.8,0.551,31.0]                                        |
|[7.0,107.0,74.0,29.153419593345657,155.5482233502538,29.6,0.254,31.0]              |
|[1.0,103.0,30.0,38.0,83.0,43.3,0.183,33.0]                                         |
|[1.0,115.0,70.0,30.0,96.0,34.6,0.529,32.0]                                         |
+-----------------------------------------------------------------------------------+
'''
##################################################################################
# StandardScaler to scale the newly created "features" column
##################################################################################
standardScalar = StandardScaler().setInputCol("features").setOutputCol(
    "Scaled_features")
raw_data = standardScalar.fit(raw_data).transform(raw_data)
raw_data.select("features", "Scaled_features").show(5, truncate=False)
'''
+---------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+
|[6.0,148.0,72.0,35.0,155.5482233502538,33.6,0.627,50.0]              |[1.7806383732194306,4.862670805688543,5.952210601826984,3.9813708583353558,1.8295247783934943,4.887165154544966,1.8923811872495484,4.251616970894646] |
|[1.0,85.0,66.0,29.0,155.5482233502538,26.6,0.351,31.0]               |[0.29677306220323846,2.7927501248886903,5.456193051674735,3.29885013976358,1.8295247783934943,3.869005747348098,1.0593712866420917,2.6360025219546803]|
|[8.0,183.0,64.0,29.153419593345657,155.5482233502538,23.3,0.672,32.0]|[2.3741844976259077,6.0126267394662385,5.290853868290652,3.316302148279125,1.8295247783934943,3.3890163125267176,2.0281980188703295,2.721034861372573]|
|[1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0]                            |[0.29677306220323846,2.9241736601775696,5.456193051674735,2.616329421191805,1.1056078010080843,4.087182763175998,0.5040313529037872,1.785679127775751]|
|[0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0]                          |[0.0,4.501256083644124,3.3067836676816578,3.9813708583353558,1.975979899674023,6.268952921455001,6.905531349963264,2.806067200790466]                 |
+---------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+
'''
#Split data into Training and Test
train, test = raw_data.randomSplit([0.8, 0.2], seed=12345)

#Let us check whether there is imbalance in the dataset
dataset_size = float(train.select("Outcome").count())
def run_standard_scaler(t_data):
    standardscaler = StandardScaler().setInputCol("features").setOutputCol(
        "scaled_features")
    t_data = standardscaler.fit(t_data).transform(t_data)

    return t_data
Example #21
# In[46]:


#Scaling Data prior to SMOTE


# In[47]:


scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=True)


# In[48]:


scalerModel = scaler.fit(df_baseline)


# In[49]:


scaledData = scalerModel.transform(df_baseline)


# In[50]:


scaledData = scaledData.drop("features")


# In[51]:
Example #22
train, test = df_transformed.randomSplit(
    [float(sys.argv[1]), float(sys.argv[2])], seed=7)
#train60,test40 = df_transformed.randomSplit([0.6,0.4],seed=7)
#train70,test30 = df_transformed.randomSplit([0.7, 0.3], seed=7)
#train80,test20 = df_transformed.randomSplit([0.8,0.2],seed=7)
#train90,test10 = df_transformed.randomSplit([0.9,0.1],seed=7)

logger.error("#### after split")
logger.error("####  standardscaler on train dataset")

standardizer = StandardScaler(withMean=True,
                              withStd=True,
                              inputCol='features',
                              outputCol='std_features')
standardizer_model = standardizer.fit(train)
standardized_features_df70 = standardizer_model.transform(train)

logger.error("#### standardscaler on test dataset")

standardizer = StandardScaler(withMean=True,
                              withStd=True,
                              inputCol='features',
                              outputCol='std_features')
standardizer_model = standardizer.fit(test)
standardized_features_df30 = standardizer_model.transform(test)

from pyspark.ml.feature import VectorAssembler, StandardScaler, PCA

logger.error("######  pca on standarded scaler using train")
pca = PCA(k=2, inputCol="std_features", outputCol="pca_features")
Example #23
# Scale the data: because all features are used to calculate distances, they should be on the same scale

workdingDF.columns

# do not use rowID (it is kept only so results can be stored)
# max_wind_speed has a high correlation with the other wind* columns, so it would not need to be included either
featureColumns = ['air_pressure','air_temp','avg_wind_direction','avg_wind_speed','max_wind_direction','max_wind_speed','relative_humidity']
assembler = VectorAssembler(inputCols=featureColumns,outputCol='features_unscaled')
assembled = assembler.transform(workdingDF)

# scale
# (each column - mean) / std  =>  mean 0, std 1
scaler = StandardScaler(inputCol='features_unscaled',outputCol='features',withStd=True,withMean=True)

scaleModel = scaler.fit(assembled)
scaleData = scaleModel.transform(assembled)

#(X - mean)/std is computed separately for each attribute/column.
'''
Subtract each attribute's (column-wise) mean from the data and divide by its standard deviation.
As a result, for every attribute/column the data is centered around 0 with variance 1.
'''

# create elbow plot to see the number of centers
#This method involves applying k-means, using different values for k, and calculating the within-cluster sum-of-squared error (WSSE). Since this means applying k-means multiple times, this process can be very compute-intensive. To speed up the process, we will use only a subset of the dataset. We will take every third sample from the dataset to create this subset:

scaleData = scaleData.select('features', 'rowID')
elbowset = scaleData.filter((scaleData.rowID % 3 == 0)).select('features')
elbowset.persist()
#The last line calls the persist() method to tell Spark to keep the data in memory (if possible), which will speed up the computations.
clusters = range(2,31)
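
# Hedged sketch (the original loop is cut off here) of the elbow computation described
# above: fit k-means for each k on the subset and record the within-cluster
# sum-of-squared error (WSSE). computeCost is the Spark 2.x API; on Spark 3 use
# ClusteringEvaluator instead.
from pyspark.ml.clustering import KMeans

wsseList = []
for k in clusters:
    kmeans = KMeans(k=k, seed=1, featuresCol='features')
    model = kmeans.fit(elbowset)
    wsseList.append(model.computeCost(elbowset))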
Example #24
def main(input, model_file):
    # Defining the schema for Lift1 datasets
    def sensor_schema():
        sen_schema = types.StructType([
            types.StructField('timestamp', types.StringType()),
            types.StructField('X', types.DoubleType()),
            types.StructField('Y', types.DoubleType()),
            types.StructField('Z', types.DoubleType()),
        ])
        return sen_schema

    def calc_score(count, min_max_collection):
        min_count = float(min_max_collection[0])
        max_count = float(min_max_collection[1])
        score = (max_count - count) / (max_count - min_count)
        return score

    sens_schema = sensor_schema()
    #Spark read of data
    temp = spark.read.csv(input, schema=sens_schema)

    # Select the time range from 2018-07-09 to 2018-08-09; data outside this range is not needed
    temp.createOrReplaceTempView("temp")
    temp = spark.sql(
        "select timestamp,Z from temp where timestamp between '2018-07-09 12:00:00' and '2018-08-09 12:00:00'"
    )

    # The code below applies a standard scaler to achieve Z-normalization, i.e. mean 0 and standard deviation 1.
    # UDF for converting column type from vector to double type
    unlist = udf(lambda x: round(float(list(x)[0]), 6), types.DoubleType())

    assembler = VectorAssembler(inputCols=["Z"], outputCol="Zvector")
    tempdata = assembler.transform(temp)
    scaler = StandardScaler(inputCol="Zvector",
                            outputCol="Zscale",
                            withMean=True,
                            withStd=True)
    scalerModel = scaler.fit(tempdata)
    scaledData = scalerModel.transform(tempdata).withColumn(
        "Zscale", unlist("Zscale")).drop("Zvector").cache()
    scaledData.show()

    #Conversion of timestamp string to timestamp type. This is for smoothing purpose
    scaledData = scaledData.withColumn(
        "times", to_timestamp("timestamp", 'yyyy-MM-dd HH:mm:ss')).cache()

    #Obtain moving averages
    movAvg = scaledData.withColumn(
        "movingAverage",
        avg(scaledData["Zscale"]).over(
            Window.partitionBy(scaledData["times"]).rowsBetween(-3,
                                                                3))).cache()
    movAvg.show()

    #Round the moving-average value to 0 decimal places
    movAvg.createOrReplaceTempView("movAvg")
    scaledNorm = spark.sql(
        "select times,Zscale,round(movingAverage,0) as Zround from movAvg"
    ).cache()
    scaledNorm.show()

    #Feature transform for K means
    cols = ["Zscale", "Zround"]
    ft_assembler = VectorAssembler(inputCols=cols, outputCol="features")
    in_df = ft_assembler.transform(scaledNorm)
    kmeans = KMeans().setK(5).setSeed(1)
    model = kmeans.fit(in_df)

    # Make predictions
    predict = model.transform(in_df).cache()
    predict.show()

    #Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator()

    silhouette = evaluator.evaluate(predict)
    print("Silhouette with squared euclidean distance = " + str(silhouette))

    #Shows the result
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)

    #Saving the model
    model.write().overwrite().save(model_file)

    #Calculate the total count of each cluster
    count_df = predict.groupBy("prediction").count().cache()
    count_df.show()

    #count_df.createOrReplaceTempView("count_df")
    #min_max_list = spark.sql("select min(count) as min,max(count) as max from count_df group by count").collect()[0]
    min_max_list = count_df.agg(min('count'), max('count')).collect()[0]
    print(min_max_list)

    #Calculating the scores
    udf_calc_score = udf(lambda count: calc_score(float(count), min_max_list),
                         types.FloatType())
    anom_score = count_df.withColumn("score", udf_calc_score("count")).cache()
    anom_score.show()

    #Populating scores
    predict = predict.join(anom_score,
                           "prediction").select("times", "Zscale", "Zround",
                                                "prediction", "score")
    predict.show()

    #Anomaly detection based on threshold
    anomaly = predict.where(predict["score"] > 0.9999)
    anomaly.show()

    #Writing to a csv file
    anomaly.coalesce(1).orderBy("times").write.csv("kmeansout")
    df = df.withColumn("THD18th", functions.col("THD18th") * 100) \
        .withColumn("THD42th", functions.col("THD42th") * 100)   # Convert to percentage values, 注意换行符后面不能有空格,否则报错!

    df = df.select("Hours",
                   "THD18th")  # Select 'Hours' & 'THD18th' for analysis
    df.show(24)

    input_data = df.rdd.map(
        lambda x:
        (x[0], DenseVector(x[1:])))  # Create a new dataframe with labels
    labeled_df = sqlContext.createDataFrame(input_data, ["label", "features"])
    labeled_df.show(24)

    standardScaler = StandardScaler(inputCol="features",
                                    outputCol="features_scaled")  # Re-scaling
    scaler = standardScaler.fit(labeled_df)
    scaled_df = scaler.transform(labeled_df)
    scaled_df.show(24)

    train_data, test_data = scaled_df.randomSplit(
        [0.7, 0.3])  # Randomly choose 30% as test data
    test_data.show(24)

    #lr = LinearRegression(labelCol="label", maxIter=10, regParam=0.3, elasticNetParam=0.8)   # Train models
    lr = GeneralizedLinearRegression(family="gaussian",
                                     link="identity",
                                     maxIter=10,
                                     regParam=0.3)
    linearModel = lr.fit(train_data)

    predicted = linearModel.transform(test_data)  # Prediction
def transform(spark, s3_input_data, s3_output_train_data,
              s3_output_validation_data, s3_output_test_data):
    print('Processing {} => {}'.format(s3_input_data, s3_output_train_data,
                                       s3_output_validation_data,
                                       s3_output_test_data))

    schema = StructType([
        StructField('marketplace', StringType(), True),
        StructField('customer_id', StringType(), True),
        StructField('review_id', StringType(), True),
        StructField('product_id', StringType(), True),
        StructField('product_parent', StringType(), True),
        StructField('product_title', StringType(), True),
        StructField('product_category', StringType(), True),
        StructField('star_rating', IntegerType(), True),
        StructField('helpful_votes', IntegerType(), True),
        StructField('total_votes', IntegerType(), True),
        StructField('vine', StringType(), True),
        StructField('verified_purchase', StringType(), True),
        StructField('review_headline', StringType(), True),
        StructField('review_body', StringType(), True),
        StructField('review_date', StringType(), True)
    ])

    df_csv = spark.read.csv(path=s3_input_data,
                            sep='\t',
                            schema=schema,
                            header=True,
                            quote=None)
    df_csv.show()

    # This dataset should already be clean, but always good to double-check
    print('Showing null review_body rows...')
    df_csv.where(col('review_body').isNull()).show()

    df_csv_cleaned = df_csv.na.drop(subset=['review_body'])
    df_csv_cleaned.where(col('review_body').isNull()).show()

    tokenizer = Tokenizer(inputCol='review_body', outputCol='words')
    wordsData = tokenizer.transform(df_csv_cleaned)

    hashingTF = HashingTF(inputCol='words',
                          outputCol='raw_features',
                          numFeatures=1000)
    featurizedData = hashingTF.transform(wordsData)

    # While applying HashingTF only needs a single pass to the data, applying IDF needs two passes:
    # 1) compute the IDF vector
    # 2) scale the term frequencies by IDF
    # Therefore, we cache the result of the HashingTF transformation above to speed up the 2nd pass
    featurizedData.cache()

    # spark.mllib's IDF implementation provides an option for ignoring terms
    # which occur in less than a minimum number of documents.
    # In such cases, the IDF for these terms is set to 0.
    # This feature can be used by passing the minDocFreq value to the IDF constructor.
    idf = IDF(inputCol='raw_features', outputCol='features')  #, minDocFreq=2)
    idfModel = idf.fit(featurizedData)
    features_df = idfModel.transform(featurizedData)
    features_df.select('star_rating', 'features').show()

    num_features = 300
    pca = PCA(k=num_features, inputCol='features', outputCol='pca_features')
    pca_model = pca.fit(features_df)
    pca_features_df = pca_model.transform(features_df).select(
        'star_rating', 'pca_features')
    pca_features_df.show(truncate=False)

    standard_scaler = StandardScaler(inputCol='pca_features',
                                     outputCol='scaled_pca_features')
    standard_scaler_model = standard_scaler.fit(pca_features_df)
    standard_scaler_features_df = standard_scaler_model.transform(
        pca_features_df).select('star_rating', 'scaled_pca_features')
    standard_scaler_features_df.show(truncate=False)

    expanded_features_df = (standard_scaler_features_df.withColumn(
        'f', to_array(col('scaled_pca_features'))).select(
            ['star_rating'] + [col('f')[i] for i in range(num_features)]))
    expanded_features_df.show()

    train_df, validation_df, test_df = expanded_features_df.randomSplit(
        [0.9, 0.05, 0.05])

    train_df.write.csv(path=s3_output_train_data, header=None, quote=None)
    print('Wrote to output file:  {}'.format(s3_output_train_data))

    validation_df.write.csv(path=s3_output_validation_data,
                            header=None,
                            quote=None)
    print('Wrote to output file:  {}'.format(s3_output_validation_data))

    test_df.write.csv(path=s3_output_test_data, header=None, quote=None)
    print('Wrote to output file:  {}'.format(s3_output_test_data))
Example #27
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression

# Pull our data into a Spark dataframe
df = spark.sql("select * from sensor_readings")

# Extract the columns that we want in our feature vector
featureColumns = df.drop("timestamp","Sensor-Predict").columns

# First we will use `VectorAssembler` to combine all feature columns into a feature vector (optimized data structure for ML)
assembler = VectorAssembler(inputCols=featureColumns, outputCol="featureVector")
dfVector = assembler.transform(df)

# Then we will scale each sensor's values to have unit standard deviation (withMean=False keeps the original mean)
scaler = StandardScaler(inputCol="featureVector", outputCol="features", withStd=True, withMean=False)
dfScaled = scaler.fit(dfVector).transform(dfVector)

display(dfScaled.select("features","Sensor-Predict"))

# COMMAND ----------

# MAGIC %md
# MAGIC ## Model Training
# MAGIC With our scaled and vectorized feature set, we can now train a linear regression model against the data.
# MAGIC 
# MAGIC Databricks can also visualize model residuals, as well as ROC curves and decision trees.

# COMMAND ----------

# Split the data into a training and test dataset
(trainingData, testingData) = dfScaled.randomSplit([0.7, 0.3])
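
# COMMAND ----------

# Hedged sketch (the original notebook is cut off here) of the training step the
# markdown cell above describes: fit a LinearRegression on the scaled feature
# vector to predict the "Sensor-Predict" column.
lr = LinearRegression(featuresCol="features", labelCol="Sensor-Predict")
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testingData)
display(predictions.select("Sensor-Predict", "prediction"))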
hashingTF = HashingTF(numFeatures=285,
                      inputCol='concat_(stop_words, com_skips)',
                      outputCol='features')
tf1 = hashingTF.transform(df_all_words1)

# Normalize the counts so that they are a percentage of total counts of the features

tf_norm1 = Normalizer(inputCol="features", outputCol="features_norm",
                      p=1).transform(tf1)

# Standardize the vector based on average use of each feature among all users
stdscaler = StandardScaler(inputCol='features_norm',
                           outputCol='scaled',
                           withMean=True)
scale_fit1 = stdscaler.fit(tf_norm1)
scaled1 = scale_fit1.transform(tf_norm1)

# Do all of the above for subset #2

comments2 = df2.groupBy("author").agg(F.collect_list("body"))
join_comments_udf = udf(lambda x: ' '.join(x), StringType())
df2_join_comments = comments2.withColumn(
    'corpus', join_comments_udf(comments2['collect_list(body)']))

df_count_links2 = df2_join_comments.withColumn(
    'link_count', count_links_udf(df2_join_comments['corpus']))

df_drop_links2 = df_count_links2.withColumn(
    'corpus', drop_links_udf(df_count_links2['corpus']))
Example #29
    def initialize(self, do_scaling=True, do_onehot=True):
        """Reads the dataset, initializes class members.

    features_df: Original DataFrame as read from the features_file.
    train_df: A DataFrame with columns Lat, Lon, Pickup_Count and
        vector columns Features & ScaledFeatures. Contains only data before 2015.
    test_df: As train_df, but only containing data of 2015.
    districts_with_counts: A DataFrame with all districts and their counts.
    """

        # Read feature dataframe
        self.features_df = self.sql_context.read.parquet(
            self.features_file).cache()

        # Set exclude columns to default
        exclude_columns = self.EXCLUDE_COLUMNS

        # Scale features
        if do_scaling:
            assembler = VectorAssembler(inputCols=self.SCALE_COLUMNS,
                                        outputCol='FeaturesToScale')
            self.features_df = assembler.transform(self.features_df)
            scaler = StandardScaler(inputCol='FeaturesToScale',
                                    outputCol=('ScaledFeatures'),
                                    withStd=True,
                                    withMean=False)
            self.features_df = scaler.fit(self.features_df).transform(
                self.features_df)

            exclude_columns += self.SCALE_COLUMNS + ['FeaturesToScale']

        # Adapt categorical features that do not have a value range of [0, numCategories)
        for column in ['Day', 'Month', 'Day_Of_Year']:
            if column in self.features_df.columns:
                self.features_df = self.features_df.withColumn(
                    column, self.features_df[column] - 1)

        # Encode categorical features using one-hot encoding
        if do_onehot:
            vec_category_columns = [
                '%s_Vector' % column for column in self.ONE_HOT_COLUMNS
            ]
            for i in range(len(self.ONE_HOT_COLUMNS)):
                column = self.ONE_HOT_COLUMNS[i]
                if column in self.features_df.columns:
                    self.features_df = self.features_df.withColumn(
                        column, self.features_df[column].cast(DoubleType()))
                    encoder = OneHotEncoder(inputCol=column,
                                            outputCol=vec_category_columns[i],
                                            dropLast=False)
                    self.features_df = encoder.transform(self.features_df)
            exclude_columns += self.ONE_HOT_COLUMNS

        # Vectorize features
        feature_columns = [
            column for column in self.features_df.columns
            if column not in exclude_columns
        ]
        assembler = VectorAssembler(inputCols=feature_columns,
                                    outputCol='Features')
        self.features_df = assembler.transform(self.features_df)

        # Set number of distinct values for categorical features (identified by index)
        self.categorical_features_info = {}
        if not do_onehot:
            self.categorical_features_info = {
                i: self.CATEGORY_VALUES_COUNT[feature_columns[i]]
                for i in range(len(feature_columns))
                if feature_columns[i] in self.CATEGORY_VALUES_COUNT.keys()
            }

        # Split into train and test data
        split_date = datetime(2015, 1, 1)
        self.train_df = self.features_df.filter(
            self.features_df.Time < split_date).cache()
        self.test_df = self.features_df.filter(
            self.features_df.Time > split_date).cache()

        # Compute Districts with counts
        self.districts_with_counts = self.features_df \
                                     .groupBy([self.features_df.Lat, self.features_df.Lon]) \
                                     .count()
Example #30
    outputCol="DenseVector")

train_df = vectorAssembler.transform(train_df)
'''
Done to standardise data 
'''

stand_scaled = StandardScaler(inputCol="DenseVector",
                              outputCol="features",
                              withStd=True,
                              withMean=True)
'''
outputCol is named "features" because Spark's KMeans/BisectingKMeans read the column
named "features" by default (the input column is configurable via featuresCol)
'''

scaled_model = stand_scaled.fit(train_df)

train_df = scaled_model.transform(train_df)

bkmeans = BisectingKMeans().setK(2)
bkmeans = bkmeans.setSeed(1)
bkmodel = bkmeans.fit(train_df)
bkcenters = bkmodel.clusterCenters()

if bkmodel.hasSummary:
    print(bkmodel.summary.clusterSizes)
    print(bkmodel.clusterCenters())

predict_df = bkmodel.transform(train_df)

predict_df = predict_df.select("avgMeasuredTime", "avgSpeed", "vehicleCount",
Example #31
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

spark = SparkSession.builder.appName("clustering").getOrCreate()
df = spark.read.csv("./files/seeds_dataset.csv", inferSchema=True, header=True)

# df.show()

assembler = VectorAssembler(inputCols=df.columns, outputCol='features')

data = assembler.transform(df)
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')
scaled_data = scaler.fit(data).transform(data)
kmeans = KMeans(featuresCol='scaledFeatures').setK(3)
model = kmeans.fit(scaled_data)

# print("WSSSE")
# print(model.computeCost(scaled_data))
print(model.clusterCenters())

model.transform(scaled_data).select('prediction').show()
# COMMAND ----------

display(output)

# COMMAND ----------

# MAGIC %md
# MAGIC ####Scaling the Data

# COMMAND ----------

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=True)

# Compute summary statistics by fitting the StandardScaler
scalerModel = scaler.fit(output)

# Normalize each feature to have unit standard deviation.
scaledData = scalerModel.transform(output)

# COMMAND ----------

display(scaledData)

# COMMAND ----------

# MAGIC %md
# MAGIC ####Principal Component Analysis

# COMMAND ----------
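
# Hedged sketch (the original notebook is cut off here) of the PCA step the markdown
# cell above announces: project the scaled features onto the top principal
# components; k=3 is an illustrative choice.
from pyspark.ml.feature import PCA

pca = PCA(k=3, inputCol="scaledFeatures", outputCol="pcaFeatures")
pcaModel = pca.fit(scaledData)
display(pcaModel.transform(scaledData).select("pcaFeatures"))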
Example #33
transformed = pipeline_model.transform(freqItemsets)
new_transformed = extractTopics(transformed)
display(new_transformed)

# COMMAND ----------

#PCA
#the visualization is not very meaningful
from pyspark.ml.feature import PCA 
from pyspark.ml.feature import StandardScaler
#normalization before the PCA
counts = CountVectorizer(inputCol = "items", outputCol="raw_features2", vocabSize = 10000, minDF = 2.0)
counter = counts.fit(new_transformed)
counted = counter.transform(new_transformed)
scaler = StandardScaler(inputCol="raw_features2", outputCol="scaledFeatures",withStd=True, withMean=False)
scalerModel = scaler.fit(counted)
scaledData = scalerModel.transform(counted)

pca = PCA(k=2, inputCol="scaledFeatures", outputCol="pca")
model = pca.fit(scaledData)
transformed_df = model.transform(scaledData)
display(transformed_df)

# COMMAND ----------

#PCA is ineffective for dimensionality reduction here!
#so if the clustering does not visualize well, that is expected
#on the other hand, there is nothing else we can tweak to improve the PCA
#the point is that we have high-dimensional data with very sparse vectors, and we cannot pick the components
model.explainedVariance