data = tf_idf_features_quora(data)

# Get the text features
data = text_features(data)

# Combine all the features
feature_assembler = VectorAssembler(
    inputCols=["tf_idf_features", "text_features"],
    outputCol="combined_features")
data = feature_assembler.transform(data)

# Normalize each feature to have unit standard deviation
scaler = StandardScaler(inputCol="combined_features",
                        outputCol="features",
                        withStd=True,
                        withMean=False)
scalerModel = scaler.fit(data)
data = scalerModel.transform(data)

# Index labels, adding metadata to the label column.
# Fit on the whole dataset to include all labels in the index.
label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)

# Automatically identify categorical features, and index them.
feature_indexer = VectorIndexer(inputCol="features",
                                outputCol="indexedFeatures",
                                maxCategories=2).fit(data)

training_df, test_df = data.randomSplit([0.8, 0.2])
training_df.cache()
test_df.cache()
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

from pyspark import SparkContext
from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import StandardScaler
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="StandardScalerExample")
    sqlContext = SQLContext(sc)

    # $example on$
    dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                            withStd=True, withMean=False)

    # Compute summary statistics by fitting the StandardScaler
    scalerModel = scaler.fit(dataFrame)

    # Normalize each feature to have unit standard deviation.
    scaledData = scalerModel.transform(dataFrame)
    scaledData.show()
    # $example off$

    sc.stop()
from pyspark.sql.functions import *
from pyspark.ml.linalg import DenseVector

training_dense = training.rdd.map(lambda x: (x[0], DenseVector(x[1:])))
training_dense = spark.createDataFrame(training_dense, ["label", "features"])
test_dense = test.rdd.map(lambda x: (x[0], DenseVector(x[1:])))
test_dense = spark.createDataFrame(test_dense, ["label", "features"])

from pyspark.ml.feature import StandardScaler

standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled", withMean=True)
scaler = standardScaler.fit(training_dense)
scaled_training = scaler.transform(training_dense)
print(scaled_training.head(2))
scaled_test = scaler.transform(test_dense)
print(scaled_test.head(2))

from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib import linalg as mllib_linalg
from pyspark.ml import linalg as ml_linalg


def as_old(v):
    """Convert a pyspark.ml vector to the legacy pyspark.mllib vector type."""
    if isinstance(v, ml_linalg.SparseVector):
        return mllib_linalg.SparseVector(v.size, v.indices, v.values)
    if isinstance(v, ml_linalg.DenseVector):
        return mllib_linalg.DenseVector(v.values)
    raise ValueError("Unsupported vector type: {}".format(type(v)))
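# A minimal sketch (assumed continuation) of how as_old could feed the scaled DataFrame
# into the mllib GradientBoostedTrees API imported above. Column names "label" and
# "features_scaled" come from the snippet; the hyperparameters are illustrative.
train_lp = scaled_training.rdd.map(
    lambda row: LabeledPoint(row.label, as_old(row.features_scaled)))
test_lp = scaled_test.rdd.map(
    lambda row: LabeledPoint(row.label, as_old(row.features_scaled)))

# Train a GBT regressor on the converted RDD; numIterations is illustrative.
gbt_model = GradientBoostedTrees.trainRegressor(
    train_lp, categoricalFeaturesInfo={}, numIterations=10)
predictions = gbt_model.predict(test_lp.map(lambda lp: lp.features))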
FeatureData_LocationCoded.toPandas().to_csv('HackDataFeatures_LocationCoded.csv')
FeatureData_LocationCoded.show(5)


# In[3]:

# Scaling the data without Location
from pyspark.ml.feature import StandardScaler

scaler_NoLocation = StandardScaler(inputCol="features_NoLocation",
                                   outputCol="scaledFeatures_NoLocation",
                                   withStd=True,
                                   withMean=False)

# Compute summary statistics by fitting the StandardScaler
scalerModel_NoLocation = scaler_NoLocation.fit(FeatureData_NoLocation)

# Normalize each feature to have unit standard deviation.
FinalData_NoLocation = scalerModel_NoLocation.transform(FeatureData_NoLocation)
FinalData_NoLocation.toPandas().to_csv('HackDataFinal_NoLocation.csv')

###################################################################################################################
# Scaling the data with Location after StringIndexer
scaler_LocationIndex = StandardScaler(inputCol="features_LocationIndex",
                                      outputCol="scaledFeatures_LocationIndex",
                                      withStd=True,
                                      withMean=False)
    x for x in c
    if x != "date" and x != "longitude" and x != "latitude"
    and x != "cumLag" and "lag-" not in x
]


# In[ ]:

from pyspark.ml.feature import VectorAssembler, StandardScaler

assembler = VectorAssembler(inputCols=c, outputCol="features")
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=True)

df1 = assembler.setHandleInvalid("skip").transform(df)
df1.printSchema()
print("df1 count at this point is ", df1.count())

scalarModel = scaler.fit(df1)
df1 = scalarModel.transform(df1)

from pyspark.ml.feature import PCA

pca = PCA(k=40, inputCol="scaledFeatures", outputCol="pcaFeatures")
model = pca.fit(df1)
result = model.transform(df1).select('date', 'latitude', 'longitude', 'pcaFeatures')


# In[ ]:

result = result.coalesce(200)
result.write.parquet(
    "s3a://dse-cohort5-group5/wildfire_capstone/integratedData/completePCA",
    mode="overwrite",
    compression='gzip')
# result = (
#     pipeline_sel
#     .fit(clustering_df)
#     .transform(clustering_df)
# )
# print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
# result.show()

# Standardization
standard_scaler = StandardScaler(
    inputCol="initial_features",
    outputCol="features",
    withStd=True,
    withMean=True)

vectorized_df = vector_assembler.transform(clustering_df)
model_scaler = standard_scaler.fit(vectorized_df)
featurized_clustering_df = model_scaler.transform(vectorized_df)

featurization_pipeline = Pipeline(stages=[vector_assembler, standard_scaler])
featurization_pipeline_model = featurization_pipeline.fit(clustering_df)
model_scaler = featurization_pipeline_model.stages[-1]
featurized_clustering_df = featurization_pipeline_model.transform(clustering_df)

sse_cost = np.zeros(20)
# path_metrics_kmeans_sse = "../data/metrics_kmeans_see.jsonl"

# Start the cluster-count selection: metrics are stored in a JSON file, and an image file
# is also produced with the elbow curve used to decide how many clusters are needed.
for k in range(2, 10):
    kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
    model = kmeans.fit(featurized_clustering_df.sample(False, 0.1, seed=42))
    sse_cost[k] = model.computeCost(featurized_clustering_df)
    metrics_row = {"k": k, "sse": sse_cost[k]}
    # Metrics to JSON
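# A minimal sketch (assumed continuation) of appending the per-k metrics to the
# commented-out JSONL path inside the loop and drawing the elbow curve afterwards;
# the json/matplotlib imports and file names are illustrative, not from the original snippet.
import json
import matplotlib.pyplot as plt

path_metrics_kmeans_sse = "../data/metrics_kmeans_sse.jsonl"

def append_metrics(row, path=path_metrics_kmeans_sse):
    # Append one {"k": ..., "sse": ...} record per line (JSONL); cast numpy floats for json.
    with open(path, "a") as f:
        f.write(json.dumps({"k": int(row["k"]), "sse": float(row["sse"])}) + "\n")

# e.g. call append_metrics(metrics_row) as the last statement of the k-loop above.

plt.plot(range(2, 10), sse_cost[2:10], marker="o")
plt.xlabel("k (number of clusters)")
plt.ylabel("SSE")
plt.title("Elbow curve for KMeans")
plt.savefig("kmeans_elbow.png")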
cols = raw_data.columns
cols.remove("Outcome")

# Import the vector assembler
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=cols, outputCol="features")
# Now use the transform method
raw_data = assembler.transform(raw_data)
# print(raw_data.select("features").show(truncate=False))

# Standard scaler
from pyspark.ml.feature import StandardScaler

standardscaler = StandardScaler().setInputCol("features").setOutputCol("scaled_features")
raw_data = standardscaler.fit(raw_data).transform(raw_data)
# print(raw_data.select("features", "scaled_features").show())

# Train/test split
train, test = raw_data.randomSplit([0.8, 0.2], seed=12345)

# Check whether there is an imbalance in the dataset
dataset_size = float(train.select("Outcome").count())
numPositives = train.select("Outcome").where('Outcome == 1').count()
per_ones = (float(numPositives) / float(dataset_size)) * 100
numNegatives = float(dataset_size - numPositives)
# print('The number of ones are {}'.format(numPositives))
# print('Percentage of ones are {}'.format(per_ones))

# Imbalanced dataset
# In our training set we have about 34.27% positives and 65.73% negatives. Since negatives
# are in the majority, the logistic loss objective should give the positive class
# (Outcome == 1) a higher weight. For this purpose we calculate the BalancingRatio as
# follows (see the sketch below):
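# A minimal sketch of the BalancingRatio / class-weight step described in the comment above
# (assumed continuation; the weight column name "classWeights" is illustrative).
from pyspark.sql.functions import when, col

BalancingRatio = numNegatives / dataset_size
train = train.withColumn(
    "classWeights",
    when(col("Outcome") == 1, BalancingRatio).otherwise(1 - BalancingRatio))

# The weight column can then be passed to LogisticRegression via weightCol="classWeights".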
bucketer.transform(contDF).show()


# COMMAND ----------

from pyspark.ml.feature import QuantileDiscretizer

bucketer = QuantileDiscretizer().setNumBuckets(5).setInputCol("id")
fittedBucketer = bucketer.fit(contDF)
fittedBucketer.transform(contDF).show()


# COMMAND ----------

from pyspark.ml.feature import StandardScaler

sScaler = StandardScaler().setInputCol("features")
sScaler.fit(scaleDF).transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import MinMaxScaler

minMax = MinMaxScaler().setMin(5).setMax(10).setInputCol("features")
fittedminMax = minMax.fit(scaleDF)
fittedminMax.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import MaxAbsScaler

maScaler = MaxAbsScaler().setInputCol("features")
fittedmaScaler = maScaler.fit(scaleDF)
trained_parameters = load_trained_parameters(trained_parameters, {"input_column": input_column})

scaler_model, scaler_model_loaded = load_pyspark_model_from_trained_parameters(
    trained_parameters, MinMaxScalerModel, "scaler_model"
)

if scaler_model is None:
    scaler = MinMaxScaler(inputCol=temp_vector_col, outputCol=temp_normalized_vector_col)
    scaler_model = fit_and_save_model(trained_parameters, "scaler_model", scaler, assembled_wo_nans)

output_df = transform_using_trained_model(scaler_model, assembled, scaler_model_loaded)

# MaxAbsScaler variant (overwrites the MinMaxScaler output above)
scaler = MaxAbsScaler(inputCol=temp_vector_col, outputCol=temp_normalized_vector_col)
output_df = scaler.fit(assembled_wo_nans).transform(assembled)

# Convert the resulting vector back to numeric
temp_flattened_vector_col = temp_col_name(output_df)
output_df = output_df.withColumn(temp_flattened_vector_col, vector_to_array(temp_normalized_vector_col))

# Keep only the final scaled column.
output_column = input_column if output_column is None or not output_column else output_column
output_column_value = sf.col(temp_flattened_vector_col)[0].alias(output_column)
output_df = output_df.withColumn(output_column, output_column_value)
final_columns = list(dict.fromkeys(list(df.columns) + [output_column]))
output_df = output_df.select(final_columns)

return default_spark_with_trained_parameters(output_df, trained_parameters)
cols = [
    'Session_Connection_Time', 'Bytes_Transferred', 'Kali_Trace_Used',
    'Servers_Corrupted', 'Pages_Corrupted', 'WPM_Typing_Speed'
]

# Assembling the features
assembler = VectorAssembler(inputCols=cols, outputCol='features')

# Creating the new DataFrame with features
assembled_data = assembler.transform(data)

# Scaling the features
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')
scaler_model = scaler.fit(assembled_data)
scaled_data = scaler_model.transform(assembled_data)

# Creating the model
k_means = KMeans(featuresCol='scaledFeatures', k=n)

# Training the model
model = k_means.fit(scaled_data)

# Prediction
model_data = model.transform(scaled_data)

# Grouping and displaying by cluster
model_data.groupBy('prediction').count().show()
pickle.dump(dummy_info, open(os.path.expanduser(dummy_info_path["path"]), 'wb'))
print("dummy_info saved in:\t" + dummy_info_path["path"])

# Feature columns
features_x_name = list(set(usecols_x) - set(Y_name) - set(dummy_columns))

assembler_x = VectorAssembler(inputCols=features_x_name, outputCol="features_x_raw")
data_sdf_i = assembler_x.transform(data_sdf_i)

# Standardize the non-categorical data.
scaler = StandardScaler(inputCol="features_x_raw",
                        outputCol="features_x_std",
                        withStd=True,
                        withMean=True)
scalerModel = scaler.fit(data_sdf_i)
data_sdf_i = scalerModel.transform(data_sdf_i)

# Assemble all vectors
assembler_all = VectorAssembler(
    inputCols=["features_x_std", "features_ONEHOT"],
    outputCol="features")
data_sdf_i = assembler_all.transform(data_sdf_i)

# Model specification
lr = SLogisticRegression(
    labelCol=Y_name,
    featuresCol="features",
    fitIntercept=fit_intercept,
    # Already standardized with non-dummy columns
    standardization=False
)  # , maxIter=100, regParam=0.3, elasticNetParam=0.8
    training_random
    .withColumn(
        "Weight",
        F.when(F.col("Class") == 0, 1.0)
         .otherwise(10.0)  # Class == 1 -> weight 10
    )
)
test_weighted = test_random

# ======= Justify the choice of sampling method, focusing on logistic regression =======
# Logistic regression
# Choose the proper k for PCA, using the training data only (e.g. training_random)
standard_scaler = StandardScaler(inputCol="raw_Features", outputCol="scaled_features")
standard_fit = standard_scaler.fit(training_random)
standard_train = standard_fit.transform(training_random)

pca = PCA(k=10, inputCol="scaled_features", outputCol="pca_features")
model_pca = pca.fit(standard_train)
# model_pca.explainedVariance returns a vector of proportions of variance
# explained by each principal component.
tt = [round(num, 3) for num in model_pca.explainedVariance]
print(tt)
# [0.422, 0.306, 0.149, 0.071, 0.024, 0.015, 0.007, 0.004, 0.002, 0.0]

plt.figure()
plt.plot(range(1, 11), model_pca.explainedVariance)
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)')
plt.title('Explained Variance')
plt.savefig('PCA.png')
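# A minimal sketch (assumed continuation) of picking k from the explained-variance vector
# above, e.g. the smallest k whose cumulative explained variance exceeds 95%; the numpy
# usage and the 0.95 threshold are illustrative.
import numpy as np

explained = model_pca.explainedVariance.toArray()
cumulative = np.cumsum(explained)
# Note: argmax returns 0 if the threshold is never reached, so check cumulative[-1] first.
k_best = int(np.argmax(cumulative >= 0.95) + 1)
print("Smallest k reaching 95% cumulative explained variance:", k_best)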
print "Loading RAW data..." raw_data = sc.textFile(data_file) print "Parsing dataset..." parsed_labelpoint = raw_data.map(kup.parse_multiClass) parsed_labelpoint_df = spark.createDataFrame(parsed_labelpoint, ["label", "features"]) print "Standardizing data..." scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=True) # train a scaler to perform feature scaling scalerModel = scaler.fit(parsed_labelpoint_df) shutil.rmtree(scalerPath, ignore_errors=True) scalerModel.save(scalerPath) # Normalize each feature to have unit standard deviation. train_df_tmp = scalerModel.transform(parsed_labelpoint_df) train_df = train_df_tmp.drop("features").withColumnRenamed( "scaledFeatures", "features") # show the frequency of each label tmp_df = train_df.groupBy("label").count() tmp_df.show(10) lr = LogisticRegression(maxIter=10, regParam=0.01, elasticNetParam=0.2) # instantiate the One Vs Rest Classifier.
           percentile(t_tempo, 0.5) a_tempo
    from track, track_artists ta, artist
    where ta.track_id = track.t_id and artist.a_id = ta.artist_id
    group by a_id
    """.format(args.agg_table_name))

agg_table = spark.table(args.agg_table_name)
existing_tables = [table.name for table in spark.catalog.listTables()]

# K-means on artist features
if args.feature_kmeans_table_name not in existing_tables:
    # Normalize features
    va = VectorAssembler(
        inputCols=[column for column in agg_table.columns if column != "a_id"],
        outputCol="raw_features")
    feature_table = va.transform(agg_table)
    standard_scaler = StandardScaler(inputCol="raw_features", outputCol="features")
    feature_table = standard_scaler.fit(feature_table).transform(feature_table) \
        .select("a_id", "raw_features", "features")
    feature_table.show()

    # K-means
    kmeans = KMeans(k=100)
    model = kmeans.fit(feature_table)
    clustered = model.transform(feature_table).select("a_id", "prediction")
    # clustered.show()
    clustered.write.saveAsTable(args.feature_kmeans_table_name, format="orc", mode="error")

if args.smoothed_kmeans_table_name not in existing_tables:
    # Compute the artist collaboration graph as an edge list with self-loops
    collaboration = spark.sql("select a.artist_id node, b.artist_id neighbor from track_artists a, track_artists b where a.track_id = b.track_id")  # and a.artist_id != b.artist_id
    collaboration.registerTempTable("collaboration")

    # Smooth the features of artists by averaging over their neighbors. For an artist with
    # no collaborators, its features should remain unchanged.
    artist_features = spark.sql("""select node,
        avg(am.a_track_number) track_number,
        avg(am.a_mode) modality,
        avg(am.a_acousticness) acousticness,
        avg(am.a_danceability) danceability,
        avg(am.a_energy) energy,
def rescale_df(data):
    """Rescale the data."""
    standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled")
    scaler = standardScaler.fit(data)
    scaled_df = scaler.transform(data)
    return scaled_df
# Save the query output into a bucket
query.write.save(sys.argv[2] + '/Query_Iris_' + str(date), format="json")

# ML
# Pre-process the data
assembler = VectorAssembler(
    inputCols=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
    outputCol="raw_features")
vector_df = assembler.transform(df)

# Scale features to have zero mean and unit standard deviation
standarizer = StandardScaler(withMean=True, withStd=True,
                             inputCol='raw_features', outputCol='features')
model = standarizer.fit(vector_df)
vector_df = model.transform(vector_df)

# Convert the label to a number
indexer = StringIndexer(inputCol="variety", outputCol="label")
indexed = indexer.fit(vector_df).transform(vector_df)
indexed.show(10)

# Select features
iris = indexed.select(['features', 'label'])

# LR
train, test = iris.randomSplit([0.7, 0.3])
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)
LRmodel = lr.fit(train)
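# A minimal sketch (assumed continuation) of evaluating LRmodel on the held-out split;
# the accuracy metric is illustrative.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

predictions = LRmodel.transform(test)
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
print("Test accuracy:", evaluator.evaluate(predictions))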
# In[26]:

# scaler = Normalizer(inputCol="TitleAndBodyLengthVector", outputCol="ScaledNumFeatures")
# df = scaler.transform(df)

df.select(["id", "ScaledNumFeatures"]).where(df.Id == "512").collect()


# # Question 4
# Using the StandardScaler method (scaling both the mean and the standard deviation),
# what's the normalized value for question Id = 512?

# In[27]:

scaler2 = StandardScaler(inputCol="TitleAndBodyLengthVector",
                         outputCol="ScaledNumFeatures2",
                         withStd=True)
scalerModel = scaler2.fit(df)
df = scalerModel.transform(df)
df.select(["id", "ScaledNumFeatures2"]).where(df.Id == "512").collect()


# # Question 5
# Using the MinMaxScaler method, what's the normalized value for question Id = 512?

# In[29]:

from pyspark.ml.feature import MinMaxScaler

scaler3 = MinMaxScaler(inputCol="TitleAndBodyLengthVector",
                       outputCol="ScaledNumFeatures3")
scalerModel3 = scaler3.fit(df)
df = scalerModel3.transform(df)
df.select(["id", "ScaledNumFeatures3"]).where(df.Id == "512").collect()
|[10.0,139.0,80.0,29.153419593345657,155.5482233502538,27.1,1.441,57.0]             |
|[1.0,189.0,60.0,23.0,846.0,30.1,0.398,59.0]                                         |
|[5.0,166.0,72.0,19.0,175.0,25.8,0.587,51.0]                                         |
|[7.0,100.0,72.40518417462484,29.153419593345657,155.5482233502538,30.0,0.484,32.0]  |
|[0.0,118.0,84.0,47.0,230.0,45.8,0.551,31.0]                                         |
|[7.0,107.0,74.0,29.153419593345657,155.5482233502538,29.6,0.254,31.0]               |
|[1.0,103.0,30.0,38.0,83.0,43.3,0.183,33.0]                                          |
|[1.0,115.0,70.0,30.0,96.0,34.6,0.529,32.0]                                          |
+-----------------------------------------------------------------------------------+
'''

##################################################################################
# StandardScaler to scale the newly created "features" column
##################################################################################
standardScalar = StandardScaler().setInputCol("features").setOutputCol("Scaled_features")
raw_data = standardScalar.fit(raw_data).transform(raw_data)
raw_data.select("features", "Scaled_features").show(5, truncate=False)
'''
+---------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+
|[6.0,148.0,72.0,35.0,155.5482233502538,33.6,0.627,50.0]              |[1.7806383732194306,4.862670805688543,5.952210601826984,3.9813708583353558,1.8295247783934943,4.887165154544966,1.8923811872495484,4.251616970894646] |
|[1.0,85.0,66.0,29.0,155.5482233502538,26.6,0.351,31.0]               |[0.29677306220323846,2.7927501248886903,5.456193051674735,3.29885013976358,1.8295247783934943,3.869005747348098,1.0593712866420917,2.6360025219546803]|
|[8.0,183.0,64.0,29.153419593345657,155.5482233502538,23.3,0.672,32.0]|[2.3741844976259077,6.0126267394662385,5.290853868290652,3.316302148279125,1.8295247783934943,3.3890163125267176,2.0281980188703295,2.721034861372573]|
|[1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0]                            |[0.29677306220323846,2.9241736601775696,5.456193051674735,2.616329421191805,1.1056078010080843,4.087182763175998,0.5040313529037872,1.785679127775751]|
|[0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0]                          |[0.0,4.501256083644124,3.3067836676816578,3.9813708583353558,1.975979899674023,6.268952921455001,6.905531349963264,2.806067200790466]                 |
+---------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+
'''

# Split data into training and test sets
train, test = raw_data.randomSplit([0.8, 0.2], seed=12345)

# Check whether there is an imbalance in the dataset
dataset_size = float(train.select("Outcome").count())
def run_standard_scaler(t_data):
    standardscaler = StandardScaler().setInputCol("features").setOutputCol("scaled_features")
    t_data = standardscaler.fit(t_data).transform(t_data)
    return t_data
# In[46]:

# Scaling data prior to SMOTE

# In[47]:

scaler = StandardScaler(inputCol="features",
                        outputCol="scaledFeatures",
                        withStd=True,
                        withMean=True)

# In[48]:

scalerModel = scaler.fit(df_baseline)

# In[49]:

scaledData = scalerModel.transform(df_baseline)

# In[50]:

scaledData = scaledData.drop("features")

# In[51]:
train, test = df_transformed.randomSplit(
    [float(sys.argv[1]), float(sys.argv[2])], seed=7)
# train60, test40 = df_transformed.randomSplit([0.6, 0.4], seed=7)
# train70, test30 = df_transformed.randomSplit([0.7, 0.3], seed=7)
# train80, test20 = df_transformed.randomSplit([0.8, 0.2], seed=7)
# train90, test10 = df_transformed.randomSplit([0.9, 0.1], seed=7)
logger.error("#### after split")

logger.error("#### standardscaler on train dataset")
standardizer = StandardScaler(withMean=True, withStd=True,
                              inputCol='features',
                              outputCol='std_features')
standardizer_model = standardizer.fit(train)
standardized_features_df70 = standardizer_model.transform(train)

logger.error("#### standardscaler on test dataset")
standardizer = StandardScaler(withMean=True, withStd=True,
                              inputCol='features',
                              outputCol='std_features')
standardizer_model = standardizer.fit(test)
standardized_features_df30 = standardizer_model.transform(test)

from pyspark.ml.feature import VectorAssembler, StandardScaler, PCA

logger.error("###### pca on standard-scaled features using train")
pca = PCA(k=2, inputCol="std_features", outputCol="pca_features")
# Scale the data: because all features are used to calculate distances, they should be on the same scale.
workdingDF.columns
# Do not use rowID as a feature (it is only kept to identify rows in the results).
# max_wind_speed has a high correlation with the other wind* columns, so do not include those either.
featureColumns = ['air_pressure', 'air_temp', 'avg_wind_direction', 'avg_wind_speed',
                  'max_wind_direction', 'max_wind_speed', 'relative_humidity']
assembler = VectorAssembler(inputCols=featureColumns, outputCol='features_unscaled')
assembled = assembler.transform(workdingDF)

# Scale: (each column - mean) / std  =====>  mean = 0
scaler = StandardScaler(inputCol='features_unscaled', outputCol='features',
                        withStd=True, withMean=True)
scaleModel = scaler.fit(assembled)
scaleData = scaleModel.transform(assembled)  # fixed: was `ScaleModel`, which is undefined
# (X - mean) / std is computed separately for each attribute (column).
'''
Each attribute (column) has its mean subtracted and is divided by its standard deviation.
As a result, for every attribute/column the data is centered around 0 with unit variance.
'''

# Create an elbow plot to choose the number of centers.
# This method involves applying k-means with different values of k and calculating the
# within-cluster sum-of-squared error (WSSE). Since this means applying k-means multiple
# times, the process can be very compute-intensive. To speed it up, we use only a subset
# of the dataset: every third sample (see the sketch below).
scaleData = scaleData.select('features', 'rowID')  # fixed: was `ScaleData`
elbowset = scaleData.filter(scaleData.rowID % 3 == 0).select('features')  # fixed: was `scale.filter`
elbowset.persist()
# The last line calls persist() to tell Spark to keep the data in memory (if possible),
# which speeds up the computations.

clusters = range(2, 31)
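# A minimal sketch (assumed continuation) of the WSSE elbow computation described above:
# fit KMeans for each k in `clusters` on the subsampled `elbowset` and record the cost.
# summary.trainingCost (Spark >= 2.4) is used instead of the older computeCost; the
# matplotlib plotting calls are illustrative.
from pyspark.ml.clustering import KMeans
import matplotlib.pyplot as plt

wsse_values = []
for k in clusters:
    kmeans = KMeans(k=k, seed=1, featuresCol='features')
    model = kmeans.fit(elbowset)
    # trainingCost is the WSSE on the data the model was fit on.
    wsse_values.append(model.summary.trainingCost)

plt.plot(list(clusters), wsse_values, marker='o')
plt.xlabel('Number of clusters (k)')
plt.ylabel('WSSE')
plt.savefig('elbow_plot.png')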
def main(input, model_file):
    # Defining the schema for the Lift1 datasets
    def sensor_schema():
        sen_schema = types.StructType([
            types.StructField('timestamp', types.StringType()),
            types.StructField('X', types.DoubleType()),
            types.StructField('Y', types.DoubleType()),
            types.StructField('Z', types.DoubleType()),
        ])
        return sen_schema

    def calc_score(count, min_max_collection):
        min_count = float(min_max_collection[0])
        max_count = float(min_max_collection[1])
        score = (max_count - count) / (max_count - min_count)
        return score

    sens_schema = sensor_schema()

    # Spark read of data
    temp = spark.read.csv(input, schema=sens_schema)

    # Selecting the time range from 07/09 to 08/09; other data are not needed.
    temp.createOrReplaceTempView("temp")
    temp = spark.sql(
        "select timestamp,Z from temp where timestamp between '2018-07-09 12:00:00' and '2018-08-09 12:00:00'"
    )

    # The code below applies a standard scaler to achieve Z-normalization,
    # i.e. mean 0 and standard deviation 1.
    # UDF for converting the column type from vector to double
    unlist = udf(lambda x: round(float(list(x)[0]), 6), types.DoubleType())
    assembler = VectorAssembler(inputCols=["Z"], outputCol="Zvector")
    tempdata = assembler.transform(temp)
    scaler = StandardScaler(inputCol="Zvector", outputCol="Zscale",
                            withMean=True, withStd=True)
    scalerModel = scaler.fit(tempdata)
    scaledData = scalerModel.transform(tempdata).withColumn(
        "Zscale", unlist("Zscale")).drop("Zvector").cache()
    scaledData.show()

    # Convert the timestamp string to timestamp type. This is for smoothing purposes.
    scaledData = scaledData.withColumn(
        "times", to_timestamp("timestamp", 'yyyy-MM-dd HH:mm:ss')).cache()

    # Obtain moving averages
    movAvg = scaledData.withColumn(
        "movingAverage",
        avg(scaledData["Zscale"]).over(
            Window.partitionBy(scaledData["times"]).rowsBetween(-3, 3))).cache()
    movAvg.show()

    # Round the Zscale value to 0 decimal places
    movAvg.createOrReplaceTempView("movAvg")
    scaledNorm = spark.sql(
        "select times,Zscale,round(movingAverage,0) as Zround from movAvg").cache()
    scaledNorm.show()

    # Feature transform for K-means
    cols = ["Zscale", "Zround"]
    ft_assembler = VectorAssembler(inputCols=cols, outputCol="features")
    in_df = ft_assembler.transform(scaledNorm)
    kmeans = KMeans().setK(5).setSeed(1)
    model = kmeans.fit(in_df)

    # Make predictions
    predict = model.transform(in_df).cache()
    predict.show()

    # Evaluate clustering by computing the Silhouette score
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predict)
    print("Silhouette with squared euclidean distance = " + str(silhouette))

    # Show the result
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)

    # Saving the model
    model.write().overwrite().save(model_file)

    # Calculate the total count of each cluster
    count_df = predict.groupBy("prediction").count().cache()
    count_df.show()
    # count_df.createOrReplaceTempView("count_df")
    # min_max_list = spark.sql("select min(count) as min,max(count) as max from count_df group by count").collect()[0]
    min_max_list = count_df.agg(min('count'), max('count')).collect()[0]
    print(min_max_list)

    # Calculating the scores
    udf_calc_score = udf(lambda count: calc_score(float(count), min_max_list),
                         types.FloatType())
    anom_score = count_df.withColumn("score", udf_calc_score("count")).cache()
    anom_score.show()

    # Populating scores
    predict = predict.join(anom_score, "prediction").select("times", "Zscale", "Zround",
                                                            "prediction", "score")
    predict.show()

    # Anomaly detection based on a threshold
    anomaly = predict.where(predict["score"] > 0.9999)
    anomaly.show()

    # Writing to a csv file
    anomaly.coalesce(1).orderBy("times").write.csv("kmeansout")
df = df.withColumn("THD18th", functions.col("THD18th") * 100) \
    .withColumn("THD42th", functions.col("THD42th") * 100)
# Convert to percentage values. Note: no space is allowed after the line-continuation
# backslash, otherwise an error is raised!

df = df.select("Hours", "THD18th")  # Select 'Hours' & 'THD18th' for analysis
df.show(24)

input_data = df.rdd.map(lambda x: (x[0], DenseVector(x[1:])))

# Create a new DataFrame with labels
labeled_df = sqlContext.createDataFrame(input_data, ["label", "features"])
labeled_df.show(24)

standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled")  # Re-scaling
scaler = standardScaler.fit(labeled_df)
scaled_df = scaler.transform(labeled_df)
scaled_df.show(24)

train_data, test_data = scaled_df.randomSplit([0.7, 0.3])  # Randomly choose 30% as test data
test_data.show(24)

# lr = LinearRegression(labelCol="label", maxIter=10, regParam=0.3, elasticNetParam=0.8)
# Train models
lr = GeneralizedLinearRegression(family="gaussian", link="identity",
                                 maxIter=10, regParam=0.3)
linearModel = lr.fit(train_data)
predicted = linearModel.transform(test_data)  # Prediction
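# A minimal sketch (assumed continuation) of scoring the predictions above with a
# regression metric; RMSE is illustrative.
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction",
                                metricName="rmse")
print("Test RMSE:", evaluator.evaluate(predicted))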
def transform(spark, s3_input_data, s3_output_train_data, s3_output_validation_data, s3_output_test_data):
    print('Processing {} => {}, {}, {}'.format(s3_input_data, s3_output_train_data,
                                               s3_output_validation_data, s3_output_test_data))

    schema = StructType([
        StructField('marketplace', StringType(), True),
        StructField('customer_id', StringType(), True),
        StructField('review_id', StringType(), True),
        StructField('product_id', StringType(), True),
        StructField('product_parent', StringType(), True),
        StructField('product_title', StringType(), True),
        StructField('product_category', StringType(), True),
        StructField('star_rating', IntegerType(), True),
        StructField('helpful_votes', IntegerType(), True),
        StructField('total_votes', IntegerType(), True),
        StructField('vine', StringType(), True),
        StructField('verified_purchase', StringType(), True),
        StructField('review_headline', StringType(), True),
        StructField('review_body', StringType(), True),
        StructField('review_date', StringType(), True)
    ])

    df_csv = spark.read.csv(path=s3_input_data, sep='\t', schema=schema,
                            header=True, quote=None)
    df_csv.show()

    # This dataset should already be clean, but it is always good to double-check
    print('Showing null review_body rows...')
    df_csv.where(col('review_body').isNull()).show()

    df_csv_cleaned = df_csv.na.drop(subset=['review_body'])
    df_csv_cleaned.where(col('review_body').isNull()).show()

    tokenizer = Tokenizer(inputCol='review_body', outputCol='words')
    wordsData = tokenizer.transform(df_csv_cleaned)

    hashingTF = HashingTF(inputCol='words', outputCol='raw_features', numFeatures=1000)
    featurizedData = hashingTF.transform(wordsData)

    # While applying HashingTF only needs a single pass over the data, applying IDF needs two:
    #   1) compute the IDF vector
    #   2) scale the term frequencies by IDF
    # Therefore, we cache the result of the HashingTF transformation above to speed up the 2nd pass.
    featurizedData.cache()

    # spark.mllib's IDF implementation provides an option for ignoring terms
    # which occur in fewer than a minimum number of documents.
    # In such cases, the IDF for these terms is set to 0.
    # This feature can be used by passing the minDocFreq value to the IDF constructor.
    idf = IDF(inputCol='raw_features', outputCol='features')  # , minDocFreq=2)
    idfModel = idf.fit(featurizedData)
    features_df = idfModel.transform(featurizedData)
    features_df.select('star_rating', 'features').show()

    num_features = 300
    pca = PCA(k=num_features, inputCol='features', outputCol='pca_features')
    pca_model = pca.fit(features_df)
    pca_features_df = pca_model.transform(features_df).select('star_rating', 'pca_features')
    pca_features_df.show(truncate=False)

    standard_scaler = StandardScaler(inputCol='pca_features', outputCol='scaled_pca_features')
    standard_scaler_model = standard_scaler.fit(pca_features_df)
    standard_scaler_features_df = standard_scaler_model.transform(pca_features_df) \
        .select('star_rating', 'scaled_pca_features')
    standard_scaler_features_df.show(truncate=False)

    expanded_features_df = (standard_scaler_features_df
                            .withColumn('f', to_array(col('scaled_pca_features')))
                            .select(['star_rating'] + [col('f')[i] for i in range(num_features)]))
    expanded_features_df.show()

    train_df, validation_df, test_df = expanded_features_df.randomSplit([0.9, 0.05, 0.05])

    train_df.write.csv(path=s3_output_train_data, header=None, quote=None)
    print('Wrote to output file: {}'.format(s3_output_train_data))

    validation_df.write.csv(path=s3_output_validation_data, header=None, quote=None)
    print('Wrote to output file: {}'.format(s3_output_validation_data))

    test_df.write.csv(path=s3_output_test_data, header=None, quote=None)
    print('Wrote to output file: {}'.format(s3_output_test_data))
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression

# Pull our data into a Spark dataframe
df = spark.sql("select * from sensor_readings")

# Extract the columns that we want in our feature vector
featureColumns = df.drop("timestamp", "Sensor-Predict").columns

# First we will use `VectorAssembler` to combine all feature columns into a feature vector
# (an optimized data structure for ML)
assembler = VectorAssembler(inputCols=featureColumns, outputCol="featureVector")
dfVector = assembler.transform(df)

# Then we will scale the values of each sensor to have a standard mean and deviation
scaler = StandardScaler(inputCol="featureVector", outputCol="features",
                        withStd=True, withMean=False)
dfScaled = scaler.fit(dfVector).transform(dfVector)

display(dfScaled.select("features", "Sensor-Predict"))

# COMMAND ----------

# MAGIC %md
# MAGIC ## Model Training
# MAGIC With our scaled and vectorized feature set, we can now train a linear regression model against the data.
# MAGIC
# MAGIC Databricks can also visualize model residuals, as well as ROC curves and decision trees.

# COMMAND ----------

# Split the data into a training and test dataset
(trainingData, testingData) = dfScaled.randomSplit([0.7, 0.3])
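# A minimal sketch (assumed continuation) of the linear regression training step described
# in the markdown cell above; labelCol "Sensor-Predict" matches the column used earlier.
lr = LinearRegression(featuresCol="features", labelCol="Sensor-Predict")
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testingData)
display(predictions.select("Sensor-Predict", "prediction"))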
hashingTF = HashingTF(numFeatures=285,
                      inputCol='concat_(stop_words, com_skips)',
                      outputCol='features')
tf1 = hashingTF.transform(df_all_words1)

# Normalize the counts so that they are a percentage of total counts of the features
tf_norm1 = Normalizer(inputCol="features", outputCol="features_norm", p=1).transform(tf1)

# Standardize the vector based on the average use of each feature among all users
stdscaler = StandardScaler(inputCol='features_norm', outputCol='scaled', withMean=True)
scale_fit1 = stdscaler.fit(tf_norm1)
scaled1 = scale_fit1.transform(tf_norm1)

# Do all of the above for subset #2
comments2 = df2.groupBy("author").agg(F.collect_list("body"))
join_comments_udf = udf(lambda x: ' '.join(x), StringType())
df2_join_comments = comments2.withColumn(
    'corpus', join_comments_udf(comments2['collect_list(body)']))
df_count_links2 = df2_join_comments.withColumn(
    'link_count', count_links_udf(df2_join_comments['corpus']))
df_drop_links2 = df_count_links2.withColumn(
    'corpus', drop_links_udf(df_count_links2['corpus']))
def initialize(self, do_scaling=True, do_onehot=True):
    """Reads the dataset, initializes class members.

    features_df: Original DataFrame as read from the features_file.
    train_df:    A DataFrame with columns Lat, Lon, Pickup_Count and vector columns
                 Features & ScaledFeatures. Contains only data before 2015.
    test_df:     As train_df, but only containing data of 2015.
    districts_with_counts: A DataFrame with all districts and their counts.
    """
    # Read feature dataframe
    self.features_df = self.sql_context.read.parquet(self.features_file).cache()

    # Set exclude columns to default
    exclude_columns = self.EXCLUDE_COLUMNS

    # Scale features
    if do_scaling:
        assembler = VectorAssembler(inputCols=self.SCALE_COLUMNS,
                                    outputCol='FeaturesToScale')
        self.features_df = assembler.transform(self.features_df)
        scaler = StandardScaler(inputCol='FeaturesToScale',
                                outputCol='ScaledFeatures',
                                withStd=True,
                                withMean=False)
        self.features_df = scaler.fit(self.features_df).transform(self.features_df)

        exclude_columns += self.SCALE_COLUMNS + ['FeaturesToScale']

    # Adopt categorical features that do not have a value range of [0, numCategories)
    for column in ['Day', 'Month', 'Day_Of_Year']:
        if column in self.features_df.columns:
            self.features_df = self.features_df.withColumn(
                column, self.features_df[column] - 1)

    # Encode categorical features using one-hot encoding
    if do_onehot:
        vec_category_columns = ['%s_Vector' % column for column in self.ONE_HOT_COLUMNS]
        for i in range(len(self.ONE_HOT_COLUMNS)):
            column = self.ONE_HOT_COLUMNS[i]
            if column in self.features_df.columns:
                self.features_df = self.features_df.withColumn(
                    column, self.features_df[column].cast(DoubleType()))
                encoder = OneHotEncoder(inputCol=column,
                                        outputCol=vec_category_columns[i],
                                        dropLast=False)
                self.features_df = encoder.transform(self.features_df)
        exclude_columns += self.ONE_HOT_COLUMNS

    # Vectorize features
    feature_columns = [column for column in self.features_df.columns
                       if column not in exclude_columns]
    assembler = VectorAssembler(inputCols=feature_columns, outputCol='Features')
    self.features_df = assembler.transform(self.features_df)

    # Set the number of distinct values for categorical features (identified by index)
    self.categorical_features_info = {}
    if not do_onehot:
        self.categorical_features_info = {
            i: self.CATEGORY_VALUES_COUNT[feature_columns[i]]
            for i in range(len(feature_columns))
            if feature_columns[i] in self.CATEGORY_VALUES_COUNT.keys()
        }

    # Split into train and test data
    split_date = datetime(2015, 1, 1)
    self.train_df = self.features_df.filter(self.features_df.Time < split_date).cache()
    self.test_df = self.features_df.filter(self.features_df.Time > split_date).cache()

    # Compute districts with counts
    self.districts_with_counts = self.features_df \
        .groupBy([self.features_df.Lat, self.features_df.Lon]) \
        .count()
outputCol="DenseVector") train_df = vectorAssembler.transform(train_df) ''' Done to standardise data ''' stand_scaled = StandardScaler(inputCol="DenseVector", outputCol="features", withStd=True, withMean=True) ''' outputCol must be named Features as Spark KMeans will only use that column as input ''' scaled_model = stand_scaled.fit(train_df) train_df = scaled_model.transform(train_df) bkmeans = BisectingKMeans().setK(2) bkmeans = bkmeans.setSeed(1) bkmodel = bkmeans.fit(train_df) bkcenters = bkmodel.clusterCenters() if bkmodel.hasSummary: print(bkmodel.summary.clusterSizes) print(bkmodel.clusterCenters()) predict_df = bkmodel.transform(train_df) predict_df = predict_df.select("avgMeasuredTime", "avgSpeed", "vehicleCount",
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

spark = SparkSession.builder.appName("clustering").getOrCreate()

df = spark.read.csv("./files/seeds_dataset.csv", inferSchema=True, header=True)
# df.show()

assembler = VectorAssembler(inputCols=df.columns, outputCol='features')
data = assembler.transform(df)

scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')
scaled_data = scaler.fit(data).transform(data)

kmeans = KMeans(featuresCol='scaledFeatures').setK(3)
model = kmeans.fit(scaled_data)

# print("WSSSE")
# print(model.computeCost(scaled_data))

print(model.clusterCenters())
model.transform(scaled_data).select('prediction').show()
# COMMAND ----------

display(output)

# COMMAND ----------

# MAGIC %md
# MAGIC #### Scaling the Data

# COMMAND ----------

scaler = StandardScaler(inputCol="features",
                        outputCol="scaledFeatures",
                        withStd=True,
                        withMean=True)

# Compute summary statistics by fitting the StandardScaler
scalerModel = scaler.fit(output)

# Normalize each feature to have unit standard deviation.
scaledData = scalerModel.transform(output)

# COMMAND ----------

display(scaledData)

# COMMAND ----------

# MAGIC %md
# MAGIC #### Principal Component Analysis

# COMMAND ----------
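# A minimal sketch (assumed content of the PCA cell announced above), fitting PCA on the
# scaled features; k=2 is illustrative.
from pyspark.ml.feature import PCA

pca = PCA(k=2, inputCol="scaledFeatures", outputCol="pcaFeatures")
pcaModel = pca.fit(scaledData)
pcaResult = pcaModel.transform(scaledData)
display(pcaResult.select("pcaFeatures"))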
transformed = pipeline_model.transform(freqItemsets)
new_transformed = extractTopics(transformed)
display(new_transformed)

# COMMAND ----------

# PCA
# (the visualization is not very meaningful)
from pyspark.ml.feature import PCA
from pyspark.ml.feature import StandardScaler

# Normalization before PCA
counts = CountVectorizer(inputCol="items", outputCol="raw_features2",
                         vocabSize=10000, minDF=2.0)
counter = counts.fit(new_transformed)
counted = counter.transform(new_transformed)

scaler = StandardScaler(inputCol="raw_features2", outputCol="scaledFeatures",
                        withStd=True, withMean=False)
scalerModel = scaler.fit(counted)
scaledData = scalerModel.transform(counted)

pca = PCA(k=2, inputCol="scaledFeatures", outputCol="pca")  # fixed: was inputCol="raw_features2", which skipped the scaling step above
model = pca.fit(scaledData)
transformed_df = model.transform(scaledData)
display(transformed_df)

# COMMAND ----------

# PCA is ineffective for dimensionality reduction here! So if the clustering does not
# visualize well, that is expected. There is not much else we can tune to improve the PCA:
# the data is high-dimensional, the vectors are very sparse, and we cannot choose the components.
model.explainedVariance