# real example
users_noscaled = users_addedmonths

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import MinMaxScaler

# call the vector assembler
assembler = VectorAssembler(inputCols=users_noscaled.columns[7:],
                            outputCol='assembled_col')
# call the scaler
scaler = MinMaxScaler(inputCol="assembled_col",
                      outputCol="assembled_col_norm")
# build an assembled vector column in the dataframe
assembled = assembler.transform(users_noscaled)
# fit the scaler model
scaler_model = scaler.fit(assembled)
# apply the model to the assembled dataframe
users_wscaled = scaler_model.transform(assembled)
def scaleVecCol(self, columns, nameOutputCol):
    """
    This function groups the specified columns into a single vector column and then
    applies Spark's default MinMax scaling to that vector (see the example below).

    +---------+----------+
    |Price    |AreaLiving|
    +---------+----------+
    |1261706.9|16        |
    |1263607.9|16        |
    |1109960.0|19        |
    |978277.0 |19        |
    |885000.0 |19        |
    +---------+----------+

                |
                |
                V

    +----------------------------------------+
    |['Price', 'AreaLiving']                 |
    +----------------------------------------+
    |[0.1673858972637624,0.5]                |
    |[0.08966137157852398,0.3611111111111111]|
    |[0.11587093205757598,0.3888888888888889]|
    |[0.1139820728616421,0.3888888888888889] |
    |[0.12260126542983639,0.4722222222222222]|
    +----------------------------------------+
    only showing top 5 rows
    """
    # Check that the columns argument is a string or list datatype:
    self.__assertTypeStrOrList(columns, "columns")
    # Check that the columns to be processed are in the dataframe:
    self.__assertColsInDF(columnsProvided=columns, columnsDF=self.__df.columns)
    # Check that the nameOutputCol argument is a string datatype:
    self.__assertTypeStr(nameOutputCol, "nameOutputCol")

    # Model to use vectorAssembler:
    vecAssembler = VectorAssembler(inputCols=columns, outputCol="features_assembler")
    # Model for scaling the assembled feature column:
    mmScaler = MinMaxScaler(inputCol="features_assembler", outputCol=nameOutputCol)
    # Dataframe with the features_assembler column
    tempDF = vecAssembler.transform(self.__df)
    # Fitting the scaler model with the transformed dataframe
    model = mmScaler.fit(tempDF)

    exprs = list(filter(lambda x: x not in columns, self.__df.columns))
    exprs.extend([nameOutputCol])

    self.__df = model.transform(tempDF).select(*exprs)
    self.__addTransformation()  # checkpoint in case

    return self
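For readers outside the wrapper class, here is a minimal standalone sketch of the same assemble-then-scale pattern. The `houses` DataFrame is invented for illustration, and the docstring's exact scaled values come from a larger dataset, so this toy run prints different numbers.

# Standalone sketch of the assemble-then-scale pattern (illustrative data only).
from pyspark.ml.feature import MinMaxScaler, VectorAssembler
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("scale_vec_col_sketch").getOrCreate()
houses = spark.createDataFrame(
    [(1261706.9, 16), (1263607.9, 16), (1109960.0, 19), (978277.0, 19), (885000.0, 19)],
    ["Price", "AreaLiving"])

assembler = VectorAssembler(inputCols=["Price", "AreaLiving"],
                            outputCol="features_assembler")
scaler = MinMaxScaler(inputCol="features_assembler", outputCol="price_area_scaled")

assembled = assembler.transform(houses)
scaled = scaler.fit(assembled).transform(assembled)
scaled.select("Price", "AreaLiving", "price_area_scaled").show(truncate=False)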
dataFrame = spark.createDataFrame([(
    0,
    Vectors.dense([1.0, 0.1, -1.0]),
), (
    1,
    Vectors.dense([2.0, 1.1, 1.0]),
), (
    2,
    Vectors.dense([3.0, 10.1, 3.0]),
)], ["id", "features"])

# In[20]:

dataFrame.show()

# In[21]:

scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

# In[22]:

scaler

# In[23]:

# Compute summary statistics and generate MinMaxScalerModel
scalerModel = scaler.fit(dataFrame)

# In[24]:

scalerModel

# In[25]:
df_transformed.show()

logger.error("#### before splitting")
train, test = df_transformed.randomSplit(
    [float(sys.argv[1]), float(sys.argv[2])], seed=7)
#train60,test40 = df_transformed.randomSplit([0.6,0.4],seed=7)
#train70,test30 = df_transformed.randomSplit([0.7, 0.3], seed=7)
#train80,test20 = df_transformed.randomSplit([0.8,0.2],seed=7)
#train90,test10 = df_transformed.randomSplit([0.9,0.1],seed=7)
logger.error("#### after split")

logger.error("#### Random Forest")
from pyspark.ml.classification import RandomForestClassifier

minmax = MinMaxScaler(inputCol="features", outputCol="normFeatures")
rf = RandomForestClassifier(featuresCol='normFeatures', labelCol='label')

stages2 = []
#stages += string_indexer
#stages += one_hot_encoder
#stages2 += [vector_assembler]
stages2 += [minmax]
stages2 += [rf]

from pyspark.ml import Pipeline

pipeline2 = Pipeline().setStages(stages2)
rf_model60 = pipeline2.fit(train)
# Step - 4: Make Vectors from dataframe's columns using special Vector Assembler
assembler = VectorAssembler(inputCols=[
    "pclass_imputed", "sibsp_imputed", "parch_imputed", "sexIndexed_imputed",
    "embarkedIndexed_imputed", "age_imputed", "fare_imputed"
],
                            outputCol="unscaled_features")

# Step - 5: Define Polynomial Expansion with degree=2
polyExpansion = PolynomialExpansion(degree=2,
                                    inputCol="unscaled_features",
                                    outputCol="polyFeatures")

# Step - 6: Define Scaler
scaler = MinMaxScaler(inputCol="polyFeatures", outputCol="unnorm_features")

# Step - 7: Define Normalizer
normalizer = Normalizer(p=1.0, inputCol="unnorm_features", outputCol="features")

# Step - 8: Set up the Decision Tree Classifier
trainer = DecisionTreeClassifier(labelCol="survived", featuresCol="features")

# Step - 9: Build the Pipeline
pipeline = Pipeline(stages=[
    sexIndexer, embarkedIndexer, imputer,
def main():
    args = parse_arguments()
    setup_spark()
    df = read_file(args)

    # transform
    assembler = VectorAssembler(inputCols=["hour"], outputCol="hour_vector")
    df = assembler.transform(df)
    indexers = [StringIndexer(inputCol=column, outputCol=column + "_idx").fit(df)
                for column in list(set(df.columns) - set(['id', 'device_ip', 'hour', 'click', 'hour_vector',
                                                          'device_id', 'device_model', 'site_domain', 'site_id',
                                                          'app_id', 'c14', 'app_domain', 'c17', 'c20']))] \
        + [MinMaxScaler(inputCol='hour_vector', outputCol='hour_scalar').fit(df)]
    pipeline = Pipeline(stages=indexers)
    df = pipeline.fit(df).transform(df)
    func = udf(lambda v: float(v[0]), FloatType())
    df = df.withColumn('hour_std', func('hour_scalar'))
    df = df[[w for w in list(df.columns) if 'idx' in w] + ['hour_std', 'click']].cache()

    # to pandas and make config
    config_pd = df.agg(*(countDistinct(col(c)).alias(c) for c in df.columns)).toPandas()
    multi_index = []
    for c in config_pd.columns:
        if '_idx' in c:
            multi_index.append('sparse')
        elif c == 'click':
            multi_index.append('label')
        else:
            multi_index.append('dense')
    config_pd.columns = pd.MultiIndex.from_tuples(zip(multi_index, config_pd.columns))
    s = config_pd.iloc[0]
    dic = {l: s.xs(l).to_dict() for l in s.index.levels[0]}
    if not os.path.exists(args.output_path):
        os.system('mkdir {}'.format(args.output_path))
    with open(os.path.join(args.output_path, 'config.yaml'), 'w', encoding="utf-8") as fw:
        yaml.dump(dic, fw, default_flow_style=False, indent=4)

    # stats count
    total_num = df.count()
    pos_num = df.filter(df.click == 1).count()
    neg_num = df.filter(df.click != 1).count()
    print('#' * 20)
    print('raw total_num:{} pos_num:{} neg_num:{}'.format(total_num, pos_num, neg_num))

    # sample
    pos_df = df[df.click == 1]
    neg_df = df[df.click != 1].sample(False, 0.5, seed=1234)
    df = pos_df.union(neg_df)
    print('union total_num:{} pos_num:{} neg_num:{}'.format(df.count(), pos_df.count(), neg_df.count()))
    print('#' * 20)

    # split dataset
    train_df, val_df = df.randomSplit([0.9, 0.1])
    train_df.repartition(1).write.json(os.path.join(args.output_path, 'train'))
    val_df.repartition(1).write.json(os.path.join(args.output_path, 'val'))
# COMMAND ----------

# Import the RandomForestClassifier algorithm
from pyspark.ml.classification import RandomForestClassifier

# Assemble the following feature columns into one vector for the training data
assembler = VectorAssembler(inputCols=[
    "display_id", "document_id", "platform", "ad_id", "campaign_id",
    "advertiser_id"
],
                            outputCol="normfeatures")
#assembler = VectorAssembler(inputCols = ["clicked"],outputCol="label")

# Normalize the feature data
minMax = MinMaxScaler(inputCol=assembler.getOutputCol(), outputCol="nfeatures")

# Assemble the normalized feature data into the final features vector
featVect = VectorAssembler(inputCols=["nfeatures"], outputCol="features")

# Train the classification model with the Random Forest algorithm
dt = RandomForestClassifier(labelCol="label",
                            featuresCol="features",
                            impurity="gini",
                            featureSubsetStrategy="auto",
                            numTrees=10,
                            maxDepth=30,
                            maxBins=128,
                            seed=1234)

# The following command will create a pipeline with different stages
df.cache() print("Creating Splits") train, test = df.randomSplit([0.7, 0.3]) print("Selected Features Count: {0}".format(len(feature_cols))) print("Selected Features: {0}".format(feature_cols)) print("Building Pipeline") categorical_hasher = FeatureHasher(inputCols=categorical_cols, outputCol="categorical_features", categoricalCols=categorical_cols) continuous_vector = VectorAssembler(inputCols=continuous_cols, outputCol="continuous_vector") scaler = MinMaxScaler(min=0.0, max=1.0, inputCol=continuous_vector.getOutputCol(), outputCol="continuous_features") features = VectorAssembler(inputCols=feature_cols, outputCol="features") bayes = NaiveBayes(smoothing=1.0, featuresCol="features", labelCol="HasDetections", predictionCol="prediction", modelType="multinomial") pipeline = Pipeline( stages=[categorical_hasher, continuous_vector, scaler, features, bayes]) evaluator = MulticlassClassificationEvaluator(labelCol="HasDetections", predictionCol="prediction", metricName="accuracy") print("Configuring CrossValidation") params = ParamGridBuilder() \
def train_scaler(df, inputCol, outputCol): scaler = MinMaxScaler(inputCol=inputCol, outputCol=outputCol) return scaler.fit(df)
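A hedged usage sketch for this helper: it returns a fitted MinMaxScalerModel, which should then be reused to transform any held-out data. The DataFrames and column names below are assumptions for illustration.

# Illustrative usage of train_scaler; train_df/test_df and the "features"
# column are assumed, not part of the original code.
scaler_model = train_scaler(train_df, inputCol="features", outputCol="features_scaled")
train_scaled = scaler_model.transform(train_df)
# Reuse the same fitted model so test rows are scaled with the min/max
# statistics learned from the training data.
test_scaled = scaler_model.transform(test_df)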
df_cluster_pop = df_cluster_raw_pop.select( ith("features", lit(0)).alias('lon'), ith("features", lit(1)).alias('lat'), ith("features", lit(2)).alias('pop')) df_cluster_granny = df_cluster_raw_granny.select( ith("features", lit(0)).alias('lon'), ith("features", lit(1)).alias('lat'), ith("features", lit(2)).alias('granny')) # Iterating over columns to be scaled for i in ["pop", "granny"]: # VectorAssembler Transformation - Converting column to vector type assembler = VectorAssembler(inputCols=[i], outputCol=i + "_Vect") # MinMaxScaler Transformation scaler = MinMaxScaler(inputCol=i + "_Vect", outputCol=i + "_Scaled") # Pipeline of VectorAssembler and MinMaxScaler pipeline = Pipeline(stages=[assembler, scaler]) # Fitting pipeline on dataframe df_cluster = pipeline.fit(df_cluster).transform(df_cluster).withColumn( i + "_Scaled", unlist(i + "_Scaled")).drop(i + "_Vect") df_cluster = df_cluster.select(df_cluster.lon, df_cluster.lat, df_cluster.pop_Scaled.alias('pop'), df_cluster.granny_Scaled.alias('granny')) for row in df_cluster.collect(): feature = { "type": "Feature",
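The `unlist` UDF referenced in the loop above is not defined in this excerpt; presumably it pulls the single scaled value out of the one-element vector that MinMaxScaler produces for each column. A minimal sketch of such a helper, under that assumption:

# Hypothetical definition of the `unlist` helper used above: extract the first
# (and only) element of the scaled one-element vector as a double.
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

unlist = udf(lambda v: float(v[0]) if v is not None else None, DoubleType())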
    s = x[1] + y[1]
    return (z, s)

spark = SparkSession \
    .builder \
    .appName("KMeans") \
    .config("spark.some.config.option", "Angadpreet-KMeans") \
    .getOrCreate()
today = dt.datetime.today()

# Getting the data structure and scaling
spark_df = sc.parallelize(
    spark.read.json("Data/yelp_academic_dataset_user.json").select(
        "review_count", "average_stars", "yelping_since").rdd.map(
            lambda x: (x[0], x[1], (today - par.parse(x[2])).days)).collect()[:1200])
scaler = MinMaxScaler(inputCol="_1",
                      outputCol="scaled_1")
trial_df = spark_df.map(lambda x: pyspark.ml.linalg.Vectors.dense(x)).map(
    lambda x: (x, )).toDF()
scalerModel = scaler.fit(trial_df)
vec_df = spark.createDataFrame(
    scalerModel.transform(trial_df).select("scaled_1").rdd.map(
        lambda x: (float(x[0][0]), float(x[0][1]), float(x[0][2]))))

# Create a RowMatrix from the transpose of the scaled dataframe
spark_df = spark.createDataFrame(vec_df.toPandas().transpose()).rdd
vector_df = sc.parallelize(spark_df.map(lambda s: Vectors.dense(s)).collect())
mat = RowMatrix(vector_df)
bun = mat.rows.collect()

num_clusters = 4
pre = sc.parallelize(mat.columnSimilarities().entries.map(
row["max_active"], row["std_active"], row["min_idle"], row["mean_idle"], row["max_idle"], row["std_idle"], row["sflow_fpackets"], row["sflow_fbytes"], row["sflow_bpackets"], row["sflow_bbytes"], row["fpsh_cnt"], row["bpsh_cnt"], row["furg_cnt"], row["burg_cnt"], row["total_fhlen"], row["total_bhlen"], row["dscp"] ])) return obj fluxoRDD4 = fluxoDF.rdd.map(transformaVar) fluxoDF = spSession.createDataFrame(fluxoRDD4, ["rotulo", "atributos"]) scaler = MinMaxScaler(inputCol="atributos", outputCol="scaledFeatures", min=0.0, max=1.0) scalerModel = scaler.fit(fluxoDF) scaledData = scalerModel.transform(fluxoDF) # Criando o modelo #rfClassifer = RandomForestClassifier(labelCol = "rotulo", featuresCol = "scaledFeatures", probabilityCol = "probability", numTrees=20) layers = [38, 5, 4, 2] mlpClassifer = MultilayerPerceptronClassifier(labelCol="rotulo", featuresCol="scaledFeatures", maxIter=100, layers=layers, blockSize=128, seed=1234) modelo = mlpClassifer.fit(scaledData)
outputCol="features") assembled_train = assembler.transform(train_data) assembled_train.select("features", "PSSM_central_1_I").show(truncate=False) training_set = assembled_train.select("features", "PSSM_central_1_I") #Split de los datos train_final, test_final = training_set.randomSplit([0.80, 0.20], seed=13) train_final.describe().show() test_final.describe().show() train_final = train_final.selectExpr("PSSM_central_1_I as label", "features as features") test_final = test_final.selectExpr("PSSM_central_1_I as label", "features as features") scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures") scalerModel = scaler.fit(train_final) scaledTData = scalerModel.transform(train_final) scaledTData = scaledTData.select("label", "scaledFeatures") scaledTData = scaledTData.selectExpr("label as label", "scaledFeatures as features") scalerModel = scaler.fit(test_final) scaledFData = scalerModel.transform(test_final) scaledFData = scaledFData.select("label", "scaledFeatures") scaledFData = scaledFData.selectExpr("label as label", "scaledFeatures as features") #Clasificador 2 nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
# (u'Cops (1922)', 5.46740481439733)

# We noticed that our top-ranked movies have ratings higher than 5. This makes sense as there is no ceiling
# implied in our algorithm and one can imagine that certain combinations of factors would combine to create
# “better than anything you’ve seen yet” ratings.
# Nevertheless, we may have to constrain our ratings to a 1-5 range.

### SCALE PREDICTED RATINGS WITHIN DEFINED BOUNDS

new_user_recommendations_formatted_RDD_DF = new_user_recommendations_formatted_RDD.toDF(
    ['movie', "rating"])

to_vector = udf(lambda a: Vectors.dense(a), VectorUDT())
new_user_recommendations_formatted_RDD_DF = new_user_recommendations_formatted_RDD_DF.select(
    "movie", to_vector("rating").alias("rating"))

scaler = MinMaxScaler(inputCol="rating",
                      outputCol="scaled_rating",
                      min=1,
                      max=5)
model = scaler.fit(new_user_recommendations_formatted_RDD_DF)
new_user_recommendations_formatted_RDD_DF_scaled = model.transform(
    new_user_recommendations_formatted_RDD_DF)

print("Features scaled to range: [%f, %f]" %
      (scaler.getMin(), scaler.getMax()))
# Features scaled to range: [1.000000, 5.000000]

new_user_recommendations_formatted_RDD_DF_scaled.select(
    "rating", "scaled_rating").show()
# +--------------------+--------------------+
# |              rating|       scaled_rating|
# +--------------------+--------------------+
# |[1.8833597779874536]| [2.810434087306585]|
# |[2.4494414977308594]|[3.0641436235844264]|
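If a plain numeric column is needed downstream (for sorting or display), the scaled one-element vectors can be flattened back to doubles. A minimal sketch, assuming Spark 3.0+ for pyspark.ml.functions.vector_to_array:

# Flatten the 1-element "scaled_rating" vector back into a double column.
# vector_to_array requires Spark 3.0+; on older versions a UDF such as
# udf(lambda v: float(v[0]), DoubleType()) achieves the same thing.
from pyspark.ml.functions import vector_to_array

scaled_flat = new_user_recommendations_formatted_RDD_DF_scaled.withColumn(
    "scaled_rating_value", vector_to_array("scaled_rating")[0])
scaled_flat.select("movie", "scaled_rating_value").show(5)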
"bars_confidence_max", "beats_confidence_max", "bars_start_max", "segments_confidence_max", "segments_loudness_max_time_max", "tatums_confidence_max", "bars_confidence_min", "beats_confidence_min", "bars_start_min", "segments_confidence_min", "segments_loudness_max_time_min", "tatums_confidence_min"] assembler = VectorAssembler(inputCols=columns, outputCol="raw_features").setHandleInvalid("skip") df_scale = assembler.transform(feature_selector).select('label', 'raw_features') # Most classifiers use some form of a distance calculation and each numeric feature tends to have different # ranges, some more broad than others. Scaling these features helps ensure that each feature’s contribution is # weighted proportionally. # https://albertdchiu.medium.com/a-step-by-step-example-in-binary-classification-5dac0f1ba2dd scaler = MinMaxScaler(inputCol="raw_features", outputCol="scaled_features") scalerModel = scaler.fit(df_scale) df_scale = scalerModel.transform(df_scale).select('label', 'scaled_features').persist( pyspark.StorageLevel.DISK_ONLY) print("\n\nSanity check counter ", df_scale.count()) total_count = df_scale.count() zero_counter = df_scale.filter(col('label') == 0).count() ones_counter = df_scale.filter(col('label') == 1).count() print("Count 1s :", ones_counter) print("Count 0s", zero_counter) print("Sanity check sum 1s and 0s", zero_counter + ones_counter) # Not the best weight method. The # if(zero_counter > ones_counter): # print("More zeros!")
f27 = f26.drop('Cat9_1') f28 = f27.drop('Cat10_1') f29 = f28.drop('Cat11_1') f30 = f29.drop('Cat12_1') f31 = f30.drop('NVCat_1') df2 = f31.selectExpr("Claim_Amount as label","feature as features") assembler1 = VectorAssembler( inputCols=["label"], outputCol="label1") output1 = assembler1.transform(df2) output1 = output1.cache() f32 = output1.drop("label") scaler = MinMaxScaler(inputCol="label1", outputCol="label") scalerModel = scaler.fit(f32) scaledData = scalerModel.transform(f32) element=udf(lambda v:float(v[0]),FloatType()) new = scaledData.withColumn('label', element('label')) (trainingData, testData) = new.randomSplit([0.7, 0.3], 50) lr = LinearRegression(featuresCol = "features", labelCol = "label", maxIter=10, regParam=0.3, elasticNetParam=0.8) lrModel = lr.fit(trainingData) trainingSummary = lrModel.summary print("Question 2.2(a).............") print("RMSE for training data: %f" % trainingSummary.rootMeanSquaredError) predict = lrModel.transform(testData) evaluator_rmse = RegressionEvaluator\ (labelCol="label", predictionCol="prediction", metricName="rmse") rmse = evaluator_rmse.evaluate(predict) print("RMSE for test data = %g " % rmse)
def scale_vec_col(self, columns, name_output_col):
    """
    This function groups the specified columns into a single vector column and then
    applies Spark's default MinMax scaling to that vector (see the example below).

    +---------+----------+
    |Price    |AreaLiving|
    +---------+----------+
    |1261706.9|16        |
    |1263607.9|16        |
    |1109960.0|19        |
    |978277.0 |19        |
    |885000.0 |19        |
    +---------+----------+

                |
                |
                V

    +----------------------------------------+
    |['Price', 'AreaLiving']                 |
    +----------------------------------------+
    |[0.1673858972637624,0.5]                |
    |[0.08966137157852398,0.3611111111111111]|
    |[0.11587093205757598,0.3888888888888889]|
    |[0.1139820728616421,0.3888888888888889] |
    |[0.12260126542983639,0.4722222222222222]|
    +----------------------------------------+
    only showing top 5 rows
    """
    # Check that the columns argument is a string or list datatype:
    self._assert_type_str_or_list(columns, "columns")
    # Check that the columns to be processed are in the dataframe:
    self._assert_cols_in_df(columns_provided=columns, columns_df=self._df.columns)
    # Check that the name_output_col argument is a string datatype:
    self._assert_type_str(name_output_col, "name_output_col")

    # Model to use vectorAssembler:
    vec_assembler = VectorAssembler(inputCols=columns, outputCol="features_assembler")
    # Model for scaling the assembled feature column:
    mm_scaler = MinMaxScaler(inputCol="features_assembler", outputCol=name_output_col)
    # Dataframe with the features_assembler column
    temp_df = vec_assembler.transform(self._df)
    # Fitting the scaler model with the transformed dataframe
    model = mm_scaler.fit(temp_df)

    exprs = list(filter(lambda x: x not in columns, self._df.columns))
    exprs.extend([name_output_col])

    self._df = model.transform(temp_df).select(*exprs)
    self._add_transformation()  # checkpoint in case

    return self
spark = (SparkSession.builder.appName( "test app").enableHiveSupport().getOrCreate()) start_date = "2004-02-10 03:12:39" end_date = '2004-02-20 03:12:39' all_data = spark.sql("select * from demo.bearing where idx_date >= '%s' and idx_date < '%s'" \ % (start_date, end_date)) columns = all_data.columns # create scaled data tmp_data = all_data.rdd.map(lambda x: (x[0], Vectors.dense(x[1:]))).collect() scale_df = spark.createDataFrame(tmp_data, ['idx_date', '_features']) scaler = MinMaxScaler(inputCol="_features", outputCol="features") scalerModel = scaler.fit(scale_df) scaledData = scalerModel.transform(scale_df) train_data = scaledData.select( "idx_date", "features").filter("idx_date <= '2004-02-15 12:52:39'") test_data = scaledData.select("idx_date", "features").filter("idx_date >= '%s'" % start_date) \ .filter("idx_date <= '%s'" % end_date) iforest = IForest(contamination=0.1, maxFeatures=1.0, maxSamples=256, bootstrap=True) model = iforest.fit(train_data) model.hasSummary summary = model.summary
df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).show()

# Normalize Data
# Normalize columns
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

# MinMaxScaler Transformation
assembler = VectorAssembler(
    inputCols=["ups", "downs", "authorlinkkarma", "authorkarma"],
    outputCol="vector").setParams(handleInvalid="skip")
scaler = MinMaxScaler(min=0.0,
                      max=1.0,
                      inputCol="vector",
                      outputCol="vector_scaled")
pipeline = Pipeline(stages=[assembler, scaler])
scalerModel = pipeline.fit(df)
scaledData = scalerModel.transform(df)

# vector_scaled is our normalized data:
vectorData = scaledData.select("vector", "vector_scaled")

# when using azure databricks, use this call to visualize the data
#display(scaledData.select("vector_scaled"))

# Data Sampling for Experimentation
df = spark.read.parquet(dbfs_mnt_processed + 'redditcomments/')
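To inspect per-column scaled values rather than a single vector, the vector_scaled column can be expanded back into named columns. A sketch assuming Spark 3.0+ (pyspark.ml.functions.vector_to_array); column names follow the assembler inputs above:

# Expand the scaled vector into one column per original input so the
# normalized ups/downs/karma values can be inspected individually.
from pyspark.ml.functions import vector_to_array

input_cols = ["ups", "downs", "authorlinkkarma", "authorkarma"]
expanded = scaledData.withColumn("arr", vector_to_array("vector_scaled"))
expanded = expanded.select(
    *scaledData.columns,
    *[expanded["arr"][i].alias(c + "_scaled") for i, c in enumerate(input_cols)])
expanded.select([c + "_scaled" for c in input_cols]).show(5)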
input_labelled_points = input.map(convert_to_labeled_point)
print('**************Converted to labeled point************* \n',
      input_labelled_points.take(5))
'''
Part 3
- Choose two features and generate a heat map for each feature on a grey scale,
  showing the variation of each feature across 40 sample instances.
- Normalize features between 0 and 1, with 1 representing the darkest shade in the heat map.
Hint: https://spark.apache.org/docs/latest/ml-features.html#minmaxscaler
'''
lines = input.map(lambda line: line.split(','))
transformed = lines.map(lambda line: (line[0], Vectors.dense(line[1:])))
labelled_dataframe = sqlContext.createDataFrame(transformed, ["label", "features"])
scaler = MinMaxScaler(inputCol="features", outputCol="features_scaled")
scaler_mod = scaler.fit(labelled_dataframe.limit(40))
scaled_data = scaler_mod.transform(labelled_dataframe)
print('******Scaled Features******* : \n', scaled_data.show(5, False))

heat1 = np.asarray(
    labelled_dataframe.rdd.map(
        lambda f: (float(f.features[1]), float(f.features[2]))).take(40))
plt.imshow(heat1, cmap='gray')
plt.show()

heat2 = np.asarray(
    scaled_data.rdd.map(lambda f: (float(f.features[1]),
                                   float(f.features[2]))).take(40))
plt.imshow(heat2, cmap='gray')
plt.show()
# Data preprocessing df = prepocess_data(df) # Calculating statistics global_active_power_stat = get_basic_statistics(df, "Global_active_power") global_reactive_power_stat = get_basic_statistics(df, "Global_reactive_power") voltage_stat = get_basic_statistics(df, "Voltage") global_intensity_stat = get_basic_statistics(df, "Global_intensity") # Calculating Min-max normalization assembler = VectorAssembler(inputCols=df.columns[0:], outputCol="features") df_2 = assembler.transform(df) scaler = MinMaxScaler(min=0, max=1, inputCol='features', outputCol='features_minmax') scaler_model = scaler.fit(df_2) df_3 = scaler_model.transform(df_2) # Transforming Dense vector to dataframe min_max_df = df_3.rdd.map( lambda x: [float(y) for y in x['features_minmax']]).toDF(df.columns[0:]) # create files and print print_statistics([ global_active_power_stat, global_reactive_power_stat, voltage_stat, global_intensity_stat ], min_max_df) # for local testing
print('Training dataset size: {}'.format(train_dataCount)) print('Validation dataset size: {}'.format(validationDataCount)) print('Test dataset size: {}'.format(test_dataCount)) print('Training + Validation + Test = {}'.format(train_dataCount + validationDataCount + test_dataCount)) #################################################################################### ## part 2 print('*' * 100) print('Part 2 - Train the model and evaluate on validation dataset \n') # data processing pipeline assembler = VectorAssembler(inputCols=features, outputCol='unscaledFeatures') minMaxScaler = MinMaxScaler(inputCol='unscaledFeatures', outputCol='features') stages = [assembler, minMaxScaler] pipeline = Pipeline(stages=stages) procPipeline = pipeline.fit(train_data) train_data = procPipeline.transform(train_data) validationData = procPipeline.transform(validationData) test_data = procPipeline.transform(test_data) train_data = train_data.select('label', 'features') validationData = validationData.select('label', 'features') test_data = test_data.select('label', 'features') # train model and evaluate on validation data lr = LinearRegression(maxIter=100) model = lr.fit(train_data)
# $example off$ from pyspark.sql import SparkSession if __name__ == "__main__": spark = SparkSession \ .builder \ .appName("FMRegressorExample") \ .getOrCreate() # $example on$ # Load and parse the data file, converting it to a DataFrame. data = spark.read.format("libsvm").load( "data/mllib/sample_libsvm_data.txt") # Scale features. featureScaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures").fit(data) # Split the data into training and test sets (30% held out for testing) (trainingData, testData) = data.randomSplit([0.7, 0.3]) # Train a FM model. fm = FMRegressor(featuresCol="scaledFeatures", stepSize=0.001) # Create a Pipeline. pipeline = Pipeline(stages=[featureScaler, fm]) # Train model. model = pipeline.fit(trainingData) # Make predictions. predictions = model.transform(testData)
inputCols=[input_column], outputCol=temp_vector_col, handleInvalid="skip" ).transform(df) temp_normalized_vector_col = temp_col_name(assembled) trained_parameters = load_trained_parameters( trained_parameters, {"input_column": input_column, "min": min, "max": max,} ) scaler_model, scaler_model_loaded = load_pyspark_model_from_trained_parameters( trained_parameters, MinMaxScalerModel, "scaler_model" ) if scaler_model is None: scaler = MinMaxScaler( inputCol=temp_vector_col, outputCol=temp_normalized_vector_col, min=parse_parameter(float, min, "min", 0.0), max=parse_parameter(float, max, "max", 1.0), ) scaler_model = fit_and_save_model(trained_parameters, "scaler_model", scaler, assembled_wo_nans) output_df = transform_using_trained_model(scaler_model, assembled, scaler_model_loaded) # convert the resulting vector back to numeric temp_flattened_vector_col = temp_col_name(output_df) output_df = output_df.withColumn(temp_flattened_vector_col, vector_to_array(temp_normalized_vector_col)) # keep only the final scaled column. output_column = input_column if output_column is None or not output_column else output_column output_column_value = sf.col(temp_flattened_vector_col)[0].alias(output_column) output_df = output_df.withColumn(output_column, output_column_value) final_columns = list(dict.fromkeys((list(df.columns) + [output_column])))
.getOrCreate() df = sql.read \ .format("csv") \ .option("sep", ",") \ .option("inferSchema", "true") \ .option("header", "true") \ .load(train_path) # Datetime dt_trans = DateColumns(inputCol="click_time") dt_ass = VectorAssembler(inputCols=dt_trans.getOutputColumns(), outputCol="dt_cols", handleInvalid="skip") dt_minmax = MinMaxScaler(inputCol="dt_cols", outputCol="dt_scaled") dt_pipeline = Pipeline(stages=[dt_trans, dt_ass, dt_minmax]) cond_cols = ["cond_app", "cond_device", "cond_os", "cond_channel"] cond_app = Conditional(inputCol=TARGET, groupByCol=["app"], outputCol="cond_app") cond_device = Conditional(inputCol=TARGET, groupByCol=["device"], outputCol="cond_device") cond_os = Conditional(inputCol=TARGET, groupByCol=["os"], outputCol="cond_os") cond_channel = Conditional(inputCol=TARGET, groupByCol=["channel"], outputCol="cond_channel")
from pyspark.ml.feature import MinMaxScaler
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext("local", "samp")
sqlContext = SQLContext(sc)
data = sqlContext.read.format("libsvm").load(
    r"D:\Spark\spark-1.6.1-bin-hadoop2.6\data\mllib\sample_libsvm_data.txt")
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
scalerModel = scaler.fit(data)
scaledData = scalerModel.transform(data)
scaledData.show()
# Note: the leading 0.5 entries appear because those feature columns are constant
# (all zeros) in this dataset; for E_max == E_min, MinMaxScaler outputs 0.5 * (min + max).
"""OUTPUT
+-----+--------------------+--------------------+
|label|            features|      scaledFeatures|
+-----+--------------------+--------------------+
|  0.0|(692,[127,128,129...|[0.5,0.5,0.5,0.5,...|
|  1.0|(692,[158,159,160...|[0.5,0.5,0.5,0.5,...|
|  1.0|(692,[124,125,126...|[0.5,0.5,0.5,0.5,...|
|  1.0|(692,[152,153,154...|[0.5,0.5,0.5,0.5,...|
|  1.0|(692,[151,152,153...|[0.5,0.5,0.5,0.5,...|
|  0.0|(692,[129,130,131...|[0.5,0.5,0.5,0.5,...|
|  1.0|(692,[158,159,160...|[0.5,0.5,0.5,0.5,...|
|  1.0|(692,[99,100,101,...|[0.5,0.5,0.5,0.5,...|
|  0.0|(692,[154,155,156...|[0.5,0.5,0.5,0.5,...|
|  0.0|(692,[127,128,129...|[0.5,0.5,0.5,0.5,...|
|  1.0|(692,[154,155,156...|[0.5,0.5,0.5,0.5,...|
|  0.0|(692,[153,154,155...|[0.5,0.5,0.5,0.5,...|
|  0.0|(692,[151,152,153...|[0.5,0.5,0.5,0.5,...|
|  1.0|(692,[129,130,131...|[0.5,0.5,0.5,0.5,...|
|  0.0|(692,[154,155,156...|[0.5,0.5,0.5,0.5,...|
#df = scaler.transform(df)
df.select(["id", "ScaledNumFeatures"]).where(df.Id == "512").collect()

# # Question 4
# Using the StandardScaler method (scaling both the mean and the standard deviation) what's the normalized value for question Id = 512?

# In[27]:

scaler2 = StandardScaler(inputCol="TitleAndBodyLengthVector",
                         outputCol="ScaledNumFeatures2",
                         withStd=True)
scalerModel = scaler2.fit(df)
df = scalerModel.transform(df)
df.select(["id", "ScaledNumFeatures2"]).where(df.Id == "512").collect()

# # Question 5
# Using the MinMaxScaler method what's the normalized value for question Id = 512?

# In[29]:

from pyspark.ml.feature import MinMaxScaler

scaler3 = MinMaxScaler(inputCol="TitleAndBodyLengthVector",
                       outputCol="ScaledNumFeatures3")
scalerModel3 = scaler3.fit(df)
df = scalerModel3.transform(df)
df.select(["id", "ScaledNumFeatures3"]).where(df.Id == "512").collect()

# In[ ]:
    #'ARRIVAL_TIME', 'ARRIVAL_DELAY',
    #'DIVERTED',
    #'CANCELLED',
    #'CANCELLATION_REASON',
    #'AIR_SYSTEM_DELAY',
    #'SECURITY_DELAY',
    #'AIRLINE_DELAY','LATE_AIRCRAFT_DELAY','WEATHER_DELAY'
],
    outputCol="features")

# Normalize the features
scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")

# Apply PCA to reduce dimensionality
pca = PCA(k=7, inputCol="scaled_features", outputCol="pcaFeatures")

from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, GeneralizedLinearRegression

# Build a helper that returns dictionaries of models and of ParamGridBuilders so that
# they can be iterated together with their respective parameters.
def define_hyper_params():
    # Create a dictionary of the models
    modelo = {'dt': DecisionTreeRegressor(featuresCol="pcaFeatures", labelCol='DEPARTURE_DELAY'),
# def quintile_agg(df_in,gr,colm): # qua=df_in.groupBy(gr).agg(*[mean(F.col(i)) for i in colm]).sort(F.col(gr)) # return qua # quintile_grouped = quintile_agg(df_input,grp,num_cols) # quintile_grouped.show(5) # # quintile_grouped.toPandas().to_csv('quintile_grouped.csv',index=False)#output_dir+'quintile_grouped.csv') ## prepare the data in vector dense from pyspark.ml.linalg import Vectors def transData(data): return data.rdd.map(lambda r: [r[0],Vectors.dense(r[1:])]).toDF(['CustomerID','rfm']) #Return a new RDD by applying a function to each element of this RDD. transformed=transData(rfm) transformed.show(5) ## normalization from pyspark.ml.feature import MinMaxScaler scaler = MinMaxScaler(inputCol="rfm",outputCol="features") scalerModel = scaler.fit(transformed) scaledData = scalerModel.transform(transformed) scaledData.show(5,False) # results will not be truncated scalerModel.save('filepath/scaling') ###ML ## find optimal parameter from pyspark.ml.clustering import KMeans cost = np.zeros(10) for k in range(2,10): kmeans = KMeans().setK(k)\ .setSeed(1) \ .setFeaturesCol("features")\ .setPredictionCol("cluster") model = kmeans.fit(scaledData) cost[k] = model.computeCost(scaledData)
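KMeansModel.computeCost was deprecated in Spark 3.0 in favor of ClusteringEvaluator. A sketch of an alternative elbow-style loop using the silhouette metric, under the assumption that the scaledData DataFrame from above (with its "features" column) is available:

# Alternative to model.computeCost (deprecated since Spark 3.0): score each k
# with the silhouette metric from ClusteringEvaluator.
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

evaluator = ClusteringEvaluator(featuresCol="features",
                                predictionCol="cluster",
                                metricName="silhouette")
for k in range(2, 10):
    kmeans = KMeans(k=k, seed=1, featuresCol="features", predictionCol="cluster")
    model = kmeans.fit(scaledData)
    silhouette = evaluator.evaluate(model.transform(scaledData))
    print(k, silhouette)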
def preprocess_data(self):
    rawDataDF = GetData().get_input_data()

    # Scale the numeric "age" column into [0, 1]
    assembler = VectorAssembler(inputCols=["age"], outputCol="features")
    outputDF = assembler.transform(rawDataDF)
    outputDF = outputDF.drop('age')
    scaler = MinMaxScaler(inputCol="features", outputCol="scaled_age")
    scalerModel = scaler.fit(outputDF.select("features"))
    scaledDF = scalerModel.transform(outputDF)
    scaledDF = scaledDF.drop('features')
    udf1 = udf(lambda x: float(x[0]), FloatType())
    scaledDF = scaledDF.withColumn("scaled_age", udf1(col('scaled_age')))

    # String-index each categorical column in turn, then drop the original column
    indexedDF = scaledDF
    for column in ["sex", "address", "Pstatus", "famsize", "guardian",
                   "schoolsup", "famsup", "romantic", "internet", "higher",
                   "nursery", "activities", "Mjob", "Fjob", "paid"]:
        indexer = StringIndexer(inputCol=column, outputCol="indexed_" + column)
        indexedDF = indexer.fit(indexedDF).transform(indexedDF)
        indexedDF = indexedDF.drop(column)

    indexedDF = indexedDF.drop("school", 'reason')
    return indexedDF
# limitations under the License. # from __future__ import print_function from pyspark import SparkContext from pyspark.sql import SQLContext # $example on$ from pyspark.ml.feature import MinMaxScaler # $example off$ if __name__ == "__main__": sc = SparkContext(appName="MinMaxScalerExample") sqlContext = SQLContext(sc) # $example on$ dataFrame = sqlContext.read.format("libsvm").load( "data/mllib/sample_libsvm_data.txt") scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures") # Compute summary statistics and generate MinMaxScalerModel scalerModel = scaler.fit(dataFrame) # rescale each feature to range [min, max]. scaledData = scalerModel.transform(dataFrame) scaledData.show() # $example off$ sc.stop()
def __init__(self, inputCol, outputCol, s_min=0, s_max=1):
    # Default the upper bound to 1; a 0..0 range would collapse every value to 0.
    self.mmModel = MinMaxScaler(inputCol=inputCol, outputCol=outputCol)
    self.mmModel.setMin(s_min)
    self.mmModel.setMax(s_max)
    self.in_column = inputCol
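A hedged usage sketch for such a wrapper; the class name (ScalerWrapper) and the input DataFrame with a "feat_vec" vector column are assumptions, not part of the original code.

# Illustrative use of the configured estimator held by the wrapper.
wrapper = ScalerWrapper(inputCol="feat_vec", outputCol="feat_scaled")  # hypothetical class name
fitted = wrapper.mmModel.fit(df)   # mmModel is the underlying MinMaxScaler estimator
df_scaled = fitted.transform(df)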
# Rename the target column and assemble the feature columns into a single vector column.
train_set = train_set.withColumnRenamed("_c631", "label")
assembler = VectorAssembler(
    inputCols=['_c186', '_c245', '_c459', '_c221', '_c490', '_c429'],
    outputCol='features')
train_set = assembler.transform(train_set)

# The same for the test set
test_set = test_set.withColumnRenamed("_c631", "label")
assembler = VectorAssembler(
    inputCols=['_c186', '_c245', '_c459', '_c221', '_c490', '_c429'],
    outputCol='features')
test_set = assembler.transform(test_set)

# Add a column with the features scaled between 0 and 1.
# The scaler is fitted on the training set only and then applied to both sets,
# so no test statistics leak into the preprocessing.
scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")
scalerModelTrain = scaler.fit(train_set)
train_set = scalerModelTrain.transform(train_set)
test_set = scalerModelTrain.transform(test_set)

###### Model training

### Logistic regression 1

# Training
lr = LogisticRegression(maxIter=10,
                        regParam=0.3,
                        elasticNetParam=0.8,
                        family="binomial")
lrModel_1 = lr.fit(train_set)

# ROC curve
from pyspark.ml.feature import MinMaxScaler from pyspark.ml.linalg import Vectors # $example off$ from pyspark.sql import SparkSession if __name__ == "__main__": spark = SparkSession\ .builder\ .appName("MinMaxScalerExample")\ .getOrCreate() # $example on$ dataFrame = spark.createDataFrame([ (0, Vectors.dense([1.0, 0.1, -1.0]),), (1, Vectors.dense([2.0, 1.1, 1.0]),), (2, Vectors.dense([3.0, 10.1, 3.0]),) ], ["id", "features"]) scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures") # Compute summary statistics and generate MinMaxScalerModel scalerModel = scaler.fit(dataFrame) # rescale each feature to range [min, max]. scaledData = scalerModel.transform(dataFrame) print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax())) scaledData.select("features", "scaledFeatures").show() # $example off$ spark.stop()
bucketer = QuantileDiscretizer().setNumBuckets(5).setInputCol("id") fittedBucketer = bucketer.fit(contDF) fittedBucketer.transform(contDF).show() # COMMAND ---------- from pyspark.ml.feature import StandardScaler sScaler = StandardScaler().setInputCol("features") sScaler.fit(scaleDF).transform(scaleDF).show() # COMMAND ---------- from pyspark.ml.feature import MinMaxScaler minMax = MinMaxScaler().setMin(5).setMax(10).setInputCol("features") fittedminMax = minMax.fit(scaleDF) fittedminMax.transform(scaleDF).show() # COMMAND ---------- from pyspark.ml.feature import MaxAbsScaler maScaler = MaxAbsScaler().setInputCol("features") fittedmaScaler = maScaler.fit(scaleDF) fittedmaScaler.transform(scaleDF).show() # COMMAND ---------- from pyspark.ml.feature import ElementwiseProduct
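Returning to the MinMaxScaler cell above: with setMin(5)/setMax(10), each feature value e is rescaled as (e - E_min) / (E_max - E_min) * (max - min) + min. A small sketch verifying this on a toy column; the toy data is invented for illustration and an existing `spark` session is assumed.

# Toy check of the MinMaxScaler formula with a custom [5, 10] output range.
# Column values 1.0, 2.0, 4.0 have E_min=1.0, E_max=4.0, so 2.0 maps to
# (2.0 - 1.0) / (4.0 - 1.0) * (10 - 5) + 5 = 6.666...
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors

toyDF = spark.createDataFrame(
    [(Vectors.dense([1.0]),), (Vectors.dense([2.0]),), (Vectors.dense([4.0]),)],
    ["features"])
minMaxToy = MinMaxScaler().setMin(5).setMax(10).setInputCol("features").setOutputCol("scaled")
minMaxToy.fit(toyDF).transform(toyDF).show(truncate=False)
# Expected scaled values: [5.0], [6.666...], [10.0]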