def rescale_df(data):
    """Rescale the data."""
    standardScaler = StandardScaler(inputCol="features",
                                    outputCol="features_scaled")
    scaler = standardScaler.fit(data)
    scaled_df = scaler.transform(data)
    return scaled_df
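# Usage sketch for rescale_df (assumptions: an active SparkSession `spark`;
# the toy data are illustrative, not from the original code):
def _example_rescale_df(spark):
    from pyspark.ml.linalg import Vectors
    df = spark.createDataFrame(
        [(Vectors.dense([1.0, 10.0]),), (Vectors.dense([2.0, 20.0]),)],
        ["features"])
    # Adds a "features_scaled" vector column with unit-variance features
    return rescale_df(df)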
def createPipeline(irisData, lrElasticNetParam, lrRegParam, lrMetric='accuracy'):
    '''Creates a pipeline for converting the data into features and label
    with the required format

    Args:
        irisData - Input data for the feature and label processing
        lrElasticNetParam - ElasticNet parameter of LR, 0-L2 penalty and 1-L1 penalty
        lrRegParam - Regularization parameter
        lrMetric - Metric name for the multiclass evaluator
    '''
    strIndexer = StringIndexer().setInputCol('species').setOutputCol(
        'label').fit(irisData)
    va = VectorAssembler(
        inputCols=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
        outputCol='vec_features')
    ss = StandardScaler().setInputCol(va.getOutputCol()).setOutputCol(
        'features').fit(va.transform(irisData))
    lr = LogisticRegression().setFeaturesCol('features')
    labelConverter = IndexToString(inputCol='prediction',
                                   outputCol='predictedLabel',
                                   labels=strIndexer.labels)
    stages = [strIndexer, va, ss, lr, labelConverter]
    pipeline = Pipeline().setStages(stages)
    params = ParamGridBuilder() \
        .addGrid(lr.elasticNetParam, lrElasticNetParam) \
        .addGrid(lr.regParam, lrRegParam) \
        .build()
    evaluator = MulticlassClassificationEvaluator(labelCol='label',
                                                  predictionCol='prediction',
                                                  metricName=lrMetric)
    return pipeline, params, evaluator
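# Usage sketch for createPipeline (assumptions: an `irisData` frame with the
# column names used above; the grid values and CrossValidator wiring are
# illustrative):
def _example_createPipeline(irisData):
    from pyspark.ml.tuning import CrossValidator
    pipeline, params, evaluator = createPipeline(
        irisData, lrElasticNetParam=[0.0, 1.0], lrRegParam=[0.01, 0.1])
    cv = CrossValidator(estimator=pipeline, estimatorParamMaps=params,
                        evaluator=evaluator, numFolds=3)
    return cv.fit(irisData)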
def normalize_score(df):
    from pyspark.ml.feature import VectorAssembler
    from pyspark.ml.feature import StandardScaler

    assembler = VectorAssembler(inputCols=["score"], outputCol="score_v")
    output = assembler.transform(df)

    # Center each value around the mean (withMean=True) without scaling to
    # unit standard deviation (withStd=False).
    scaler = StandardScaler(inputCol="score_v",
                            outputCol="popularity_score",
                            withStd=False,
                            withMean=True)

    # Compute summary statistics by fitting the StandardScaler
    scalerModel = scaler.fit(output)

    # Apply the mean-centering to each row
    scaledData = scalerModel.transform(output)
    return scaledData
def train(cls, spark, sdf, cat_colnames, num_colnames):
    string_indexer_list = list()
    for cat_colname in cat_colnames:
        string_indexer = StringIndexer(inputCol=cat_colname,
                                       outputCol=cat_colname + "_index",
                                       handleInvalid="skip")
        string_indexer_list.append(string_indexer)

    out = []
    pipe = []
    if len(num_colnames) > 0:
        assembler = VectorAssembler(inputCols=num_colnames,
                                    outputCol="features_vec")
        standard_scaler = StandardScaler(inputCol="features_vec",
                                         outputCol="features_zs",
                                         withMean=True,
                                         withStd=True)
        out = [standard_scaler.getOutputCol()]
        pipe = [assembler, standard_scaler]

    assembler_2 = VectorAssembler(
        inputCols=[x.getOutputCol() for x in string_indexer_list] + out,
        outputCol="features")
    estimator = KMeans(featuresCol="features", predictionCol="cluster_id", k=4)
    clustering_pipeline = Pipeline(
        stages=string_indexer_list + pipe + [assembler_2] + [estimator])
    clustering_pipeline_model = clustering_pipeline.fit(sdf)
    return KMeansPipeline(pipeline_model=clustering_pipeline_model)
def __init__(self, file_name, spark_context, maxIter=100, regParam=0.0,
             tol=1e-6, threshold=0.0, aggregationDepth=2):
    self.sqlContext = SQLContext(spark_context)
    self.spark_context = spark_context
    self.data = self.sqlContext.read.options(header='true',
                                             inferschema='true',
                                             delimiter=',').csv(file_name)
    self.data.cache()
    # `features` is expected to be a module-level list of feature column names.
    self.lr_data = self.data.select(col("Class").alias("label"), *features)
    vectorAssembler = VectorAssembler(inputCols=features,
                                      outputCol="unscaled_features")
    standardScaler = StandardScaler(inputCol="unscaled_features",
                                    outputCol="features")
    self.settings = [('maxIter', maxIter), ('regParam', regParam),
                     ('tol', tol), ('threshold', threshold),
                     ('aggregationDepth', aggregationDepth)]
    self.SVM = LinearSVC(maxIter=maxIter, regParam=regParam, tol=tol,
                         threshold=threshold,
                         aggregationDepth=aggregationDepth)
    stages = [vectorAssembler, standardScaler, self.SVM]
    pipeline = Pipeline(stages=stages)
    self.model = pipeline.fit(self.lr_data)
def df_train_test():
    df_train = spark.read.parquet(os.path.join("datasets", "train.vector.parquet"))
    df_test = spark.read.parquet(os.path.join("datasets", "test.vector.parquet"))

    cols = ["vx" + str(i) for i in range(10)]
    assembler = VectorAssembler(inputCols=cols, outputCol="vx_t1")
    dct = DCT(inverse=False, inputCol="vx_t1", outputCol="vx_t2")
    slicer = VectorSlicer(inputCol="vx_t2", outputCol="vx_t3",
                          indices=list(range(40000)))
    scaler = StandardScaler(inputCol="vx_t3", outputCol="vx",
                            withStd=True, withMean=False)
    pipeline = Pipeline(stages=[assembler, dct, slicer, scaler])
    p_model = pipeline.fit(df_train)

    # Drop the raw input columns and the intermediate pipeline columns
    intermediate_cols = cols + ["vx_t1", "vx_t2", "vx_t3"]
    df_train = p_model.transform(df_train).drop(*intermediate_cols)
    df_test = p_model.transform(df_test).drop(*intermediate_cols)

    df_train.write.mode("overwrite").parquet(
        os.path.join("datasets", "train.vector.dct.parquet"))
    df_test.write.mode("overwrite").parquet(
        os.path.join("datasets", "test.vector.dct.parquet"))
    df_train.printSchema()
    df_test.printSchema()
def create_standard_pipeline(self, cross_validate=False):
    """
    This method creates a standard pipeline, standard meaning:
    vectorize, standardize and model...

    :return: Pipeline for pyspark, ParameterGrid for pyspark pipeline
    """
    # Vectorize the feature columns
    vectorizer = VectorAssembler(inputCols=self._feature_cols,
                                 outputCol='v_features')
    # Cast the vector from mllib to ml
    converter = ConvertAllToVecToMl(inputCol=vectorizer.getOutputCol(),
                                    outputCol='casted')
    # Standardize estimator
    standardizes = StandardScaler(withMean=self._standardize,
                                  withStd=self._standardize,
                                  inputCol=converter.getOutputCol(),
                                  outputCol="scaled")
    # Keep only scalar parameters; tuple values are grid-search ranges
    dict_parameters = dict(
        filter(lambda x: not isinstance(x[1], tuple), self._params.items()))
    dict_parameters['featuresCol'] = standardizes.getOutputCol()
    dict_parameters['labelCol'] = self._label_col[0]  # HACK: assumes a single label column

    # Instantiate the model class by name, e.g. classification.LogisticRegression
    model = eval("classification." + self._algorithm)(**dict_parameters)
    pipe = Pipeline(stages=[vectorizer, converter, standardizes, model])
    return pipe
def train(self, df):
    df = self.build_features_vectors(df)
    scaler = StandardScaler()
    scaler.setInputCol(self.features_values_column)
    scaler.setOutputCol(self.features_values_scaled)
    scaler.setWithMean(self.scaler_with_mean)
    scaler.setWithStd(self.scaler_with_std)
    self.scaler_model = scaler.fit(df)
    df = self.scaler_model.transform(df).persist(
        StorageLevelFactory.get_storage_level(self.storage_level))
    if len(self.categorical_features):
        self._create_indexes(df)
        self._add_categorical_features(df, self.features_values_scaled)
    iforest = IForest(
        featuresCol=self.features_values_scaled,
        predictionCol=self.prediction_column,
        # anomalyScore=self.score_column,
        numTrees=self.num_trees,
        maxSamples=self.max_samples,
        maxFeatures=self.max_features,
        maxDepth=self.max_depth,
        contamination=self.contamination,
        bootstrap=self.bootstrap,
        approxQuantileRelativeError=self.approximate_quantile_relative_error,
        # numCategoricalFeatures=len(self.categorical_features)
    )
    iforest.setSeed(self.seed)
    params = {'threshold': self.threshold}
    self.iforest_model = iforest.fit(df, params)
    df.unpersist()
def standardScaler(self):
    from pyspark.ml.feature import StandardScaler, MinMaxScaler, MaxAbsScaler

    dataFrame = self.session.read.format("libsvm").load(
        self.dataDir + "/data/mllib/sample_libsvm_data.txt")

    # StandardScaler: scale each feature to unit standard deviation
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                            withStd=True, withMean=False)
    scalerModel = scaler.fit(dataFrame)
    scaledData = scalerModel.transform(dataFrame)
    scaledData.show()

    # MinMaxScaler: rescale each feature to the range [min, max]
    scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
    scalerModel = scaler.fit(dataFrame)
    scaledData = scalerModel.transform(dataFrame)
    print("Features scaled to range: [%f, %f]" % (scaler.getMin(),
                                                  scaler.getMax()))
    scaledData.select("features", "scaledFeatures").show()

    # MaxAbsScaler: rescale each feature to the range [-1, 1]
    scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")
    scalerModel = scaler.fit(dataFrame)
    scaledData = scalerModel.transform(dataFrame)
    scaledData.select("features", "scaledFeatures").show()
def build_standard_scaler(self, df, with_mean=False, with_std=True,
                          persist_estimator_path=None, input_col='features',
                          output_col='scaled_features'):
    """
    Standard Scaler estimator builder and transformer for dense feature vectors.

    Warnings: It will build a dense output, so take care when applying to
    sparse input.

    :param df: Spark DataFrame object auto-reference from DataFrame class
    :param with_mean: False by default. Centers the data with mean before scaling
    :param with_std: True by default. Scales the data to unit standard deviation
    :param persist_estimator_path: Persist model estimator metadata path
    :param input_col: Name for input column to scale
    :param output_col: Name of output column to create with scaled features
    :return: Standard Scaler model
    """
    std_scaler = StandardScaler(withMean=with_mean, withStd=with_std,
                                inputCol=input_col, outputCol=output_col)
    if persist_estimator_path:
        self.__logger.info("Compute Feature Standard ScalerModel Metadata")
        self.__logger.warning(
            f"Persist Metadata Model Path: {persist_estimator_path}")
        std_scaler.fit(df).write().overwrite().save(persist_estimator_path)
        self.__logger.info("Loading Scaler Estimator For Prediction")
        return StandardScalerModel.load(persist_estimator_path).transform(df)
    self.__logger.info("Compute Feature Standard Scaler DataFrame")
    return std_scaler.fit(df).transform(df)
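# Usage sketch for build_standard_scaler (assumptions: `fe` is an instance of
# the class that defines the method and `df` already has an assembled
# "features" column; the persist path is illustrative):
def _example_build_standard_scaler(fe, df):
    # Fit and transform in one pass, centering as well as scaling
    scaled_df = fe.build_standard_scaler(df, with_mean=True)
    # Persist the fitted model, then transform via the reloaded model
    persisted_df = fe.build_standard_scaler(
        df, persist_estimator_path="/tmp/models/std_scaler")
    return scaled_df, persisted_df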
def main(spark, data_file, model_file):
    '''Main routine for unsupervised training

    Parameters
    ----------
    spark : SparkSession object
    data_file : string, path to the parquet file to load
    model_file : string, path to store the serialized model file
    '''
    DF = spark.read.parquet(data_file)
    mfcc_cols = ["mfcc_%02d" % i for i in range(20)]
    DF = DF.select(*mfcc_cols)
    assembler = VectorAssembler(inputCols=mfcc_cols, outputCol="features")
    scaler = StandardScaler(inputCol="features", outputCol="features_scaled",
                            withStd=True, withMean=False)
    # Cluster on the scaled features rather than the raw assembled vector
    kmeans = KMeans(featuresCol="features_scaled", k=100, seed=1)
    pipeline = Pipeline(stages=[assembler, scaler, kmeans])
    model = pipeline.fit(DF)
    model.save(model_file)
def preprocess(df, should_undersample, scaler=None):
    """
    Scales the data and balances it using random undersampling (RUS)
    """
    # Assemble the features into a single vector column for MLlib:
    assembler = VectorAssembler(inputCols=[
        "PSSM_r1_1_K", "PSSM_r2_-1_R", "PSSM_central_2_D",
        "PSSM_central_0_A", "PSSM_r1_1_W", "PSSM_central_-1_V"
    ], outputCol="features")
    out = assembler.transform(df).select("features", "class")

    # Random undersampling (RUS)
    # Before: POS = 550,140, NEG = 1,100,591
    # After:  POS = 550,140, NEG = 549,668
    if should_undersample:
        positive = out.filter(out["class"] == 1.0)
        negative = out.filter(out["class"] == 0.0)
        fraction = float(positive.count()) / float(negative.count())
        negative = negative.sample(withReplacement=False,
                                   fraction=fraction, seed=89)
        out = negative.union(positive)

    # Scale, fitting a new scaler only when one is not supplied:
    if scaler is None:
        scaler = StandardScaler(withMean=True, withStd=True,
                                inputCol="features",
                                outputCol="scaled_features")
        scaler = scaler.fit(out)
    out = scaler.transform(out)
    return out, scaler
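# Usage sketch for preprocess (assumptions: `train_df` and `test_df` already
# carry the PSSM columns listed above):
def _example_preprocess(train_df, test_df):
    # Fit the scaler on the training split, then reuse the fitted model on
    # the test split so both share the same statistics.
    train_out, fitted_scaler = preprocess(train_df, should_undersample=True)
    test_out, _ = preprocess(test_df, should_undersample=False,
                             scaler=fitted_scaler)
    return train_out, test_out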
def make_pipeline(spark_df):
    for c in spark_df.columns:
        spark_df = spark_df.withColumn(c, spark_df[c].cast("float"))

    stages = []
    cols = ['acc_now_delinq', 'acc_open_past_24mths', 'annual_inc',
            'avg_cur_bal', 'funded_amnt']

    # Assemble the numeric columns into a single feature vector:
    assembler = VectorAssembler(inputCols=cols, outputCol="features")
    stages += [assembler]

    # Scale the features:
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                            withStd=True, withMean=True)
    stages += [scaler]

    # Logistic regression:
    lr = LogisticRegression(featuresCol='scaledFeatures',
                            labelCol='is_default', maxIter=10,
                            regParam=0.1, elasticNetParam=0.1)
    stages += [lr]

    # Create and run the pipeline:
    pipeline = Pipeline(stages=stages)
    pipelineModel = pipeline.fit(spark_df)
    return pipelineModel
def prepare_data():
    """Commodity function to read the data from the files and prepare the
    features for the kmeans model fit.
    """
    # Read data from files.
    _data = load_data()

    # The following features are not normally distributed, so they are
    # log-scaled to bring them closer to a normal distribution, which helps
    # the kmeans algorithm.
    _data = _data.withColumn('log_age', F.log('age')) \
        .withColumn('log_avg_buy', F.log('avg_buy')) \
        .withColumn('log_min_buy', F.log('min_buy')) \
        .withColumn('log_max_buy', F.log('max_buy'))

    # Select the features to use in kmeans. The features are also standard
    # scaled: mean centered and scaled to unit standard deviation.
    features = _data.columns[4:]
    assembler = VectorAssembler(inputCols=features,
                                outputCol='features_unscaled')
    assembled = assembler.transform(_data)
    scaler = StandardScaler(inputCol='features_unscaled',
                            outputCol='features',
                            withStd=True, withMean=True)
    scaler_model = scaler.fit(assembled)
    scaled_data = scaler_model.transform(assembled)
    return scaled_data, features
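# Usage sketch for prepare_data (assumptions: KMeans is available from
# pyspark.ml.clustering; k=4 is illustrative, not from the original code):
def _example_prepare_data():
    from pyspark.ml.clustering import KMeans
    scaled_data, features = prepare_data()
    kmeans = KMeans(featuresCol='features', k=4, seed=1)
    return kmeans.fit(scaled_data)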
def test_standard_scaler(self):
    data = self.spark.createDataFrame(
        [(0, Vectors.dense([1.0, 0.1, -1.0]),),
         (1, Vectors.dense([2.0, 1.1, 1.0]),),
         (2, Vectors.dense([3.0, 10.1, 3.0]),)],
        ["id", "features"])
    scaler = StandardScaler(inputCol='features', outputCol='scaled_features')
    model = scaler.fit(data)

    # the input names must match the inputCol(s) above
    model_onnx = convert_sparkml(model, 'Sparkml StandardScaler',
                                 [('features', FloatTensorType([1, 3]))])
    self.assertTrue(model_onnx is not None)

    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().scaled_features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlStandardScaler")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['scaled_features'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def z_score_encoder(self, column_names, mean=True, sd=True):
    """
    Normalize features in column_names

    Args:
        column_names: List(String), feature names for scale columns
        mean: Boolean, default True
        sd: Boolean, default True

    Returns:
        list of pipeline stages
    """
    stages = list()
    footer = self._z_score_footer
    feature_names = [(name, name + footer) for name in column_names]
    for inner_name, output_name in feature_names:
        vector = VectorAssembler().setInputCols([inner_name]) \
            .setOutputCol(inner_name + "_vector")
        z_score = StandardScaler().setInputCol(inner_name + "_vector") \
            .setOutputCol(output_name) \
            .setWithMean(mean) \
            .setWithStd(sd)
        stages.extend([vector, z_score])
    return stages
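# Usage sketch for z_score_encoder (assumptions: `enc` is an instance of the
# class that defines the method, and `df` has numeric columns "age" and
# "income"; both column names are illustrative):
def _example_z_score_encoder(enc, df):
    from pyspark.ml import Pipeline
    stages = enc.z_score_encoder(["age", "income"])
    return Pipeline(stages=stages).fit(df).transform(df)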
def prepare_spark_pipeline_for_DT():
    print('----------Preparing spark pipeline for DT----------')
    label_indexer = StringIndexer(inputCol="price", outputCol="label",
                                  handleInvalid="keep")
    vector_assembler = VectorAssembler(inputCols=features,
                                       outputCol="unscaled_features")
    standard_scaler = StandardScaler(inputCol="unscaled_features",
                                     outputCol="features")
    DT_model = DecisionTreeRegressor(maxDepth=8)
    stages = [label_indexer, vector_assembler, standard_scaler, DT_model]
    pipeline = Pipeline(stages=stages)
    estimator_param = ParamGridBuilder() \
        .addGrid(DT_model.maxDepth, [8, 16]) \
        .addGrid(DT_model.impurity, ["variance"]) \
        .build()
    # Named `evaluator` to avoid shadowing the built-in eval()
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="mse")
    return CrossValidator(estimator=pipeline,
                          estimatorParamMaps=estimator_param,
                          evaluator=evaluator, numFolds=3), evaluator
def test_build_pipeline_by_dtypes_with_obvious_transformers(
        spark, string_transformers, number_transformers):
    df = spark.createDataFrame(
        [("Benjamin", 178.1, 0), ("Omar", 178.1, 1)],
        schema=["name", "height", "target"],
    )
    pipeline = pipeasy_spark.build_pipeline_by_dtypes(
        df,
        exclude_columns=["target"],
        string_transformers=[StringIndexer(), OneHotEncoderEstimator()],
        numeric_transformers=[VectorAssembler(), StandardScaler()],
    )
    trained_pipeline = pipeline.fit(df)
    df_transformed = trained_pipeline.transform(df)

    def get_target_values(dataframe):
        return dataframe.rdd.map(lambda row: row["target"]).collect()

    assert set(df.columns) == set(df_transformed.columns)
    assert df.count() == df_transformed.count()
    assert get_target_values(df) == get_target_values(df_transformed)
def generate_train_test(data, multiplier_minority, fraction_majority,
                        label_col='label', minority_tag=1, train_perc=0.7):
    '''Train/test split on the data (after the features have been assembled)

    multiplier_minority: multiplier applied to the minority data when resampling
    fraction_majority: sample fraction for the majority group
    label_col: column name of the label column
    minority_tag: tag that has very few representatives
    train_perc: percentage of the data that goes to the training set
    '''
    po = data.filter("{} == {}".format(label_col, minority_tag))
    ne = data.filter("{} != {}".format(label_col, minority_tag))
    training_po, testing_po = po.randomSplit([train_perc, 1 - train_perc],
                                             seed=100)
    training_ne, testing_ne = ne.randomSplit([train_perc, 1 - train_perc],
                                             seed=100)
    training = training_po.union(training_ne)
    training = resample(training,
                        multiplier_minority=multiplier_minority,
                        fraction_majority=fraction_majority)
    testing = testing_po.union(testing_ne)

    # Fit the scaler on the training set only, then apply it to both splits:
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                            withStd=True, withMean=False)
    scale_model = scaler.fit(training)
    training = scale_model.transform(training)
    testing = scale_model.transform(testing)
    return training, testing
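# Usage sketch for generate_train_test (assumptions: `df` has an assembled
# "features" column and a binary "label" column; resample() is defined
# elsewhere in this module, and the resampling arguments are illustrative):
def _example_generate_train_test(df):
    training, testing = generate_train_test(df, multiplier_minority=3,
                                            fraction_majority=0.5)
    return training, testing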
def __init__(self, file_name, spark_context, smoothing=1.0,
             modelType="multinomial"):
    self.sqlContext = SQLContext(spark_context)
    self.spark_context = spark_context
    self.data = self.sqlContext.read.options(header='true',
                                             inferschema='true',
                                             delimiter=',').csv(file_name)
    self.data.cache()
    self.settings = [('smoothing', smoothing), ('modelType', modelType)]
    # `features` is expected to be a module-level list of feature column names.
    self.lr_data = self.data.select(col("Class").alias("label"), *features)
    vectorAssembler = VectorAssembler(inputCols=features,
                                      outputCol="unscaled_features")
    standardScaler = StandardScaler(inputCol="unscaled_features",
                                    outputCol="features")
    self.nb = NaiveBayes(smoothing=smoothing, modelType=modelType)
    stages = [vectorAssembler, standardScaler, self.nb]
    pipeline = Pipeline(stages=stages)
    self.model = pipeline.fit(self.lr_data)
def main(spark, data_file, model_file):
    '''Main routine for unsupervised training

    Parameters
    ----------
    spark : SparkSession object
    data_file : string, path to the parquet file to load
    model_file : string, path to store the serialized model file
    '''
    # Load the dataframe
    df = spark.read.parquet(data_file)

    # Select the 20 attribute columns labeled mfcc_00, mfcc_01, ..., mfcc_19
    features = [c for c in df.columns if c[:4] == 'mfcc']
    assembler = VectorAssembler(inputCols=features, outputCol='features')
    standardScaler = StandardScaler(inputCol="features",
                                    outputCol="features_scaled")
    kmeans = KMeans().setK(100).setFeaturesCol("features_scaled")

    # Build the pipeline with the assembler, standardScaler, and KMeans stages
    pipeline = Pipeline(stages=[assembler, standardScaler, kmeans])
    model = pipeline.fit(df)
    result = model.transform(df)
    model.write().overwrite().save(model_file)
def spark_data_flow():
    input_df = spark.read.parquet(
        "{path}/p2p_feature_merge/{version}".format(
            path=IN_PAHT, version=RELATION_VERSION))

    # get_vectors yields tuples; name the resulting columns directly
    tid_vector_df = input_df.rdd.map(get_vectors).toDF(
        ['features', 'bbd_qyxx_id', 'company_name',
         'platform_name', 'platform_state'])

    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                            withStd=True, withMean=True)

    # Compute summary statistics by fitting the StandardScaler
    scalerModel = scaler.fit(tid_vector_df)

    # Standardize each feature to zero mean and unit standard deviation.
    scaled_df = scalerModel.transform(tid_vector_df)
    return scaled_df
def vectorize_data(training_data, test_data):
    # Assemble the feature vectors
    input_columns = training_data.columns
    input_columns.remove(TARGET)
    print("Using these features: {}".format(input_columns))
    vector_assembler = VectorAssembler(inputCols=input_columns,
                                       outputCol='features')
    train_df = vector_assembler.transform(training_data)

    # Standardize the data, fitting the scaler on the training set only
    scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures',
                            withStd=True, withMean=True).fit(train_df)
    train_df = scaler.transform(train_df)

    # Select the columns needed
    train_df = train_df.select(['scaledFeatures', TARGET])

    new_test_data = dict()
    for company in test_data:
        company_data = test_data[company]
        test_df = vector_assembler.transform(company_data)
        test_df = scaler.transform(test_df)
        test_df = test_df.select(['scaledFeatures', TARGET])
        new_test_data[company] = test_df
    return train_df, new_test_data
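# Usage sketch for vectorize_data (assumptions: TARGET is defined at module
# level and every test frame shares the training schema; the company names
# are illustrative). Because the scaler is fit on the training data only,
# every company's test frame is standardized with training statistics.
def _example_vectorize_data(training_data, acme_df, umbrella_df):
    return vectorize_data(training_data,
                          {"acme": acme_df, "umbrella": umbrella_df})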
def fit_eval_model(data, classifier, seed=0):
    train, test = split_data(data, seed)
    scaler = StandardScaler(inputCol='NumFeatures', outputCol='features')
    pipeline = create_pipeline(data, classifier, scaler)
    model = pipeline.fit(train)
    calc_metrics(data, model, test)
    return model
def Featurizer(categorical):
    # Standardize the assembled numerical column
    normalizer = StandardScaler(inputCol='numerical',
                                outputCol='numerical_norm')
    # Index then one-hot encode each categorical column
    label_encoder = [StringIndexer(inputCol=col, outputCol=col + '_label',
                                   handleInvalid='keep')
                     for col in categorical]
    one_hot_encoder = OneHotEncoderEstimator(
        inputCols=[c + '_label' for c in categorical],
        outputCols=[c + '_encod' for c in categorical],
        handleInvalid='keep')
    assemblerInputs = ['numerical_norm'] + [c + '_encod' for c in categorical]
    assembler = VectorAssembler(inputCols=assemblerInputs,
                                outputCol='features')
    featurizer = Pipeline(stages=[normalizer] + label_encoder +
                          [one_hot_encoder] + [assembler])
    return featurizer
def normalizationBySpark(RDDSparkDF):
    scalerSD = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                              withStd=True, withMean=False)
    # Alternative scalers, kept for reference:
    # scalerMaxMin = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
    # scalerMaxAbs = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")

    # Compute summary statistics by fitting the StandardScaler
    scalerSDModel = scalerSD.fit(RDDSparkDF)

    # Scale each feature to unit standard deviation
    scaledDataSD = scalerSDModel.transform(RDDSparkDF)

    # Flatten the scaled vectors back into columns and join them onto the
    # original rows by KEY
    scaledFeatures_outcome = scaledDataSD.rdd.map(extractRow).toDF(newColumns)
    leftDF = RDDSparkDF.select(col2)
    df = leftDF.join(scaledFeatures_outcome, ["KEY"])
    return df
def build_scaled_features_pipeline(categorical_columns):
    pipeline_stages = []

    # Index the categorical columns
    for col in categorical_columns:
        indexer = StringIndexer(inputCol=col, outputCol=col + '_idx')
        pipeline_stages += [indexer]

    # Build the feature vector
    features = [c + "_idx" for c in categorical_columns]
    assembler = VectorAssembler(inputCols=features, outputCol="feature_vec")
    pipeline_stages += [assembler]

    # Scale the features
    scaler = StandardScaler(inputCol="feature_vec", outputCol="features",
                            withStd=True, withMean=False)
    pipeline_stages += [scaler]

    return Pipeline(stages=pipeline_stages)
def fit(self, sdf):
    """
    Fit the weighted-feature pipeline: assemble, standardize, then reweight
    each feature by its importance.

    :param sdf: Spark DataFrame containing the input columns
    :return: self, with the fitted pipeline stored in self.model
    """
    if self.weighter is None:
        raise NotImplementedError(
            "The weighter parameter has not been defined.")

    weights_arr = self.weighter.get_feature_importances(sdf)
    pipeline_lst = [
        VectorAssembler(inputCols=self.input_cols, outputCol="vec"),
        StandardScaler(inputCol="vec", outputCol="standard_vec"),
        ElementwiseProduct(scalingVec=weights_arr,
                           inputCol='standard_vec',
                           outputCol='scaled_vec')
    ]
    _model = Pipeline(stages=pipeline_lst)
    model = _model.fit(sdf)
    self.model = model
    return self
def lr_history_data(df_buildFeatures):
    category_features = ['org', 'dst', 'isReturn_type', 'isDirect_type',
                         'departYear', 'is_vacation', 'vacation_days',
                         'day_of_vacation']
    stringIndexer_stages = []
    onehotEncoder_stages = []
    for cateIndexer in category_features:
        stringIndexer = StringIndexer(inputCol=cateIndexer,
                                      outputCol='stringIndexer_' + cateIndexer)
        onehotEncoder = OneHotEncoder(inputCol=stringIndexer.getOutputCol(),
                                      outputCol='onehotEncoder_' + cateIndexer,
                                      dropLast=False)
        stringIndexer_stages.append(stringIndexer)
        onehotEncoder_stages.append(onehotEncoder)

    scaler_features = ['departMonth', 'depart_dayofmonth', 'depart_weekofyear',
                       'depart_dayofweek', 'departQuarter', 'intervalDays',
                       'intervalMonths', 'intervalWeeks', 'intervalQuarters',
                       'preceding3day_price']
    scaler_assembler = VectorAssembler(inputCols=scaler_features,
                                       outputCol='scalerAssembler')
    scaler = StandardScaler(inputCol='scalerAssembler',
                            outputCol='scalerFeatures',
                            withMean=True, withStd=True)

    features = ['onehotEncoder_' + cateIndexer
                for cateIndexer in category_features] + ['scalerFeatures']
    features_assembler = VectorAssembler(inputCols=features,
                                         outputCol='features')

    stages = (stringIndexer_stages + onehotEncoder_stages +
              [scaler_assembler, scaler, features_assembler])
    pipeline = Pipeline(stages=stages)
    pipelineModel = pipeline.fit(df_buildFeatures)

    if hdfs.exists('/predict-2019/pipelineModel_' + todayStr1):
        hdfs.rm('/predict-2019/pipelineModel_' + todayStr1)
    pipelineModel.save('hdfs://10.0.1.218:9000/predict-2019/pipelineModel_' +
                       todayStr1)

    lr_historyData = pipelineModel.transform(df_buildFeatures) \
        .select('predictDate', 'departDate', 'flineId_noDate', 'price',
                'features')
    return lr_historyData
def train_new_feature_pipeline(df: DataFrame, degree: int = 3) -> PipelineModel:
    """Create a new feature pipeline and fit to training data

    :param df: raw Iris spark sql data frame
    :type df: DataFrame
    :param degree: degree of polynomial feature expansion
    :type degree: int
    :returns: fitted feature pipeline
    :rtype: PipelineModel
    """
    assembler = VectorAssembler(
        inputCols=[
            "sepal_length_cm",
            "sepal_width_cm",
            "petal_length_cm",
            "petal_width_cm",
        ],
        outputCol="features",
    )
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                            withStd=True, withMean=True)
    polyExpansion = PolynomialExpansion(degree=degree,
                                        inputCol="scaledFeatures",
                                        outputCol="polyFeatures")
    pipeline = Pipeline(stages=[assembler, scaler, polyExpansion])
    pipeline_model = pipeline.fit(df)
    return pipeline_model
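# Usage sketch for train_new_feature_pipeline (assumptions: `train_df` and
# `test_df` are Iris frames with the four *_cm columns above; degree=2 is
# illustrative):
def _example_train_new_feature_pipeline(train_df, test_df):
    feature_model = train_new_feature_pipeline(train_df, degree=2)
    # Reuse the fitted scaling statistics when transforming the test frame
    return (feature_model.transform(train_df),
            feature_model.transform(test_df))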