def get_ml1_pipeline():
    """Build the (unfitted) ML1 feature-engineering pipeline.

    Stages, in order:
      1. Impute missing values in the numerical columns.
      2. String-index, then one-hot encode, each categorical column.
      3. Assemble the imputed numerical columns and min-max scale them.
      4. String-index the "result" column into the conventional "label" column.
      5. Assemble encoded categoricals + scaled numericals into "features".

    Relies on the module-level ``ML1_NUMERICAL_COLUMNS`` and
    ``ML1_CATEGORICAL_COLUMNS`` lists.

    :return: an unfitted ``pyspark.ml.Pipeline``
    """
    stages = []

    # BUG FIX: Spark's Imputer rejects output columns that already exist in
    # the schema ("Output column ... already exists."), so imputing "in
    # place" with outputCols == inputCols fails at fit time. Write the
    # imputed values to new "<col>_imputed" columns instead, and feed those
    # into the numerical assembler below.
    imputed_cols = [c + "_imputed" for c in ML1_NUMERICAL_COLUMNS]
    imputer = Imputer(inputCols=ML1_NUMERICAL_COLUMNS, outputCols=imputed_cols)
    stages.append(imputer)

    ohe_input_cols = []
    ohe_output_cols = []
    for categorical_column in ML1_CATEGORICAL_COLUMNS:
        # 'keep' maps labels unseen during fit to an extra index at
        # transform time instead of raising.
        str_indexer = StringIndexer(inputCol=categorical_column,
                                    outputCol=categorical_column + "_index",
                                    handleInvalid='keep')
        ohe_input_cols.append(str_indexer.getOutputCol())
        ohe_output_cols.append(categorical_column + "_class_vec")
        stages.append(str_indexer)

    # NOTE(review): the indexers use handleInvalid='keep' while the encoder
    # uses 'error' — an unseen-category index produced at transform time
    # will make the encoder fail. Confirm this combination is intended.
    encoder = OneHotEncoderEstimator(inputCols=ohe_input_cols,
                                     outputCols=ohe_output_cols,
                                     handleInvalid="error",
                                     dropLast=False)
    stages.append(encoder)

    # Assemble the imputed numerical columns and scale them to [0, 1].
    numerical_vector_assembler = VectorAssembler(inputCols=imputed_cols,
                                                 outputCol="numerial_cols_vec",
                                                 handleInvalid="keep")
    scaler = MinMaxScaler(inputCol="numerial_cols_vec",
                          outputCol="scaled_numerical_cols")
    stages.append(numerical_vector_assembler)
    stages.append(scaler)

    # Index the target column "result" into the standard "label" column.
    label_str_indexer = StringIndexer(inputCol="result", outputCol="label",
                                      handleInvalid="keep")
    stages.append(label_str_indexer)

    # Final feature vector: one-hot encoded categoricals + scaled numericals.
    assembler_input = encoder.getOutputCols() + [scaler.getOutputCol()]
    assembler = VectorAssembler(inputCols=assembler_input,
                                outputCol="features",
                                handleInvalid="skip")
    stages.append(assembler)

    pipeline = Pipeline(stages=stages)
    return pipeline
def build_pipeline (pipeconfig: dict) -> pyspark.ml.Pipeline:
    '''
    Build a Pipeline instance based on config file
    :param pipeconfig: metadata dictionary
    :return: pyspark.ml.Pipeline
    '''
    # Config sub-sections: column lists, intermediate column names, and
    # classifier hyperparameters.
    variables = pipeconfig['variables']
    metadata = pipeconfig['metadata']
    model_cfg = pipeconfig['model']

    # Stages 1-2: index each categorical column, then one-hot encode the
    # resulting index columns (dropLast=False keeps every category level).
    indexer = StringIndexer(inputCols=variables['categoricals'],
                            outputCols=metadata['index_names'])
    encoder = OneHotEncoder(dropLast=False,
                            inputCols=indexer.getOutputCols(),
                            outputCols=metadata['encoded_names'])

    # Stages 3-4: assemble numericals + encoded categoricals into a single
    # vector, then min-max scale it into the final features column.
    assembler = VectorAssembler(
        inputCols=variables['numericals'] + encoder.getOutputCols(),
        outputCol=metadata['vect_name'])
    scaler = MinMaxScaler(inputCol=assembler.getOutputCol(),
                          outputCol=metadata['feats_name'])

    # Stage 5: gradient-boosted tree classifier configured from the model
    # section of the config.
    classifier = GBTClassifier(featuresCol=scaler.getOutputCol(),
                               labelCol=model_cfg['labelCol'],
                               maxDepth=model_cfg['maxDepth'],
                               maxBins=model_cfg['maxBins'],
                               maxIter=model_cfg['maxIter'],
                               seed=model_cfg['seed'])

    return Pipeline(stages=[indexer, encoder, assembler, scaler, classifier])
# Random-forest hyperparameters.
numTrees = 25
maxDepth = 5
maxBins = 5

# COMMAND ----------

# MAGIC %md
# MAGIC #### 4.2 Initialize the Model
# MAGIC
# MAGIC In this step, we are only giving instructions to the algorithm and chaining it to the feature engineering steps. We will not train the model, yet.

# COMMAND ----------

# Configure (but do not yet fit) the random forest; it consumes the scaled
# feature vector produced by the last feature-engineering stage.
rf = RandomForestClassifier(
    labelCol="conversion",            # Label we are trying to predict
    featuresCol=scaler.getOutputCol(),  # Feature names from last step of the pipeline
    numTrees=numTrees,
    maxDepth=maxDepth,
    maxBins=maxBins,
)

# Chain the feature-engineering stages and the initialized model into a
# single pipeline.
pipeline = Pipeline(stages=[
    discretizer,
    index_pipeline,
    encoder,
    vec_assembler,
    scaler,
    rf,
])

# COMMAND ----------