from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import (MinMaxScaler, OneHotEncoder, StringIndexer,
                                VectorAssembler)


def build_pipeline(pipeconfig: dict) -> Pipeline:
    '''
    Build a Pipeline instance based on a config file
    :param pipeconfig: metadata dictionary
    :return: pyspark.ml.Pipeline
    '''
    # Pipeline metadata
    cats = pipeconfig['variables']['categoricals']
    nums = pipeconfig['variables']['numericals']
    index_names = pipeconfig['metadata']['index_names']
    encoded_names = pipeconfig['metadata']['encoded_names']
    vect_name = pipeconfig['metadata']['vect_name']
    feats_name = pipeconfig['metadata']['feats_name']
    labelcol = pipeconfig['model']['labelCol']
    maxdepth = pipeconfig['model']['maxDepth']
    maxbins = pipeconfig['model']['maxBins']
    maxiter = pipeconfig['model']['maxIter']
    seed = pipeconfig['model']['seed']

    # Build stages: index the categoricals, one-hot encode the indices,
    # assemble numericals plus encodings into one vector, scale it to [0, 1],
    # and finish with a gradient-boosted tree classifier.
    # Note: the multi-column StringIndexer/OneHotEncoder API requires Spark 3.0+.
    stageone = StringIndexer(inputCols=cats, outputCols=index_names)
    stagetwo = OneHotEncoder(dropLast=False,
                             inputCols=stageone.getOutputCols(),
                             outputCols=encoded_names)
    stagethree = VectorAssembler(inputCols=nums + stagetwo.getOutputCols(),
                                 outputCol=vect_name)
    stagefour = MinMaxScaler(inputCol=stagethree.getOutputCol(),
                             outputCol=feats_name)
    stagefive = GBTClassifier(featuresCol=stagefour.getOutputCol(),
                              labelCol=labelcol,
                              maxDepth=maxdepth,
                              maxBins=maxbins,
                              maxIter=maxiter,
                              seed=seed)

    pipeline = Pipeline(stages=[stageone, stagetwo, stagethree,
                                stagefour, stagefive])
    return pipeline
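# A minimal sketch of how build_pipeline might be driven. The config keys
# mirror those read above; the DataFrame `df` and the column names
# ('job', 'marital', 'age', 'income', 'label') are hypothetical
# placeholders, not part of the original code.
example_config = {
    'variables': {'categoricals': ['job', 'marital'],
                  'numericals': ['age', 'income']},
    'metadata': {'index_names': ['job_idx', 'marital_idx'],
                 'encoded_names': ['job_enc', 'marital_enc'],
                 'vect_name': 'raw_features',
                 'feats_name': 'features'},
    'model': {'labelCol': 'label', 'maxDepth': 5, 'maxBins': 32,
              'maxIter': 20, 'seed': 42},
}

pipeline = build_pipeline(example_config)
model = pipeline.fit(df)        # df: a Spark DataFrame with the columns above
scored = model.transform(df)    # adds 'prediction' and 'probability' columns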
from time import time

from cassandra.cluster import Cluster
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import (IndexToString, OneHotEncoder, StandardScaler,
                                StringIndexer, VectorAssembler)


def make_class_model(data, sc, model_path, model_name, target,
                     ml_model='default', save=True):
    t0 = time()

    # Stages for the pipeline
    stages = []

    # Index labels, adding metadata to the label column.
    # Fit on the whole dataset to include all labels in the index.
    targetIndexer = StringIndexer(inputCol="target",
                                  outputCol="indexedTarget",
                                  handleInvalid="keep").fit(data)
    stages += [targetIndexer]

    # Split the data into training and test sets (30% held out for testing).
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Identify categorical and numerical variables by dtype.
    catCols = [
        x for (x, dataType) in trainingData.dtypes
        if (dataType == "string" or dataType == "boolean") and x != "target"
    ]
    numCols = [
        x for (x, dataType) in trainingData.dtypes
        if dataType in ("int", "bigint", "float", "double")
    ]

    # One-hot encode the categorical variables.
    indexers = [
        StringIndexer(inputCol=column, outputCol=column + "-index",
                      handleInvalid="keep") for column in catCols
    ]
    encoder = OneHotEncoder(
        inputCols=[indexer.getOutputCol() for indexer in indexers],
        outputCols=[
            "{0}-encoded".format(indexer.getOutputCol())
            for indexer in indexers
        ])
    assembler_cat = VectorAssembler(inputCols=encoder.getOutputCols(),
                                    outputCol="categorical-features",
                                    handleInvalid="skip")
    stages += indexers
    stages += [encoder, assembler_cat]

    assembler_num = VectorAssembler(inputCols=numCols,
                                    outputCol="numerical-features",
                                    handleInvalid="skip")

    # Standardize the numerical variables.
    scaler = StandardScaler(inputCol="numerical-features",
                            outputCol="numerical-features_scaled")

    # Combine all features into one vector.
    assembler_all = VectorAssembler(
        inputCols=['categorical-features', 'numerical-features_scaled'],
        outputCol='features',
        handleInvalid="skip")
    stages += [assembler_num, scaler, assembler_all]

    # Train a RandomForest model by default, or another specified model.
    if ml_model == 'default':
        rf = RandomForestClassifier(labelCol="indexedTarget",
                                    featuresCol="features",
                                    numTrees=10)
    else:
        rf = ml_model

    # Convert indexed labels back to original labels.
    labelConverter = IndexToString(inputCol="prediction",
                                   outputCol="predictedLabel",
                                   labels=targetIndexer.labels)
    stages += [rf, labelConverter]

    # Chain indexers and forest in a Pipeline.
    pipeline = Pipeline(stages=stages)

    # Train the model. This also runs the indexers.
    model = pipeline.fit(trainingData)

    # Make predictions.
    predictions = model.transform(testData)

    # Select example rows to display.
    # predictions.select("predictedLabel", "target", "features").show(5)

    # Select (prediction, true label) and compute test accuracy.
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedTarget",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Accuracy = %g" % accuracy)

    if save:
        # Save the final model and write its statistics to Cassandra.
        tt = time() - t0
        timestamp = int(time())
        model.write().overwrite().save(model_path)
        cluster = Cluster(['127.0.0.1'], port=9042)
        session = cluster.connect("models")
        query = ("INSERT INTO models_statistics "
                 "(model_name, timestamp, target, learning_time, model_path, stat) "
                 "VALUES (%s, %s, %s, %s, %s, %s)")
        session.execute(
            query,
            (model_name, timestamp, target, tt, model_path, accuracy))
        session.shutdown()
        cluster.shutdown()

    # Stop the Spark session.
    sc.stop()

    if not save:
        return model, sc
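# A minimal sketch of calling make_class_model, assuming a local Cassandra
# node with a models.models_statistics table (only needed when save=True)
# and a label column literally named "target". The CSV path and app name
# are hypothetical.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("make-class-model").getOrCreate()
df = (spark.read.option("header", True).option("inferSchema", True)
      .csv("/data/train.csv"))  # hypothetical input file

# save=True persists the model and its stats, then stops the Spark session;
# save=False skips persistence and returns (model, sc) instead.
make_class_model(df, spark, model_path="/models/rf_default",
                 model_name="rf_default", target="target")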
# In[24]:

from pyspark.ml import Pipeline
from pyspark.ml.feature import (OneHotEncoderEstimator, StringIndexer,
                                VectorAssembler)

# Encode the categorical data
categorical_variables = ['workclass', 'education', 'marital-status',
                         'occupation', 'relationship', 'race', 'gender',
                         'native_country']

indexers = [StringIndexer(inputCol=column, outputCol=column + "-index")
            for column in categorical_variables]

# OneHotEncoderEstimator is the Spark 2.3/2.4 name; in Spark 3.0+ it was
# renamed to OneHotEncoder with the same multi-column API.
encoder = OneHotEncoderEstimator(
    inputCols=[indexer.getOutputCol() for indexer in indexers],
    outputCols=["{0}-encoded".format(indexer.getOutputCol())
                for indexer in indexers]
)

assembler = VectorAssembler(
    inputCols=encoder.getOutputCols(),
    outputCol="categorical-features"
)

# In[25]:

# Create a Pipeline.
pipeline = Pipeline(stages=indexers + [encoder, assembler])
pipelineModel = pipeline.fit(df_remove)
model = pipelineModel.transform(df_remove)

# In[26]:
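# A quick sanity check on the transformed output — a sketch assuming
# df_remove carries the census-style columns listed in cell 24. The column
# names follow the "-index" / "-index-encoded" scheme produced above.
model.select("workclass", "workclass-index", "workclass-index-encoded",
             "categorical-features").show(5, truncate=False)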