def main(inputs, output):
    tmax_schema = types.StructType([
        types.StructField('station', types.StringType()),
        types.StructField('date', types.DateType()),
        types.StructField('latitude', types.FloatType()),
        types.StructField('longitude', types.FloatType()),
        types.StructField('elevation', types.FloatType()),
        types.StructField('tmax', types.FloatType()),
    ])
    data = spark.read.csv(inputs, schema=tmax_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    # join each observation with the previous day's tmax at the same station,
    # then add the day-of-year feature
    sqlTrans = SQLTransformer(statement='SELECT *, dayofyear(date) AS day FROM __THIS__')
    sqlTrans1 = SQLTransformer(statement="""
        SELECT today.station, today.date, today.latitude, today.longitude,
               today.elevation, today.tmax, yesterday.tmax AS yesterday_tmax
        FROM __THIS__ AS today
        INNER JOIN __THIS__ AS yesterday
            ON date_sub(today.date, 1) = yesterday.date
            AND today.station = yesterday.station""")
    assemble_features = VectorAssembler(
        inputCols=['latitude', 'longitude', 'elevation', 'day', 'yesterday_tmax'],
        outputCol='features')
    gbt = GBTRegressor(featuresCol='features', labelCol='tmax')
    pipeline = Pipeline(stages=[sqlTrans1, sqlTrans, assemble_features, gbt])

    weather_model = pipeline.fit(train)
    predictions = weather_model.transform(validation)
    # predictions.show()
    evaluator = RegressionEvaluator(labelCol='tmax', predictionCol='prediction',
                                    metricName='rmse')
    score = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % score)
    weather_model.write().overwrite().save(output)
def main(spark):
    train_file = 'hdfs:/user/bm106/pub/project/cf_train.parquet'
    cf_train = spark.read.parquet(train_file)
    valid_file = 'hdfs:/user/bm106/pub/project/cf_validation.parquet'
    cf_valid = spark.read.parquet(valid_file)
    test_file = 'hdfs:/user/bm106/pub/project/cf_test.parquet'
    cf_test = spark.read.parquet(test_file)

    indexer1 = StringIndexer(inputCol="user_id", outputCol="user_index", handleInvalid='skip')
    indexer2 = StringIndexer(inputCol="track_id", outputCol="item_index", handleInvalid='skip')
    logTrans = SQLTransformer(
        statement='SELECT user_index, item_index, log10(count + 1) AS count FROM __THIS__')
    dropTrans = SQLTransformer(statement='SELECT * FROM __THIS__ WHERE count > 1')

    pipeline1 = Pipeline(stages=[indexer1, indexer2])
    pipeline2 = Pipeline(stages=[indexer1, indexer2, logTrans])
    pipeline3 = Pipeline(stages=[indexer1, indexer2, dropTrans])
    pre1 = pipeline1.fit(cf_train)
    pre2 = pipeline2.fit(cf_train)
    pre3 = pipeline3.fit(cf_train)

    pre1.transform(cf_train).write.parquet('hdfs:/user/sc6995/cf_train_index.parquet')
    pre1.transform(cf_valid).write.parquet('hdfs:/user/sc6995/cf_valid_index.parquet')
    pre1.transform(cf_test).write.parquet('hdfs:/user/sc6995/cf_test_index.parquet')
    pre2.transform(cf_train).write.parquet('hdfs:/user/sc6995/cf_train_logtrans.parquet')
    pre2.transform(cf_valid).write.parquet('hdfs:/user/sc6995/cf_valid_logtrans.parquet')
    pre2.transform(cf_test).write.parquet('hdfs:/user/sc6995/cf_test_logtrans.parquet')
    pre3.transform(cf_train).write.parquet('hdfs:/user/sc6995/cf_train_droptrans1.parquet')
def main(inputs, model_file):
    data = spark.read.csv(inputs, schema=schema())
    train, validation = data.randomSplit([0.75, 0.25], seed=42)

    sql_transformer1 = SQLTransformer(statement=yes_tmax())
    sql_transformer2 = SQLTransformer(statement=ret_query())
    assemble_features = VectorAssembler(
        inputCols=['latitude', 'longitude', 'elevation', 'dayofyear', 'yesterday_tmax'],
        outputCol='features')
    regressor = GBTRegressor(featuresCol='features', labelCol='tmax')
    pipeline = Pipeline(stages=[sql_transformer1, sql_transformer2,
                                assemble_features, regressor])

    model = pipeline.fit(train)
    predictions = model.transform(validation)
    model.write().overwrite().save(model_file)

    r2_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='tmax',
                                       metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)
    rmse_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='tmax',
                                         metricName='rmse')
    rmse = rmse_evaluator.evaluate(predictions)
    print("R-squared value : " + str(r2))
    print("RMSE value : " + str(rmse))
def main_A(inputs):
    data = spark.read.option('encoding', 'UTF-8').csv(inputs, schema=tmax_schema)

    ################ FEATURE ENGINEERING: add yesterday tmax #####################
    if USE_YTD_TEMP_FEATURE:
        # keep today.station in the projection: the plotting code below needs it
        syntax = """SELECT today.station, today.latitude, today.longitude,
            today.elevation, today.date, today.tmax,
            yesterday.tmax AS yesterday_tmax
            FROM __THIS__ AS today
            INNER JOIN __THIS__ AS yesterday
                ON date_sub(today.date, 1) = yesterday.date
                AND today.station = yesterday.station"""
        sql_trans = SQLTransformer(statement=syntax)
        # assign back to `data` so the transform is not discarded below
        data = sql_trans.transform(data)
    #############################################################################

    df = data.withColumn('day_of_year', fn.dayofyear('date'))
    df = df.withColumn('year', fn.year('date'))
    df_long_lat = df[['station', 'longitude', 'latitude', 'tmax', 'year']].toPandas()
    count_year = df_long_lat['year'].value_counts().to_dict()

    # SELECT YEAR and DURATION
    YEAR_SELECTED = 2000
    YEAR_DURATION = 20
    df_long_lat = df_long_lat.loc[(df_long_lat['year'] > YEAR_SELECTED) &
                                  (df_long_lat['year'] < YEAR_SELECTED + YEAR_DURATION)]

    # UNCLUSTER plot by finding avg temperature (groupby same station and year)
    df_long_lat['avg_temp'] = df_long_lat.groupby(['station', 'year'])['tmax'].transform('mean')
    df_long_lat.drop_duplicates(subset=['station', 'year'], inplace=True)
    print(df_long_lat)

    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    geometry = [Point(xy) for xy in zip(df_long_lat['longitude'], df_long_lat['latitude'])]
    df_long_lat = df_long_lat.drop(['longitude', 'latitude'], axis=1)
    crs = {'init': 'epsg:4326'}
    gdf = GeoDataFrame(df_long_lat, crs=crs, geometry=geometry)

    base = world.plot(color='white', edgecolor='black', figsize=(20, 12))
    gdf.plot(column='avg_temp', ax=base, marker='o', cmap='jet', markersize=15,
             legend=True,
             legend_kwds={'label': "Temperature in Celsius",
                          'orientation': "horizontal"})
    plt.title('Distribution of Temperature between ' + str(YEAR_SELECTED) +
              " and " + str(YEAR_SELECTED + YEAR_DURATION))
    plt.savefig(inputs + "_" + str(YEAR_SELECTED) + "-" +
                str(YEAR_SELECTED + YEAR_DURATION))
def main(inputs):
    data = spark.read.csv(inputs, schema=colour_schema)
    train, validation = data.randomSplit([0.75, 0.25])  # use a seed here for reproducibility
    train = train.cache()
    validation = validation.cache()

    # creating a pipeline to predict RGB colours -> word
    rgb_assembler = VectorAssembler(inputCols=['R', 'G', 'B'], outputCol="features")
    word_indexer = StringIndexer(inputCol="word", outputCol="target",
                                 handleInvalid="error", stringOrderType="frequencyDesc")
    classifier = MultilayerPerceptronClassifier(featuresCol="features",
                                                labelCol="target", layers=[3, 25, 25])
    rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    rgb_model = rgb_pipeline.fit(train)

    # creating an evaluator and scoring the validation data
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                  labelCol="target")
    rgb_validation = rgb_model.transform(validation)
    score = evaluator.evaluate(rgb_validation, {evaluator.metricName: "accuracy"})
    print('Validation score for RGB model: %g' % (score, ))
    plot_predictions(rgb_model, 'RGB', labelCol='target')

    # creating a pipeline to predict LAB colours -> word; train and evaluate.
    # The SQLTransformer is a pipeline stage, so it must be fit on the *raw* RGB
    # data: pre-transforming the data and keeping the transformer in the pipeline
    # would apply the RGB -> LAB conversion twice (and fail, since R/G/B are
    # dropped by the first application).
    rgb_to_lab_query = rgb2lab_query(passthrough_columns=['word'])
    sqlTrans = SQLTransformer(statement=rgb_to_lab_query)
    ltrain, lvalidation = data.randomSplit([0.75, 0.25])
    lrgb_assembler = VectorAssembler(inputCols=['labL', 'labA', 'labB'], outputCol="LAB")
    lword_indexer = StringIndexer(inputCol="word", outputCol="labTarget",
                                  handleInvalid="error", stringOrderType="frequencyDesc")
    lclassifier = MultilayerPerceptronClassifier(featuresCol="LAB",
                                                 labelCol="labTarget", layers=[3, 25, 25])
    lrgb_pipeline = Pipeline(stages=[sqlTrans, lrgb_assembler, lword_indexer, lclassifier])
    lrgb_model = lrgb_pipeline.fit(ltrain)

    lrgb_validation = lrgb_model.transform(lvalidation)
    lrgb_validation.show()
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                  labelCol="labTarget")
    lscore = evaluator.evaluate(lrgb_validation, {evaluator.metricName: "accuracy"})
    print('Validation score for LAB model: %g' % (lscore, ))
    plot_predictions(lrgb_model, 'LAB', labelCol='word')
def sql_transformer_usecase():
    """
    Transform data features with a SQL statement.
    "__THIS__" stands for the dataset backing the input DataFrame.
    """
    spark = getSparkSession()
    df = spark.createDataFrame([(0, 1.0, 3.0), (2, 2.0, 5.0)], ["id", "v1", "v2"])
    sqlTrans = SQLTransformer(
        statement="SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
    sqlTrans.transform(df).show(truncate=False)
def query(self, sql_expression):
    """
    Implements the transformations which are defined by a SQL statement.
    Currently we only support SQL syntax like "SELECT ... FROM __THIS__ ...",
    where "__THIS__" represents the underlying table of the input dataframe.

    :param self: Spark Dataframe
    :param sql_expression: SQL expression.
    :return: Dataframe with columns changed by SQL statement.
    """
    sql_transformer = SQLTransformer(statement=sql_expression)
    return sql_transformer.transform(self)
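# A minimal usage sketch for the helper above. The docstring says `self` is a
# Spark DataFrame, so this assumes the function is attached to DataFrame as a
# method; the monkey-patching line and the sample data are illustrative
# assumptions, not part of the original code.
from pyspark.sql import DataFrame

DataFrame.query = query  # assumption: bind the helper as a DataFrame method

df = spark.createDataFrame([(1, 2.0), (2, 3.0)], ["id", "v"])
df.query("SELECT *, v * 2 AS v_doubled FROM __THIS__").show()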
def main(inputs, model_file):
    data = spark.read.option('encoding', 'UTF-8').csv(inputs, schema=tmax_schema)

    ################ FEATURE ENGINEERING: add yesterday tmax #####################
    if USE_YTD_TEMP_FEATURE:
        syntax = """SELECT today.latitude, today.longitude, today.elevation,
            today.date, today.tmax, yesterday.tmax AS yesterday_tmax
            FROM __THIS__ AS today
            INNER JOIN __THIS__ AS yesterday
                ON date_sub(today.date, 1) = yesterday.date
                AND today.station = yesterday.station"""
        sql_trans = SQLTransformer(statement=syntax)
        data = sql_trans.transform(data)
    #############################################################################

    data = data.withColumn('day_of_year', fn.dayofyear('date'))
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    if USE_YTD_TEMP_FEATURE:
        train_feature_assembler = VectorAssembler(
            inputCols=['yesterday_tmax', 'day_of_year', 'latitude', 'longitude', 'elevation'],
            outputCol='features')
    else:
        train_feature_assembler = VectorAssembler(
            inputCols=['day_of_year', 'latitude', 'longitude', 'elevation'],
            outputCol='features')

    ############# DIFFERENT ML ALGORITHMS TO BE USED ####################
    # classifier = GeneralizedLinearRegression(featuresCol='features', labelCol='tmax')
    # classifier = GBTRegressor(maxDepth=5, featuresCol='features', labelCol='tmax')
    classifier = RandomForestRegressor(numTrees=7, maxDepth=8,
                                       featuresCol='features', labelCol='tmax')
    #####################################################################

    train_pipeline = Pipeline(stages=[train_feature_assembler, classifier])
    weather_model = train_pipeline.fit(train)
    prediction = weather_model.transform(validation)
    # print(prediction.show())
    evaluator = RegressionEvaluator(predictionCol="prediction", labelCol='tmax',
                                    metricName='r2')  # or 'rmse'
    score = evaluator.evaluate(prediction)
    print('Validation score for weather model: %g' % (score, ))
    weather_model.write().overwrite().save(model_file)
def main(inputs):
    data = spark.read.csv(inputs, schema=colour_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    # create a pipeline to predict RGB colours -> word
    rgb_assembler = VectorAssembler(inputCols=['R', 'G', 'B'], outputCol='features')
    word_indexer = StringIndexer(inputCol='word', outputCol='new_word')
    classifier = MultilayerPerceptronClassifier(labelCol="new_word", layers=[3, 30, 11])
    rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    rgb_model = rgb_pipeline.fit(train)

    # create an evaluator and score the validation data
    rgb_validation = rgb_model.transform(validation)
    # rgb_validation.show()
    plot_predictions(rgb_model, 'RGB', labelCol='word')
    vali_evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                       labelCol='new_word')
    score = vali_evaluator.evaluate(rgb_validation)
    print('Validation score for RGB model: %g' % (score, ))

    # create a pipeline RGB colours -> LAB colours -> word; train and evaluate.
    rgb_to_lab_query = rgb2lab_query(passthrough_columns=['word'])
    sql_transformer = SQLTransformer(statement=rgb_to_lab_query)
    new_assembler = VectorAssembler(inputCols=['labL', 'labA', 'labB'], outputCol='features')
    new_pipeline = Pipeline(stages=[sql_transformer, new_assembler, word_indexer, classifier])
    # fit on the raw training data: the SQLTransformer inside the pipeline does
    # the RGB -> LAB conversion, so pre-transforming the input would apply it twice
    new_model = new_pipeline.fit(train)
    new_validation = new_model.transform(validation)
    # new_validation.show()
    new_vali_evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                           labelCol='new_word')
    new_score = new_vali_evaluator.evaluate(new_validation)
    print('Validation score for LAB model:', new_score)
    plot_predictions(new_model, 'LAB', labelCol="word")
def main(inputs):
    data = spark.read.csv(inputs, schema=colour_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    word_indexer = StringIndexer(inputCol="word", outputCol="labelCol", handleInvalid='error')
    classifier = MultilayerPerceptronClassifier(maxIter=400, layers=[3, 30, 11],
                                                blockSize=1, seed=123, labelCol="labelCol")

    # create a pipeline to predict RGB colours -> word
    rgb_assembler = VectorAssembler(inputCols=['R', 'G', 'B'], outputCol="features")
    rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    rgb_model = rgb_pipeline.fit(train)

    # create an evaluator and score the validation data
    evaluator = MulticlassClassificationEvaluator(labelCol="labelCol",
                                                  predictionCol="prediction")
    predictions = rgb_model.transform(validation)
    score = evaluator.evaluate(predictions)
    plot_predictions(rgb_model, 'RGB', labelCol='word')
    print('Validation score for RGB model: %g' % (score, ))

    # create a pipeline to predict LAB colours -> word; train and evaluate.
    rgb_to_lab_query = rgb2lab_query(passthrough_columns=["word"])
    sqlTrans = SQLTransformer(statement=rgb_to_lab_query)
    lab_assembler = VectorAssembler(inputCols=['labL', 'labA', 'labB'], outputCol="features")
    lab_pipeline = Pipeline(stages=[sqlTrans, lab_assembler, word_indexer, classifier])
    lab_model = lab_pipeline.fit(train)
    predictions_lab = lab_model.transform(validation)
    score_lab = evaluator.evaluate(predictions_lab)
    plot_predictions(lab_model, 'LAB', labelCol='word')
    print('Validation score for LAB model: %g' % (score_lab, ))
def main(inputs, model_file):
    sensor_data_df = spark.read.format("org.apache.spark.sql.cassandra") \
        .options(table=sensor_data_table, keyspace=keyspace).load()

    # creating a ML pipeline
    sensor_data_df = sensor_data_df.select(
        sensor_data_df['datetime'], sensor_data_df['latitude'],
        sensor_data_df['longitude'], sensor_data_df['message_code_id'],
        sensor_data_df['sensor_reading'], sensor_data_df['sensor_name']
    ).orderBy(sensor_data_df['datetime'].asc())
    train_set, validation_set = sensor_data_df.randomSplit([0.75, 0.25])
    train_set.cache()
    validation_set.cache()

    # note the trailing space before the continuation: without it the two
    # string literals concatenate into "...message_code_idFROM __THIS__"
    sql_transformer_statement = "SELECT latitude, longitude, sensor_name, sensor_reading, message_code_id " \
                                "FROM __THIS__"
    sql_transformer = SQLTransformer(statement=sql_transformer_statement)
    # sensor_name is a string column, so it must be indexed before it can be
    # assembled into the numeric feature vector
    sensor_name_indexer = StringIndexer(inputCol='sensor_name',
                                        outputCol='sensor_name_index')
    assemble_features = VectorAssembler(
        inputCols=['latitude', 'longitude', 'sensor_name_index', 'sensor_reading'],
        outputCol='features')
    classifier = GBTRegressor(featuresCol='features', labelCol='message_code_id')
    pipeline = Pipeline(stages=[sql_transformer, sensor_name_indexer,
                                assemble_features, classifier])
    model = pipeline.fit(train_set)

    predictions = model.transform(validation_set)
    predictions.show()
    r2_evaluator = RegressionEvaluator(predictionCol='prediction',
                                       labelCol='message_code_id', metricName='r2')
    rmse_evaluator = RegressionEvaluator(predictionCol='prediction',
                                         labelCol='message_code_id', metricName='rmse')
    r2_score = r2_evaluator.evaluate(predictions)
    rmse_score = rmse_evaluator.evaluate(predictions)
    print('r2 validation score : ', r2_score)
    print('rmse validation score: ', rmse_score)
def make_weather_trainers(trainRatio, estimator_gridbuilders, metricName=None):
    """Construct a list of TrainValidationSplit estimators for weather data,
    where `estimator_gridbuilders` is a list of (Estimator, ParamGridBuilder)
    tuples and 0 < `trainRatio` <= 1 determines the fraction of rows used
    for training. The RegressionEvaluator will use a non-default `metricName`,
    if specified.
    """
    feature_cols = ['latitude', 'longitude', 'elevation', 'doy']
    column_names = dict(featuresCol="features",
                        labelCol="tmax",
                        predictionCol="tmax_pred")
    getDOY = doy_query()
    sqlTrans = SQLTransformer(statement=getDOY)
    feature_assembler = VectorAssembler(inputCols=feature_cols,
                                        outputCol=column_names["featuresCol"])
    ev = (RegressionEvaluator()
          .setLabelCol(column_names["labelCol"])
          .setPredictionCol(column_names["predictionCol"]))
    if metricName:
        ev = ev.setMetricName(metricName)
    tvs_list = []
    for est, pgb in estimator_gridbuilders:
        est = est.setParams(**column_names)
        pl = Pipeline(stages=[sqlTrans, feature_assembler, est])
        paramGrid = pgb.build()
        tvs_list.append(TrainValidationSplit(estimator=pl,
                                             estimatorParamMaps=paramGrid,
                                             evaluator=ev,
                                             trainRatio=trainRatio))
    return tvs_list
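# A minimal usage sketch for make_weather_trainers(). The estimator, grid
# values, and `data` below are illustrative assumptions; only the function
# itself comes from the code above. `data` must expose the tmax schema columns
# that doy_query() expects.
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.tuning import ParamGridBuilder

gbt = GBTRegressor()
pgb = (ParamGridBuilder()
       .addGrid(gbt.maxDepth, [3, 5])
       .addGrid(gbt.maxIter, [20, 50]))
trainers = make_weather_trainers(trainRatio=0.8,
                                 estimator_gridbuilders=[(gbt, pgb)],
                                 metricName='rmse')
models = [tvs.fit(data) for tvs in trainers]  # one best model per estimator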
def runTest(self):
    document_assembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")
    sentence_detector = SentenceDetector() \
        .setInputCols(["document"]) \
        .setOutputCol("sentence")
    tokenizer = Tokenizer() \
        .setInputCols(["sentence"]) \
        .setOutputCol("token")
    glove = WordEmbeddingsModel.pretrained() \
        .setInputCols(["sentence", "token"]) \
        .setOutputCol("embeddings")
    sentence_embeddings = SentenceEmbeddings() \
        .setInputCols(["sentence", "embeddings"]) \
        .setOutputCol("sentence_embeddings") \
        .setPoolingStrategy("AVERAGE")
    embeddings_finisher = EmbeddingsFinisher() \
        .setInputCols("sentence_embeddings") \
        .setOutputCols("sentence_embeddings_vectors") \
        .setOutputAsVector(True)
    explode_vectors = SQLTransformer(
        statement="SELECT EXPLODE(sentence_embeddings_vectors) AS features, * FROM __THIS__")
    kmeans = KMeans().setK(2).setSeed(1).setFeaturesCol("features")
    pipeline = Pipeline(stages=[
        document_assembler, sentence_detector, tokenizer, glove,
        sentence_embeddings, embeddings_finisher, explode_vectors, kmeans
    ])
    model = pipeline.fit(self.data)
    model.transform(self.data).show()
def train_model(model_file, inputs):
    # get the data
    train_tmax = spark.read.csv(inputs, schema=tmax_schema)
    train, validation = train_tmax.randomSplit([0.75, 0.25], seed=110)

    # query = "SELECT station, date, dayofyear(date) AS doy, latitude, longitude, elevation, tmax FROM __THIS__"
    query = """SELECT today.station, dayofyear(today.date) AS doy, today.latitude,
        today.longitude, today.elevation, today.tmax,
        yesterday.tmax AS yesterday_tmax
        FROM __THIS__ AS today
        INNER JOIN __THIS__ AS yesterday
            ON date_sub(today.date, 1) = yesterday.date
            AND today.station = yesterday.station"""
    # weather_assembler = VectorAssembler(inputCols=['latitude', 'longitude', 'elevation', 'doy'], outputCol="features")
    weather_assembler = VectorAssembler(
        inputCols=['latitude', 'longitude', 'elevation', 'doy', 'yesterday_tmax'],
        outputCol="features")
    regressor = GBTRegressor(maxIter=50, maxDepth=5, featuresCol="features", labelCol="tmax")
    transquery = SQLTransformer(statement=query)
    pipeline = Pipeline(stages=[transquery, weather_assembler, regressor])
    model = pipeline.fit(train)
    model.write().overwrite().save(model_file)

    # use the model to make predictions
    predictions = model.transform(validation)
    # predictions.show()

    # evaluate the predictions
    r2_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='tmax',
                                       metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)
    rmse_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='tmax',
                                         metricName='rmse')
    rmse = rmse_evaluator.evaluate(predictions)
    print('r2 =', r2)
    print('rmse =', rmse)
def main(inputs, model_file):
    data = spark.read.csv(inputs, schema=tmax_schema)

    # wthr_query = """SELECT dayofyear(date) AS dayofyr, latitude, longitude, elevation, tmax FROM __THIS__"""
    wthr_query = """SELECT dayofyear(today.date) AS dayofyr, today.latitude,
        today.longitude, today.elevation, today.tmax,
        yesterday.tmax AS yesterday_tmax
        FROM __THIS__ AS today
        INNER JOIN __THIS__ AS yesterday
            ON date_sub(today.date, 1) = yesterday.date
            AND today.station = yesterday.station"""
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    # define the assembler and regressor
    assembler = VectorAssembler(
        inputCols=["latitude", "longitude", "elevation", "dayofyr"],
        outputCol="features")
    regressor = RandomForestRegressor(maxDepth=10, minInstancesPerNode=2,
                                      minInfoGain=0.5, labelCol="tmax")
    trans_query = SQLTransformer(statement=wthr_query)

    # define pipeline and model
    wthr_pipeline = Pipeline(stages=[trans_query, assembler, regressor])
    wthr_model = wthr_pipeline.fit(train)

    # define the regression evaluator
    evaluator = RegressionEvaluator(labelCol="tmax", predictionCol="prediction")
    predictions = wthr_model.transform(validation)
    err = evaluator.evaluate(predictions)
    wthr_model.write().overwrite().save(model_file)
    print('Root Mean Square Error (RMSE): ' + str(err))
def test_simple_csv_loader_pipeline(spark_session: SparkSession) -> None:
    # Arrange
    data_dir: Path = Path(__file__).parent.joinpath('./')
    flights_path: str = f"file://{data_dir.joinpath('flights.csv')}"
    schema = StructType([])
    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema)

    # noinspection SqlDialectInspection,SqlNoDataSourceInspection
    spark_session.sql("DROP TABLE IF EXISTS default.flights")

    # Act
    # parameters = Dict[str, Any]({
    # })
    stages: List[Union[Estimator, Transformer]] = [
        FrameworkCsvLoader(
            view="flights",
            path_to_csv=flights_path
        ),
        SQLTransformer(statement="SELECT * FROM flights"),
    ]
    pipeline: Pipeline = Pipeline(stages=stages)
    transformer = pipeline.fit(df)
    result_df: DataFrame = transformer.transform(df)

    # Assert
    result_df.show()
    assert result_df.count() > 0
def get_glm_explain_stages(predictions_view: str,
                           coefficients_view: str,
                           label_column: str,
                           family: str = 'tweedie',
                           link: str = 'identity',
                           variance_power: float = 0.0,
                           link_power: float = 1.0) -> List:
    link_function_type = resolve_link_function(family, link, link_power)
    stages = [
        OneHotDecoder(oheSuffix="_OHE", idxSuffix="_IDX", unknownSuffix="Unknown"),
        SQLTransformer(
            statement=f"CREATE OR REPLACE TEMPORARY VIEW {predictions_view} AS SELECT * from __THIS__"
        ),
        GLMExplainTransformer(predictionView=predictions_view,
                              coefficientView=coefficients_view,
                              linkFunctionType=link_function_type,
                              label=label_column,
                              nested=True,
                              calculateSum=True,
                              family=family,
                              variancePower=variance_power,
                              linkPower=link_power)
    ]
    return stages
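# A minimal usage sketch for get_glm_explain_stages(). The view names, label
# column, and `scored_df` are illustrative assumptions; OneHotDecoder and
# GLMExplainTransformer are the custom transformers referenced above, so their
# runtime behaviour depends on that library.
from pyspark.ml import Pipeline

explain_stages = get_glm_explain_stages(predictions_view="glm_predictions",
                                        coefficients_view="glm_coefficients",
                                        label_column="claim_amount",
                                        family="tweedie",
                                        variance_power=1.5)
explain_pipeline = Pipeline(stages=explain_stages)
explained_df = explain_pipeline.fit(scored_df).transform(scored_df)  # scored_df: GLM output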
def main(inputs):
    data = spark.read.csv(inputs, schema=colour_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    # TODO: create a pipeline to predict RGB colours -> word
    rgb_assembler = VectorAssembler(inputCols=['R', 'G', 'B'], outputCol='features')
    word_indexer = StringIndexer(inputCol='word', outputCol='label')
    classifier = MultilayerPerceptronClassifier(layers=[3, 30, 11])
    rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    rgb_model = rgb_pipeline.fit(train)

    # TODO: create an evaluator and score the validation data
    predictions = rgb_model.transform(validation)
    rgb_evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                      labelCol='label', metricName='f1')
    score = rgb_evaluator.evaluate(predictions)
    plot_predictions(rgb_model, 'RGB', labelCol='word')
    print('Validation score for RGB model: %g' % (score, ))

    rgb_to_lab_query = rgb2lab_query(passthrough_columns=['word'])

    # TODO: create a pipeline RGB colours -> LAB colours -> word; train and evaluate.
    lab = SQLTransformer(statement=rgb_to_lab_query)
    lab_assembler = VectorAssembler(inputCols=['labL', 'labA', 'labB'], outputCol='features')
    lab_pipeline = Pipeline(stages=[lab, lab_assembler, word_indexer, classifier])
    lab_model = lab_pipeline.fit(train)
    plot_predictions(lab_model, 'LAB', labelCol='word')
    lab_predictions = lab_model.transform(validation)
    lab_score = rgb_evaluator.evaluate(lab_predictions)
    print('Validation score for LAB model:', lab_score)
def model_train(input, model_path):
    tmax_schema = types.StructType([
        types.StructField('station', types.StringType()),
        types.StructField('date', types.DateType()),
        types.StructField('latitude', types.FloatType()),
        types.StructField('longitude', types.FloatType()),
        types.StructField('elevation', types.FloatType()),
        types.StructField('tmax', types.FloatType()),
    ])
    data = spark.read.csv(input, schema=tmax_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    sql_query = """SELECT today.latitude, today.longitude, today.elevation,
        dayofyear(today.date) AS dy, yesterday.tmax AS yesterday_tmax, today.tmax
        FROM __THIS__ AS today
        INNER JOIN __THIS__ AS yesterday
            ON date_sub(today.date, 1) = yesterday.date
            AND today.station = yesterday.station"""
    transformer = SQLTransformer(statement=sql_query)
    assemble_features = VectorAssembler(
        inputCols=['latitude', 'longitude', 'elevation', 'dy', 'yesterday_tmax'],
        outputCol='features')
    regressor = DecisionTreeRegressor(featuresCol='features', labelCol='tmax')
    weather_pipeline = Pipeline(stages=[transformer, assemble_features, regressor])
    model = weather_pipeline.fit(train)
    model.write().overwrite().save(model_path)

    # scoring the model
    prediction = model.transform(validation)
    evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='tmax',
                                    metricName='rmse')
    score = evaluator.evaluate(prediction)
    print("Score of the weather model is", score)
def add_data_cleaner():
    '''
    OUTPUT:
        stages - (list) list of transformers to be used as the 'stages'
                 argument of the pyspark Pipeline() constructor

    DESCRIPTION:
        This is a subroutine of the create_preprocess_pipeline() function.
        Stages added by this function clean the raw pyspark dataframe for
        the next steps.
    '''
    stages = []  # pipeline stage list

    # filter rows with userId==Null or sessionId==Null, just in case
    sqlTrans = SQLTransformer(statement="SELECT * \
                                         FROM __THIS__ \
                                         WHERE userId IS NOT NULL \
                                         AND sessionId IS NOT NULL")
    stages.append(sqlTrans)

    # drop empty user id rows
    sqlTrans = SQLTransformer(statement="SELECT * \
                                         FROM __THIS__ \
                                         WHERE userId != ''")
    stages.append(sqlTrans)

    # drop 'Logged-Out' state and 'Guest' state
    sqlTrans = SQLTransformer(statement="SELECT * \
                                         FROM __THIS__ \
                                         WHERE auth != 'Logged Out' AND auth != 'Guest'")
    stages.append(sqlTrans)

    # exclude rows of users who have one song play or less
    sqlTrans = SQLTransformer(statement=" \
        SELECT * \
        FROM __THIS__ \
        WHERE userId NOT IN ( \
            SELECT DISTINCT userId \
            FROM \
                (SELECT userId, page, \
                        COUNT(CASE WHEN page = 'NextSong' THEN page END) \
                            OVER(PARTITION BY userId) AS songCount \
                 FROM __THIS__) AS user_page_count \
            WHERE user_page_count.songCount < 2)")
    stages.append(sqlTrans)

    return stages
def construct_pipeline():
    feature_columns = [
        "season", "yr", "mnth", "holiday", "weekday", "workingday",
        "weathersit", "temp", "atemp", "hum", "windspeed"
    ]
    sql_transformer = SQLTransformer(statement="""
        SELECT cast(season as int), cast(yr as int), cast(mnth as int),
               cast(holiday as int), cast(weekday as int), cast(workingday as int),
               cast(weathersit as int), cast(temp as double), cast(atemp as double),
               cast(hum as double), cast(windspeed as double),
               cast(cnt as int) as label
        FROM __THIS__
    """)
    # the list contains only feature columns (the label 'cnt' is not in it),
    # so assemble all of them; slicing off the last entry would silently drop
    # 'windspeed' from the feature vector
    assembler = VectorAssembler().setInputCols(feature_columns).setOutputCol("features")
    feature_label_tf = SQLTransformer(statement="""
        SELECT features, label FROM __THIS__
    """)
    pipeline = Pipeline(stages=[
        sql_transformer, assembler, feature_label_tf, LinearRegression()
    ])
    return pipeline
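# A minimal usage sketch for construct_pipeline(). The file name "day.csv" is an
# assumption (the casts above match the UCI bike-sharing dataset's columns);
# only construct_pipeline() comes from the code above.
day_df = spark.read.csv("day.csv", header=True)  # all columns read as strings
pipeline = construct_pipeline()
model = pipeline.fit(day_df)
model.transform(day_df).select("features", "label", "prediction").show(5)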
def main(input, model_file):
    tmax_schema = types.StructType([
        types.StructField('station', types.StringType()),
        types.StructField('date', types.DateType()),
        types.StructField('latitude', types.FloatType()),
        types.StructField('longitude', types.FloatType()),
        types.StructField('elevation', types.FloatType()),
        types.StructField('tmax', types.FloatType()),
    ])
    data = spark.read.csv(input, schema=tmax_schema)
    train, validation = data.randomSplit([0.75, 0.25], seed=123)
    train = train.cache()
    validation = validation.cache()

    y_tmax = SQLTransformer(statement="""
        SELECT today.station, today.latitude, today.longitude, today.elevation,
               today.date, today.tmax, yesterday.tmax AS yesterday_tmax
        FROM __THIS__ AS today
        INNER JOIN __THIS__ AS yesterday
            ON date_sub(today.date, 1) = yesterday.date
            AND today.station = yesterday.station""")
    getvalues = SQLTransformer(statement="""
        SELECT station, latitude, longitude, elevation,
               dayofyear(date) AS dayofyear, tmax, yesterday_tmax
        FROM __THIS__""")
    assemble_features = VectorAssembler(
        inputCols=['latitude', 'longitude', 'elevation', 'dayofyear', 'yesterday_tmax'],
        outputCol='features')
    classifier = GBTRegressor(featuresCol='features', labelCol='tmax')
    pipeline = Pipeline(stages=[y_tmax, getvalues, assemble_features, classifier])
    model = pipeline.fit(train)
    predictions = model.transform(validation)

    r2_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='tmax',
                                       metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)
    print('-----------------------------------')
    print('r2: %g' % (r2, ))
    print('-----------------------------------')
    rmse_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='tmax',
                                         metricName='rmse')
    rmse = rmse_evaluator.evaluate(predictions)
    print('rmse: %g' % (rmse, ))
    model.write().overwrite().save(model_file)
def main(inputs):
    data = spark.read.csv(inputs, header=True, schema=colour_schema)
    lab_query = rgb2lab_query(passthrough_columns=['labelword'])

    # TODO: actually build the components for the pipelines, and the pipelines.
    indexer = StringIndexer(inputCol="labelword", outputCol="labelCol", handleInvalid='error')
    rgb_assembler = VectorAssembler(inputCols=['R', 'G', 'B'], outputCol="features")
    lab_assembler = VectorAssembler(inputCols=['lL', 'lA', 'lB'], outputCol="features")
    forest = RandomForestClassifier(numTrees=22, maxDepth=10, labelCol="labelCol", seed=42)
    mlp = MultilayerPerceptronClassifier(maxIter=400, layers=[3, 16, 11],
                                         blockSize=1, seed=123, labelCol="labelCol")
    sqlTrans = SQLTransformer(statement=lab_query)
    models = [
        ('RGB-forest', Pipeline(stages=[indexer, rgb_assembler, forest])),
        ('LAB-forest', Pipeline(stages=[sqlTrans, indexer, lab_assembler, forest])),
        ('RGB-MLP', Pipeline(stages=[indexer, rgb_assembler, mlp])),
        ('LAB-MLP', Pipeline(stages=[sqlTrans, indexer, lab_assembler, mlp])),
    ]

    # TODO: need an evaluator
    evaluator = MulticlassClassificationEvaluator(labelCol="labelCol",
                                                  predictionCol="prediction")

    # TODO: split data into training and testing
    train, test = data.randomSplit([0.75, 0.25])
    train = train.cache()
    test = test.cache()

    score_dict = dict()
    for label, pipeline in models:
        # TODO: fit the pipeline to create a model
        model = pipeline.fit(train)

        # Output a visual representation of the predictions we're
        # making: uncomment when you have a model working
        plot_predictions(model, label)

        # TODO: predict on the test data
        predictions = model.transform(test)

        # calculate a score
        score = evaluator.evaluate(predictions)
        score_dict[label] = score
    return score_dict
def main(inputs):
    # Read the CSV file
    df = spark.read.csv(inputs, schema=colour_schema)
    # Total label count
    label_num = df.select('word').distinct().count()

    # Split the dataset: 75% training set, remaining 25% validation set
    train, validation = df.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    # Creating the pipeline components
    rgb_assembler = VectorAssembler(inputCols=["R", "G", "B"], outputCol="features")
    word_indexer = StringIndexer(inputCol="word", outputCol="label", handleInvalid="error")
    classifier_mpc = MultilayerPerceptronClassifier(layers=[3, 250, label_num])

    # Transformer for the LAB pipeline
    rgb_to_lab_query = rgb2lab_query(passthrough_columns=['word'])
    sqlTrans = SQLTransformer(statement=rgb_to_lab_query)
    lab_assembler = VectorAssembler(inputCols=["labL", "labA", "labB"], outputCol="features")

    # TODO: create a pipeline to predict RGB colours -> word; train and evaluate.
    # pipeline to predict RGB colours
    rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier_mpc])
    lab_pipeline = Pipeline(stages=[sqlTrans, lab_assembler, word_indexer, classifier_mpc])

    # Train the models
    rgb_model = rgb_pipeline.fit(train)
    lab_model = lab_pipeline.fit(train)

    # Transform the validation set
    predictions_rgb = rgb_model.transform(validation)
    predictions_lab = lab_model.transform(validation)

    # TODO: create an evaluator and score the validation data
    # Create a Multiclass Classification Evaluator
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
    # Evaluate it on the validation data
    score_rgb = evaluator.evaluate(predictions_rgb)
    score_lab = evaluator.evaluate(predictions_lab)

    plot_predictions(rgb_model, 'RGB', labelCol='word')
    plot_predictions(lab_model, 'LAB', labelCol='word')

    # Print the validation scores
    print('Validation score for RGB model: %g' % (score_rgb, ))
    print('Validation score for LAB model: %g' % (score_lab, ))
def main(inputs, outputs):
    data = spark.read.csv(inputs, schema=tmax_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    query1 = """SELECT *, dayofyear( date ) AS day FROM __THIS__ """

    # TODO: create a pipeline to predict latitude, longitude, elevation point -> tmax
    query = """SELECT today.station AS station, today.latitude AS latitude,
        today.longitude AS longitude, today.elevation AS elevation,
        dayofyear(today.date) AS day, today.tmax,
        yesterday.tmax AS yesterday_tmax
        FROM __THIS__ AS today
        INNER JOIN __THIS__ AS yesterday
            ON date_sub(today.date, 1) = yesterday.date
            AND today.station = yesterday.station"""
    sqlTrans = SQLTransformer(statement=query)
    # note: 'tmax' is the label, so it must NOT be included in the feature
    # vector (doing so would leak the answer into the features)
    lle_assembler = VectorAssembler(
        inputCols=["latitude", "longitude", "elevation", "day", "yesterday_tmax"],
        outputCol="features")
    regressor = GBTRegressor(featuresCol='features', labelCol='tmax', maxIter=100)
    pipeline = Pipeline(stages=[sqlTrans, lle_assembler, regressor])
    model = pipeline.fit(train)

    # TODO: create an evaluator and score the validation
    predictions = model.transform(validation)
    r2_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='tmax',
                                       metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)
    rmse_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='tmax',
                                         metricName='rmse')
    rmse = rmse_evaluator.evaluate(predictions)
    model.write().overwrite().save(outputs)
    print('r2 =', r2)
    print('rmse =', rmse)
def add_label_maker(stages):
    '''
    INPUT:
        stages - (list) list of transformers to be used as the 'stages'
                 argument of the pyspark Pipeline() constructor.
                 It should be an output of the add_data_cleaner() function.

    OUTPUT:
        stages - (list) list of transformers to be used as the 'stages'
                 argument of the pyspark Pipeline() constructor

    DESCRIPTION:
        This is a subroutine of the create_preprocess_pipeline() function.
        Stages added by this function make the label column in the target
        pyspark dataframe. It also drops rows which the label column
        directly depends on.
    '''
    # 'churn_event'
    # add a column to store the churn event as an integer
    sqlTrans = SQLTransformer(statement=" \
        SELECT *, \
               CASE WHEN page = 'Cancellation Confirmation' \
                    THEN 1 ELSE 0 END AS churn_event \
        FROM __THIS__")
    stages.append(sqlTrans)

    # 'Churn'
    # add a column to store the per-user maximum of the churn flag
    sqlTrans = SQLTransformer(statement=" \
        SELECT *, \
               MAX(churn_event) OVER ( \
                   PARTITION BY userId \
               ) AS Churn \
        FROM __THIS__")
    stages.append(sqlTrans)

    return stages
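# A minimal usage sketch composing the two helpers above into a preprocessing
# pipeline. `raw_df` is an illustrative assumption (the event-log dataframe);
# only add_data_cleaner() and add_label_maker() come from the code above.
from pyspark.ml import Pipeline

stages = add_data_cleaner()        # cleaning SQLTransformer stages
stages = add_label_maker(stages)   # plus the label-making stages
preprocess_pipeline = Pipeline(stages=stages)
labeled_df = preprocess_pipeline.fit(raw_df).transform(raw_df)
labeled_df.select('userId', 'page', 'Churn').show(5)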
def deriveNewMethod(df):
    from pyspark.ml.feature import SQLTransformer

    # drop rows with null values
    df = df.filter(df['area'].isNotNull())
    df = df.filter(df['price'].isNotNull())
    df = df.filter(df['room_num'].isNotNull())
    # df = df.filter(df['area'] != 'NULL')
    # df = df.filter(df['price'] != 'NULL')
    # df = df.filter(df['room_num'] != 'NULL')

    # drop rows with zero values
    df = df.filter(df['area'] != 0)
    df = df.filter(df['room_num'] != 0)

    # cast the string columns to Float
    df = df.select('*', df['area'].cast('Float').alias('tmp_name')).drop('area')
    df = df.withColumnRenamed('tmp_name', 'area')
    df = df.select('*', df['price'].cast('Float').alias('tmp_name')).drop('price')
    df = df.withColumnRenamed('tmp_name', 'price')
    df = df.select('*', df['room_num'].cast('Float').alias('tmp_name')).drop('room_num')
    df = df.withColumnRenamed('tmp_name', 'room_num')
    print(df.dtypes)

    # derive per-room area and per-area price
    sqlTransform = SQLTransformer(
        statement='SELECT *, (area/room_num) AS one_room_area, (price/area) AS one_area_price FROM __THIS__')
    df = sqlTransform.transform(df)
    # spark.stop()
    return df
def main(inputs, model_file):
    # get the data
    test_tmax = spark.read.csv(inputs, schema=tmax_schema)
    train, validation = test_tmax.randomSplit([0.75, 0.25])

    # with the yesterday feature, the code is as follows:
    sql_query = """SELECT today.latitude AS latitude, today.longitude AS longitude,
        today.elevation AS elevation, dayofyear(today.date) AS dayofyear,
        today.tmax AS tmax, yesterday.tmax AS y_tmax
        FROM __THIS__ AS today
        INNER JOIN __THIS__ AS yesterday
            ON date_sub(today.date, 1) = yesterday.date
            AND today.station = yesterday.station"""
    sql_transformer = SQLTransformer(statement=sql_query)
    assembler = VectorAssembler(
        inputCols=['latitude', 'longitude', 'elevation', 'dayofyear', 'y_tmax'],
        outputCol='features')
    classifier = GBTRegressor(featuresCol='features', labelCol='tmax')
    pipelineModel = Pipeline(stages=[sql_transformer, assembler, classifier])

    # # without the yesterday feature, the code is as follows:
    # sql_query = 'SELECT today.latitude as latitude, today.longitude as longitude, today.elevation as elevation, dayofyear(today.date) as dayofyear, today.tmax as tmax FROM __THIS__ as today INNER JOIN __THIS__ as yesterday ON date_sub(today.date, 1) = yesterday.date AND today.station = yesterday.station'
    # sql_transformer = SQLTransformer(statement=sql_query)
    # assembler = VectorAssembler(inputCols=['latitude', 'longitude', 'elevation', 'dayofyear'], outputCol='features')
    # classifier = GBTRegressor(featuresCol='features', labelCol='tmax')
    # pipelineModel = Pipeline(stages=[sql_transformer, assembler, classifier])

    # train the model
    model = pipelineModel.fit(train)

    # use the model to make predictions
    predictions = model.transform(validation)
    # predictions.show()

    # evaluate the predictions
    r2_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='tmax',
                                       metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)
    rmse_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='tmax',
                                         metricName='rmse')
    rmse = rmse_evaluator.evaluate(predictions)
    print('r2 =', r2)
    print('rmse =', rmse)

    # If you used a regressor that gives .featureImportances, maybe have a look...
    # print(model.stages[-1].featureImportances)
    model.write().overwrite().save(model_file)
def main(inputs, output):
    tmax_schema = types.StructType([
        types.StructField('station', types.StringType()),
        types.StructField('date', types.DateType()),
        types.StructField('latitude', types.FloatType()),
        types.StructField('longitude', types.FloatType()),
        types.StructField('elevation', types.FloatType()),
        types.StructField('tmax', types.FloatType()),
    ])
    data = spark.read.csv(inputs, schema=tmax_schema)

    # self-join on station; the WHERE clause matches each row t with the row y
    # from the previous day (t.date_yesterday = y.date), so tmax_yesterday
    # really is yesterday's tmax (the original condition was reversed and
    # picked up the *next* day's tmax instead)
    query = """SELECT t.station AS station, t.date AS date, t.day AS day,
        t.latitude AS latitude, t.longitude AS longitude, t.elevation AS elevation,
        t.tmax AS tmax, y.tmax AS tmax_yesterday
        FROM
            (SELECT station, date, latitude, longitude, elevation, tmax,
                    DAYOFYEAR(date) AS day, date_sub(date, 1) AS date_yesterday
             FROM __THIS__) t,
            (SELECT station, date, latitude, longitude, elevation, tmax,
                    DAYOFYEAR(date) AS day, date_sub(date, 1) AS date_yesterday
             FROM __THIS__) y
        WHERE t.date_yesterday = y.date AND t.station = y.station"""
    sqlTrans = SQLTransformer(statement=query)

    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    # train.show()
    validation = validation.cache()

    assembler = VectorAssembler(
        inputCols=["latitude", "longitude", "elevation", "day", "tmax_yesterday"],
        outputCol="features")
    classifier = GBTRegressor(featuresCol='features', labelCol='tmax')
    pipeline = Pipeline(stages=[sqlTrans, assembler, classifier])
    model = pipeline.fit(train)
    predictions = model.transform(validation)
    predictions.show()

    r2_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='tmax',
                                       metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)
    print("R-square for the validation data is: " + str(r2))
    model.write().overwrite().save(output)

    r2 = r2_evaluator.evaluate(model.transform(train))
    print("R-square for the training data is: " + str(r2))
    print(model.stages[-1].featureImportances)

    sfu_predict = [("sfu", datetime.date(2018, 11, 12), 49.2771, -122.9146, 330.0, 12.0),
                   ("sfu", datetime.date(2018, 11, 13), 49.2771, -122.9146, 330.0, 12.0)]
    sfu_predict_df = spark.createDataFrame(sfu_predict, schema=tmax_schema)
    sfu_predict_df.show()
    sfu_predictions = model.transform(sfu_predict_df).select('station', 'date', 'prediction')
    sfu_predictions.show()
def lab_classify(type, train, validation, query, figName):
    sql_transformer = SQLTransformer(statement=query)
    lab_assembler = VectorAssembler(inputCols=['labL', 'labA', 'labB'], outputCol='features')
    word_indexer = StringIndexer(inputCol='word', outputCol='label',
                                 stringOrderType='alphabetAsc')
    if type == "MLPC":
        classifier = MultilayerPerceptronClassifier(layers=[3, 25, 25], seed=42)
    elif type == "LogReg":
        classifier = LogisticRegression()
    lab_pipe = Pipeline(stages=[sql_transformer, lab_assembler, word_indexer, classifier])
    lab_model = lab_pipe.fit(train)
    predictions = lab_model.transform(validation)
    evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                  labelCol='label', metricName='accuracy')
    score = evaluator.evaluate(predictions)
    plot_predictions(lab_model, 'LAB_' + figName, labelCol='word')
    return score
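# A minimal usage sketch for lab_classify(). `data` (with R/G/B/word columns)
# and the split below are illustrative assumptions; rgb2lab_query() is the
# helper used elsewhere in this assignment code.
rgb_to_lab_query = rgb2lab_query(passthrough_columns=['word'])
train, validation = data.randomSplit([0.75, 0.25])
mlpc_score = lab_classify("MLPC", train, validation, rgb_to_lab_query, "mlpc")
logreg_score = lab_classify("LogReg", train, validation, rgb_to_lab_query, "logreg")
print('LAB MLPC accuracy: %g, LogReg accuracy: %g' % (mlpc_score, logreg_score))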
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

from pyspark import SparkContext
# $example on$
from pyspark.ml.feature import SQLTransformer
# $example off$
from pyspark.sql import SQLContext

if __name__ == "__main__":
    sc = SparkContext(appName="SQLTransformerExample")
    sqlContext = SQLContext(sc)

    # $example on$
    df = sqlContext.createDataFrame([
        (0, 1.0, 3.0),
        (2, 2.0, 5.0)
    ], ["id", "v1", "v2"])
    sqlTrans = SQLTransformer(
        statement="SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
    sqlTrans.transform(df).show()
    # $example off$

    sc.stop()
# COMMAND ----------

from pyspark.ml.feature import RFormula

supervised = RFormula(formula="lab ~ . + color:value1 + color:value2")
supervised.fit(simpleDF).transform(simpleDF).show()


# COMMAND ----------

from pyspark.ml.feature import SQLTransformer

basicTransformation = SQLTransformer()\
    .setStatement("""
        SELECT sum(Quantity), count(*), CustomerID
        FROM __THIS__
        GROUP BY CustomerID
    """)

basicTransformation.transform(sales).show()


# COMMAND ----------

from pyspark.ml.feature import VectorAssembler

va = VectorAssembler().setInputCols(["int1", "int2", "int3"])
va.transform(fakeIntDF).show()


# COMMAND ----------