def main(inputs, output):
    tmax_schema = types.StructType([
        types.StructField('station', types.StringType()),
        types.StructField('date', types.DateType()),
        types.StructField('latitude', types.FloatType()),
        types.StructField('longitude', types.FloatType()),
        types.StructField('elevation', types.FloatType()),
        types.StructField('tmax', types.FloatType()),
    ])

    data = spark.read.csv(inputs, schema=tmax_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    sqlTrans = SQLTransformer(statement='SELECT *, dayofyear(date) AS day FROM __THIS__')
 
    sqlTrans1 = SQLTransformer(statement = 'SELECT today.station,today.date,today.latitude,today.longitude,today.elevation,today.tmax, yesterday.tmax AS yesterday_tmax FROM __THIS__ as today INNER JOIN __THIS__ as yesterday ON date_sub(today.date, 1) = yesterday.date AND today.station = yesterday.station')
    assemble_features = VectorAssembler(inputCols = ['latitude','longitude','elevation','day','yesterday_tmax'], outputCol = 'features')

    gbt = GBTRegressor(featuresCol = 'features', labelCol='tmax')
    pipeline = Pipeline(stages=[sqlTrans1,sqlTrans,assemble_features,gbt])
    weather_model = pipeline.fit(train)

    predictions = weather_model.transform(validation)
    #predictions.show()
    evaluator = RegressionEvaluator(labelCol = 'tmax', predictionCol = 'prediction', metricName = 'rmse')
    score = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % score)

    weather_model.write().overwrite().save(output)
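
A possible follow-up (not part of the original example): the saved pipeline can be reloaded for inference with pyspark.ml.PipelineModel; new_weather_df below is an assumed DataFrame following the same tmax_schema.

from pyspark.ml import PipelineModel

loaded_model = PipelineModel.load(output)
# the SQLTransformer stages inside the pipeline recompute 'day' and 'yesterday_tmax'
loaded_model.transform(new_weather_df).select('station', 'date', 'prediction').show()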
Example 2
def main(spark):
    train_file = 'hdfs:/user/bm106/pub/project/cf_train.parquet'
    cf_train = spark.read.parquet(train_file)
    valid_file = 'hdfs:/user/bm106/pub/project/cf_validation.parquet'
    cf_valid = spark.read.parquet(valid_file)
    test_file = 'hdfs:/user/bm106/pub/project/cf_test.parquet'
    cf_test = spark.read.parquet(test_file)
    indexer1 = StringIndexer(inputCol="user_id", outputCol="user_index", handleInvalid='skip')
    indexer2 = StringIndexer(inputCol="track_id", outputCol="item_index", handleInvalid='skip')
    logTrans = SQLTransformer(statement='select user_index,item_index,log10(count+1) as count from __THIS__')
    dropTrans = SQLTransformer(statement='select * from __THIS__ where count>1')
    pipeline1 = Pipeline(stages=[indexer1, indexer2])
    pipeline2 = Pipeline(stages=[indexer1, indexer2, logTrans])
    pipeline3 = Pipeline(stages=[indexer1, indexer2, dropTrans])
    pre1 = pipeline1.fit(cf_train)
    pre2 = pipeline2.fit(cf_train)
    pre3 = pipeline3.fit(cf_train)

    pre1.transform(cf_train).write.parquet('hdfs:/user/sc6995/cf_train_index.parquet')
    pre1.transform(cf_valid).write.parquet('hdfs:/user/sc6995/cf_valid_index.parquet')
    pre1.transform(cf_test).write.parquet('hdfs:/user/sc6995/cf_test_index.parquet')

    pre2.transform(cf_train).write.parquet('hdfs:/user/sc6995/cf_train_logtrans.parquet')
    pre2.transform(cf_valid).write.parquet('hdfs:/user/sc6995/cf_valid_logtrans.parquet')
    pre2.transform(cf_test).write.parquet('hdfs:/user/sc6995/cf_test_logtrans.parquet')

    pre3.transform(cf_train).write.parquet('hdfs:/user/sc6995/cf_train_droptrans1.parquet')
Example 3
def main(inputs, model_file):
    data = spark.read.csv(inputs, schema=schema())
    train, validation = data.randomSplit([0.75, 0.25], seed=42)

    sql_transformer1 = SQLTransformer(statement=yes_tmax())
    sql_transformer2 = SQLTransformer(statement=ret_query())
    assemble_features = VectorAssembler(inputCols=[
        'latitude', 'longitude', 'elevation', 'dayofyear', 'yesterday_tmax'
    ],
                                        outputCol='features')
    regressor = GBTRegressor(featuresCol='features', labelCol='tmax')
    pipeline = Pipeline(stages=[
        sql_transformer1, sql_transformer2, assemble_features, regressor
    ])

    model = pipeline.fit(train)
    predictions = model.transform(validation)
    model.write().overwrite().save(model_file)

    r2_evaluator = RegressionEvaluator(predictionCol='prediction',
                                       labelCol='tmax',
                                       metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)

    rmse_evaluator = RegressionEvaluator(predictionCol='prediction',
                                         labelCol='tmax',
                                         metricName='rmse')
    rmse = rmse_evaluator.evaluate(predictions)

    print("R-squared value : " + str(r2))
    print("RMSE value : " + str(rmse))
Example 4
def main_A(inputs):
    data = spark.read.option('encoding', 'UTF-8').csv(inputs,
                                                      schema=tmax_schema)
    ################ FEATURE ENGINEERING: add yesterday tmax #####################
    if USE_YTD_TEMP_FEATURE:
        syntax = """SELECT today.latitude,today.longitude,today.elevation,today.date,
                           today.tmax, yesterday.tmax AS yesterday_tmax
                    FROM __THIS__ as today
                    INNER JOIN __THIS__ as yesterday
                    ON date_sub(today.date, 1) = yesterday.date
                       AND today.station = yesterday.station"""
        sql_trans = SQLTransformer(statement=syntax)
        df = sql_trans.transform(data)
    #############################################################################
    df = data.withColumn('day_of_year', fn.dayofyear('date'))
    df = df.withColumn('year', fn.year('date'))

    df_long_lat = df[['station', 'longitude', 'latitude', 'tmax',
                      'year']].toPandas()
    count_year = df_long_lat['year'].value_counts().to_dict()

    # SELECT YEAR and DURATION
    YEAR_SELECTED = 2000
    YEAR_DURATION = 20
    df_long_lat = df_long_lat.loc[(df_long_lat['year'] > YEAR_SELECTED) & (
        df_long_lat['year'] < YEAR_SELECTED + YEAR_DURATION)]

    # UNCLUSTER plot by finding avg temperature (groupby same station and year)
    df_long_lat['avg_temp'] = df_long_lat.groupby(['station', 'year'
                                                   ])['tmax'].transform('mean')
    df_long_lat.drop_duplicates(subset=['station', 'year'], inplace=True)
    print(df_long_lat)

    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    geometry = [
        Point(xy)
        for xy in zip(df_long_lat['longitude'], df_long_lat['latitude'])
    ]

    df_long_lat = df_long_lat.drop(['longitude', 'latitude'], axis=1)
    crs = {'init': 'epsg:4326'}
    gdf = GeoDataFrame(df_long_lat, crs=crs, geometry=geometry)

    base = world.plot(color='white', edgecolor='black', figsize=(20, 12))
    gdf.plot(column='avg_temp',
             ax=base,
             marker='o',
             cmap='jet',
             markersize=15,
             legend=True,
             legend_kwds={
                 'label': "Temperature in Celcius",
                 'orientation': "horizontal"
             })
    plt.title('Distribution of Temperature between ' + str(YEAR_SELECTED) +
              " and " + str(YEAR_SELECTED + YEAR_DURATION))
    plt.savefig(inputs + "_" + str(YEAR_SELECTED) + "-" +
                str(YEAR_SELECTED + YEAR_DURATION))
Example 5
def main(inputs):
    data = spark.read.csv(inputs, schema=colour_schema)
    train, validation = data.randomSplit([0.75, 0.25])  #use seed here
    train = train.cache()
    validation = validation.cache()

    #creating a pipeline to predict RGB colours -> word
    rgb_assembler = VectorAssembler(inputCols=['R', 'G', 'B'],
                                    outputCol="features")
    #dataframe1 = rgb_assembler.transform(data)
    word_indexer = StringIndexer(inputCol="word",
                                 outputCol="target",
                                 handleInvalid="error",
                                 stringOrderType="frequencyDesc")
    classifier = MultilayerPerceptronClassifier(featuresCol="features",
                                                labelCol="target",
                                                layers=[3, 25, 25])
    rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    rgb_model = rgb_pipeline.fit(train)

    #creating an evaluator and score the validation data
    #model_train = rgb_model.transform(train)
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                  labelCol="target")
    rgb_validation = rgb_model.transform(validation)
    score = evaluator.evaluate(rgb_validation,
                               {evaluator.metricName: "accuracy"})

    print('Validation score for RGB model: %g' % (score, ))
    plot_predictions(rgb_model, 'RGB', labelCol='target')

    rgb_to_lab_query = rgb2lab_query(passthrough_columns=['word'])
    # creating a pipeline to predict RGB -> LAB colours -> word; train and evaluate.
    sqlTrans = SQLTransformer(statement=rgb_to_lab_query)
    lrgb_assembler = VectorAssembler(inputCols=['labL', 'labA', 'labB'],
                                     outputCol="LAB")
    lword_indexer = StringIndexer(inputCol="word",
                                  outputCol="labTarget",
                                  handleInvalid="error",
                                  stringOrderType="frequencyDesc")
    lclassifier = MultilayerPerceptronClassifier(featuresCol="LAB",
                                                 labelCol="labTarget",
                                                 layers=[3, 25, 25])
    lrgb_pipeline = Pipeline(
        stages=[sqlTrans, lrgb_assembler, lword_indexer, lclassifier])
    # fit on the raw RGB split: the SQLTransformer stage inside the pipeline
    # performs the RGB -> LAB conversion itself
    lrgb_model = lrgb_pipeline.fit(train)
    lrgb_validation = lrgb_model.transform(validation)
    lrgb_validation.show()
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                  labelCol="labTarget")
    lscore = evaluator.evaluate(lrgb_validation,
                                {evaluator.metricName: "accuracy"})

    print('Validation score for LAB model: %g' % (lscore, ))
    plot_predictions(lrgb_model, 'LAB', labelCol='word')
def sql_transformer_usecase():
    """
        通过sql方式实现对数据特征的转换
        "_THIS_" 代表的是输入数据对应的dataset
    """
    spark = getSparkSession()
    df = spark.createDataFrame([(0, 1.0, 3.0), (2, 2.0, 5.0)],
                               ["id", "v1", "v2"])
    sqlTrans = SQLTransformer(
        statement="SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
    sqlTrans.transform(df).show(truncate=False)
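
For reference, the transform above should produce roughly the following output (v3 = v1 + v2, v4 = v1 * v2):

# +---+---+---+---+----+
# |id |v1 |v2 |v3 |v4  |
# +---+---+---+---+----+
# |0  |1.0|3.0|4.0|3.0 |
# |2  |2.0|5.0|7.0|10.0|
# +---+---+---+---+----+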
Example 7
def query(self, sql_expression):
    """
    Implements transformations defined by a SQL statement. Currently only SQL
    syntax of the form "SELECT ... FROM __THIS__ ..." is supported, where
    "__THIS__" represents the underlying table of the input dataframe.
    :param self: Spark Dataframe
    :param sql_expression: SQL expression.
    :return: Dataframe with columns changed by SQL statement.
    """
    sql_transformer = SQLTransformer(statement=sql_expression)
    return sql_transformer.transform(self)
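
A hedged usage sketch: the self parameter suggests this helper is meant to be bound to the DataFrame class as an extension method; the binding, the DataFrame df, and the column names below are illustrative assumptions.

from pyspark.sql import DataFrame

DataFrame.query = query  # assumed monkey-patch; the original library may attach it differently
filtered_df = df.query("SELECT station, tmax FROM __THIS__ WHERE tmax > 30.0")
filtered_df.show()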
Example 8
def main(inputs, model_file):
    data = spark.read.option('encoding', 'UTF-8').csv(inputs,
                                                      schema=tmax_schema)
    ################ FEATURE ENGINEERING: add yesterday tmax #####################
    if USE_YTD_TEMP_FEATURE:
        syntax = """SELECT today.latitude,today.longitude,today.elevation,today.date,
                           today.tmax, yesterday.tmax AS yesterday_tmax
                    FROM __THIS__ as today
                    INNER JOIN __THIS__ as yesterday
                    ON date_sub(today.date, 1) = yesterday.date
                       AND today.station = yesterday.station"""
        sql_trans = SQLTransformer(statement=syntax)
        data = sql_trans.transform(data)
    #############################################################################
    data = data.withColumn('day_of_year', fn.dayofyear('date'))
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    if USE_YTD_TEMP_FEATURE:
        train_feature_assembler = VectorAssembler(inputCols=[
            'yesterday_tmax', 'day_of_year', 'latitude', 'longitude',
            'elevation'
        ],
                                                  outputCol='features')
    else:
        train_feature_assembler = VectorAssembler(
            inputCols=['day_of_year', 'latitude', 'longitude', 'elevation'],
            outputCol='features')

    ############# DIFFERENT ML ALGORITHMS TO BE USED ####################
    # classifier = GeneralizedLinearRegression(featuresCol = 'features', labelCol='tmax' )
    # classifier = GBTRegressor( maxDepth=5,featuresCol = 'features', labelCol='tmax' )
    classifier = RandomForestRegressor(numTrees=7,
                                       maxDepth=8,
                                       featuresCol='features',
                                       labelCol='tmax')
    #####################################################################

    train_pipeline = Pipeline(stages=[train_feature_assembler, classifier])
    weather_model = train_pipeline.fit(train)

    prediction = weather_model.transform(validation)
    # print(prediction.show())
    evaluator = RegressionEvaluator(predictionCol="prediction",
                                    labelCol='tmax',
                                    metricName='r2')  #rmse
    score = evaluator.evaluate(prediction)
    print('Validation score for weather model: %g' % (score, ))

    weather_model.write().overwrite().save(model_file)
Example 9
def main(inputs):

    data = spark.read.csv(inputs, schema=colour_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    # TODO: create a pipeline to predict RGB colours -> word

    rgb_assembler = VectorAssembler(inputCols=['R', 'G', 'B'],
                                    outputCol='features')
    word_indexer = StringIndexer(inputCol='word', outputCol='new_word')
    classifier = MultilayerPerceptronClassifier(labelCol="new_word",
                                                layers=[3, 30, 11])
    rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    rgb_model = rgb_pipeline.fit(train)

    # TODO: create an evaluator and score the validation data

    rgb_validation = rgb_model.transform(validation)
    # rgb_validation.show()
    plot_predictions(rgb_model, 'RGB', labelCol='word')
    vali_evaluator = MulticlassClassificationEvaluator(
        predictionCol="prediction", labelCol='new_word')
    score = vali_evaluator.evaluate(rgb_validation)
    print('Validation score for RGB model: %g' % (score, ))

    # TODO: create a pipeline RGB colours -> LAB colours -> word; train and evaluate.

    rgb_to_lab_query = rgb2lab_query(passthrough_columns=['word'])
    sql_transformer = SQLTransformer(statement=rgb_to_lab_query)

    new_assembler = VectorAssembler(inputCols=['labL', 'labA', 'labB'],
                                    outputCol='features')
    new_pipeline = Pipeline(
        stages=[sql_transformer, new_assembler, word_indexer, classifier])
    # the pipeline's SQLTransformer stage performs the RGB -> LAB conversion,
    # so fit directly on the raw training split
    new_model = new_pipeline.fit(train)
    new_validation = new_model.transform(validation)

    #new_validation.show()

    new_vali_evaluator = MulticlassClassificationEvaluator(
        predictionCol='prediction', labelCol='new_word')
    new_score = new_vali_evaluator.evaluate(new_validation)
    print('Validation score for LAB model:', new_score)

    plot_predictions(new_model, 'LAB', labelCol="word")
Example 10
def main(inputs):
    data = spark.read.csv(inputs, schema=colour_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()
    
    word_indexer = StringIndexer(inputCol = "word", outputCol = "labelCol", handleInvalid = 'error')
    classifier = MultilayerPerceptronClassifier(maxIter=400, layers=[3, 30, 11], blockSize=1, seed=123, labelCol="labelCol")
    # TODO: create a pipeline to predict RGB colours -> word
    rgb_assembler = VectorAssembler(inputCols=['R', 'G', 'B'], outputCol="features")
    rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    rgb_model = rgb_pipeline.fit(train)
    
    # TODO: create an evaluator and score the validation data
    evaluator = MulticlassClassificationEvaluator(labelCol = "labelCol" , predictionCol = "prediction")
    
    predictions = rgb_model.transform(validation)
    score = evaluator.evaluate(predictions)
    plot_predictions(rgb_model, 'RGB', labelCol='word')
    print('Validation score for RGB model: %g' % (score, ))
    
    rgb_to_lab_query = rgb2lab_query(passthrough_columns=["word"])
    sqlTrans = SQLTransformer(statement = rgb_to_lab_query)
    # TODO: create a pipeline to predict RGB colours -> LAB colours -> word; train and evaluate.
    lab_assembler = VectorAssembler(inputCols = ['labL', 'labA', 'labB'], outputCol = "features")
    lab_pipeline = Pipeline(stages=[sqlTrans,lab_assembler, word_indexer, classifier])
    lab_model = lab_pipeline.fit(train)

    predictions_lab = lab_model.transform(validation)
    score_lab = evaluator.evaluate(predictions_lab)
    plot_predictions(lab_model, 'LAB', labelCol='word')
    print('Validation score for LAB model: %g' % (score_lab, ))
Example 11
def main(inputs, model_file):
    sensor_data_df = spark.read.format("org.apache.spark.sql.cassandra").options(table=sensor_data_table,
                                                                                 keyspace=keyspace).load()
    # creating a ML pipeline

    sensor_data_df = sensor_data_df.select(sensor_data_df['datetime'],
                                         sensor_data_df['latitude'],
                                         sensor_data_df['longitude'],
                                         sensor_data_df['message_code_id'],
                                         sensor_data_df['sensor_reading'],
                                         sensor_data_df['sensor_name']).orderBy(sensor_data_df['datetime'].asc())
    train_set, validation_set = sensor_data_df.randomSplit([0.75, 0.25])
    train_set = train_set.cache()
    validation_set = validation_set.cache()
    sql_transformer_statement = "SELECT latitude, longitude, sensor_name, sensor_reading, message_code_id " \
                                "FROM __THIS__"

    sql_transformer = SQLTransformer(statement=sql_transformer_statement)
    assemble_features = VectorAssembler(inputCols=['latitude', 'longitude', 'sensor_name', 'sensor_reading']
                                        , outputCol= 'features')
    classifier = GBTRegressor(featuresCol='features', labelCol='message_code_id')
    pipeline = Pipeline(stages=[sql_transformer, assemble_features, classifier])
    model = pipeline.fit(train_set)

    predictions = model.transform(validation_set)
    predictions.show()

    r2_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='message_code_id', metricName='r2')
    rmse_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='message_code_id', metricName='rmse')
    r2_score = r2_evaluator.evaluate(predictions)
    rmse_score = rmse_evaluator.evaluate(predictions)
    print('r2 validation score : ', r2_score)
    print('rmse validation score: ', rmse_score)
Example 12
def make_weather_trainers(trainRatio, estimator_gridbuilders, metricName=None):
    """Construct a list of TrainValidationSplit estimators for weather data
       where `estimator_gridbuilders` is a list of (Estimator, ParamGridBuilder) tuples
       and 0 < `trainRatio` <= 1 determines the fraction of rows used for training.
       The RegressionEvaluator will use a non-default `metricName`, if specified.
    """
    feature_cols = ['latitude', 'longitude', 'elevation', 'doy']
    column_names = dict(featuresCol="features",
                        labelCol="tmax",
                        predictionCol="tmax_pred")

    getDOY = doy_query()
    sqlTrans = SQLTransformer(statement=getDOY)

    feature_assembler = VectorAssembler(inputCols=feature_cols,
                                        outputCol=column_names["featuresCol"])
    ev = (RegressionEvaluator().setLabelCol(
        column_names["labelCol"]).setPredictionCol(
            column_names["predictionCol"]))
    if metricName:
        ev = ev.setMetricName(metricName)
    tvs_list = []
    for est, pgb in estimator_gridbuilders:
        est = est.setParams(**column_names)

        pl = Pipeline(stages=[sqlTrans, feature_assembler, est])

        paramGrid = pgb.build()
        tvs_list.append(
            TrainValidationSplit(estimator=pl,
                                 estimatorParamMaps=paramGrid,
                                 evaluator=ev,
                                 trainRatio=trainRatio))
    return tvs_list
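
A sketch of how make_weather_trainers might be driven; the estimator/grid pairs, the train ratio, and the metric name are illustrative assumptions, while doy_query() and the training DataFrame come from the original script.

from pyspark.ml.regression import GBTRegressor, LinearRegression
from pyspark.ml.tuning import ParamGridBuilder

gbt = GBTRegressor()
lr = LinearRegression()
estimator_gridbuilders = [
    (gbt, ParamGridBuilder().addGrid(gbt.maxDepth, [3, 5])),
    (lr, ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1])),
]
tvs_list = make_weather_trainers(trainRatio=0.8,
                                 estimator_gridbuilders=estimator_gridbuilders,
                                 metricName='rmse')
# each TrainValidationSplit is then fit on the weather training DataFrame:
# models = [tvs.fit(train_df) for tvs in tvs_list]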
Example 13
    def runTest(self):
        document_assembler = DocumentAssembler() \
            .setInputCol("text") \
            .setOutputCol("document")
        sentence_detector = SentenceDetector() \
            .setInputCols(["document"]) \
            .setOutputCol("sentence")
        tokenizer = Tokenizer() \
            .setInputCols(["sentence"]) \
            .setOutputCol("token")
        glove = WordEmbeddingsModel.pretrained() \
            .setInputCols(["sentence", "token"]) \
            .setOutputCol("embeddings")
        sentence_embeddings = SentenceEmbeddings() \
            .setInputCols(["sentence", "embeddings"]) \
            .setOutputCol("sentence_embeddings") \
            .setPoolingStrategy("AVERAGE")
        embeddings_finisher = EmbeddingsFinisher() \
            .setInputCols("sentence_embeddings") \
            .setOutputCols("sentence_embeddings_vectors") \
            .setOutputAsVector(True)
        explode_vectors = SQLTransformer(
            statement=
            "SELECT EXPLODE(sentence_embeddings_vectors) AS features, * FROM __THIS__"
        )
        kmeans = KMeans().setK(2).setSeed(1).setFeaturesCol("features")

        pipeline = Pipeline(stages=[
            document_assembler, sentence_detector, tokenizer, glove,
            sentence_embeddings, embeddings_finisher, explode_vectors, kmeans
        ])

        model = pipeline.fit(self.data)
        model.transform(self.data).show()
def train_model(model_file, inputs): 
    # get the data
    train_tmax = spark.read.csv(inputs, schema=tmax_schema)
    train, validation = train_tmax.randomSplit([0.75, 0.25], seed=110)
   
    #query ="SELECT station,date, dayofyear(date) as doy, latitude, longitude, elevation,tmax  FROM __THIS__"
    
    query = """SELECT today.station, dayofyear(today.date) as doy, today.latitude, today.longitude, today.elevation, today.tmax, yesterday.tmax AS yesterday_tmax FROM __THIS__ as today INNER JOIN __THIS__ as yesterday ON date_sub(today.date, 1) = yesterday.date AND today.station = yesterday.station"""
    
    #weather_assembler = VectorAssembler(inputCols=['latitude','longitude','elevation', 'doy'], outputCol="features")
    weather_assembler = VectorAssembler(inputCols=['latitude','longitude','elevation', 'doy', 'yesterday_tmax'], outputCol="features")
    regressor =  GBTRegressor(maxIter=50,maxDepth=5,featuresCol="features",labelCol="tmax")
    transquery = SQLTransformer(statement=query)
    pipeline = Pipeline(stages=[transquery,weather_assembler,regressor])
    model = pipeline.fit(train)
    model.write().overwrite().save(model_file)
 
    # use the model to make predictions
    predictions = model.transform(validation)
    #predictions.show()
    
    # evaluate the predictions
    r2_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='tmax',
            metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)
    
    rmse_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='tmax',
            metricName='rmse')
    rmse = rmse_evaluator.evaluate(predictions)

    print('r2 =', r2)
    print('rmse =', rmse)
Example 15
def main(inputs, model_file):
    data = spark.read.csv(inputs, schema=tmax_schema)
    data.registerTempTable('yesterday')
    #wthr_query = """SELECT  dayofyear(date) as dayofyr, latitude, longitude, elevation,tmax  FROM __THIS__"""
    wthr_query = """SELECT dayofyear(today.date) as dayofyr,today.latitude, today.longitude, today.elevation, today.tmax, yesterday.tmax as yesterday_tmax FROM __THIS__ as today INNER JOIN __THIS__ as yesterday ON date_sub(today.date, 1) = yesterday.date AND today.station = yesterday.station"""
    
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    #define the assembler and regressor
    assembler = VectorAssembler(inputCols=["latitude", "longitude", "elevation", "dayofyr" ], outputCol="features")
    regressor = RandomForestRegressor(maxDepth=10, minInstancesPerNode=2, minInfoGain=0.5, labelCol = "tmax")
    trans_query = SQLTransformer(statement = wthr_query)
    
    #define pipeline and model
    wthr_pipeline = Pipeline(stages=[trans_query, assembler, regressor])
    wthr_model = wthr_pipeline.fit(train)
 
    #define the regression evaluator
    evaluator = RegressionEvaluator(labelCol="tmax", predictionCol="prediction")
    predictions = wthr_model.transform(validation)
    err = evaluator.evaluate(predictions)
    wthr_model.write().overwrite().save(model_file)
    print('Root Mean Square Error(rmse) : ' + str(err))
Example 16
def test_simple_csv_loader_pipeline(spark_session: SparkSession) -> None:
    # Arrange
    data_dir: Path = Path(__file__).parent.joinpath('./')
    flights_path: str = f"file://{data_dir.joinpath('flights.csv')}"

    schema = StructType([])

    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema)

    # noinspection SqlDialectInspection,SqlNoDataSourceInspection
    spark_session.sql("DROP TABLE IF EXISTS default.flights")

    # Act
    # parameters = Dict[str, Any]({
    # })

    stages: List[Union[Estimator, Transformer]] = [
        FrameworkCsvLoader(
            view="flights",
            path_to_csv=flights_path
        ),
        SQLTransformer(statement="SELECT * FROM flights"),
    ]

    pipeline: Pipeline = Pipeline(stages=stages)

    transformer = pipeline.fit(df)
    result_df: DataFrame = transformer.transform(df)

    # Assert
    result_df.show()

    assert result_df.count() > 0
Example 17
def get_glm_explain_stages(predictions_view: str,
                           coefficients_view: str,
                           label_column: str,
                           family: str = 'tweedie',
                           link: str = 'identity',
                           variance_power: float = 0.0,
                           link_power: float = 1.0) -> List:
    link_function_type = resolve_link_function(family, link, link_power)
    stages = [
        OneHotDecoder(oheSuffix="_OHE",
                      idxSuffix="_IDX",
                      unknownSuffix="Unknown"),
        SQLTransformer(
            statement=
            f"CREATE OR REPLACE TEMPORARY VIEW {predictions_view} AS SELECT * from __THIS__"
        ),
        GLMExplainTransformer(predictionView=predictions_view,
                              coefficientView=coefficients_view,
                              linkFunctionType=link_function_type,
                              label=label_column,
                              nested=True,
                              calculateSum=True,
                              family=family,
                              variancePower=variance_power,
                              linkPower=link_power)
    ]
    return stages
Example 18
def main(inputs):
    data = spark.read.csv(inputs, schema=colour_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()
    
    # TODO: create a pipeline to predict RGB colours -> word
    rgb_assembler = VectorAssembler(inputCols=['R','G','B'], outputCol='features')
    word_indexer = StringIndexer(inputCol='word', outputCol='label')
    classifier = MultilayerPerceptronClassifier(layers=[3, 30, 11])
    rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    rgb_model = rgb_pipeline.fit(train)

    # TODO: create an evaluator and score the validation data
    predictions = rgb_model.transform(validation)
    rgb_evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='f1')
    score = rgb_evaluator.evaluate(predictions)
    plot_predictions(rgb_model, 'RGB', labelCol='word')
    print('Validation score for RGB model: %g' % (score, ))
    
    rgb_to_lab_query = rgb2lab_query(passthrough_columns=['word'])

    
    # TODO: create a pipeline RGB colours -> LAB colours -> word; train and evaluate.
    lab = SQLTransformer(statement=rgb_to_lab_query)
    lab_assembler = VectorAssembler(inputCols=['labL','labA','labB'], outputCol='features')
    lab_pipeline = Pipeline(stages=[lab, lab_assembler, word_indexer, classifier])
    lab_model = lab_pipeline.fit(train)
    plot_predictions(lab_model, 'LAB', labelCol='word')
    lab_predictions = lab_model.transform(validation)
    lab_score = rgb_evaluator.evaluate(lab_predictions)
    print('Validation score for LAB model:', lab_score)
Example 19
def model_train(input,model_path):
    tmax_schema = types.StructType([
        types.StructField('station', types.StringType()),
        types.StructField('date', types.DateType()),
        types.StructField('latitude', types.FloatType()),
        types.StructField('longitude', types.FloatType()),
        types.StructField('elevation', types.FloatType()),
        types.StructField('tmax', types.FloatType()),
    ])
    data = spark.read.csv(input,schema= tmax_schema)
    train, validation = data.randomSplit([0.75,0.25])
    train = train.cache()
    validation = validation.cache()

    sql_query = """SELECT today.latitude, today.longitude, today.elevation, dayofyear(today.date) AS dy,yesterday.tmax AS yesterday_tmax, today.tmax
                     FROM __THIS__ as today
               INNER JOIN __THIS__ as yesterday
                       ON date_sub(today.date, 1) = yesterday.date
                      AND today.station = yesterday.station"""
    transformer = SQLTransformer(statement=sql_query)
    assemble_features = VectorAssembler(inputCols=['latitude','longitude','elevation','dy','yesterday_tmax'],outputCol='features')
    regressor = DecisionTreeRegressor(featuresCol='features',labelCol='tmax')
    weather_pipeline = Pipeline(stages=[transformer,assemble_features,regressor])
    model = weather_pipeline.fit(train)
    model.write().overwrite().save(model_path)

    prediction = model.transform(validation)
    #Scoring the model
    evaluator = RegressionEvaluator(predictionCol='prediction',labelCol='tmax',metricName='rmse')
    score = evaluator.evaluate(prediction)
    print("Score of the weather model is",score)
Example 20
def add_data_cleaner():
    '''
    OUTPUT:
    stages - (list) list of transformers to be used as the 'stages' argument
        of the pyspark Pipeline() constructor

    DESCRIPTION:
    This is a subroutine of the create_preprocess_pipeline() function.
    Stages added by this function clean the raw pyspark dataframe for the
        next steps.
    '''
    stages = []  # pipeline stage list

    # filter rows with userId==Null or sessionId==Null, just in case
    sqlTrans = SQLTransformer(statement="SELECT *\
                              FROM __THIS__\
                              WHERE userId IS NOT NULL\
                              AND sessionId IS NOT NULL")
    stages.append(sqlTrans)

    # drop empty user id row
    sqlTrans = SQLTransformer(statement="SELECT *\
                              FROM __THIS__\
                              WHERE userId != ''")
    stages.append(sqlTrans)

    # drop 'Logged-Out' state and 'Guest' state
    sqlTrans = SQLTransformer(statement="SELECT *\
                              FROM __THIS__\
                              WHERE auth != 'Logged Out' AND auth != 'Guest'")
    stages.append(sqlTrans)

    # exclude rows with user who has only one song play or less
    sqlTrans = SQLTransformer(statement=" \
        SELECT * \
        FROM __THIS__ \
        WHERE userId NOT IN ( \
            SELECT DISTINCT userId \
            FROM \
            (SELECT userId, page, \
                COUNT(CASE WHEN page = 'NextSong' THEN page END) \
                OVER(PARTITION BY userId) AS songCount \
            FROM __THIS__) AS user_page_count \
            WHERE user_page_count.songCount < 2)")
    stages.append(sqlTrans)

    return stages
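
A minimal sketch of how the returned stages could be assembled, assuming create_preprocess_pipeline() simply wraps them in a Pipeline and raw_df is the raw event-log DataFrame:

from pyspark.ml import Pipeline

cleaning_stages = add_data_cleaner()
cleaning_pipeline = Pipeline(stages=cleaning_stages)
cleaned_df = cleaning_pipeline.fit(raw_df).transform(raw_df)  # raw_df: assumed input DataFrame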
Example 21
def construct_pipeline():
    feature_columns = [
        "season",
        "yr",
        "mnth",
        "holiday",
        "weekday",
        "workingday",
        "weathersit",
        "temp",
        "atemp",
        "hum",
        "windspeed"
    ]

    sql_transformer = SQLTransformer(statement="""
        SELECT
            cast(season as int),
            cast(yr as int),
            cast(mnth as int),
            cast(holiday as int),
            cast(weekday as int),
            cast(workingday as int),
            cast(weathersit as int),
            cast(temp as double),
            cast(atemp as double),
            cast(hum as double),
            cast(windspeed as double),
            cast(cnt as int) as label
        FROM __THIS__
    """)

    assembler = VectorAssembler().setInputCols(feature_columns[:-1]).setOutputCol("features")

    feature_label_tf = SQLTransformer(statement="""
        SELECT features, label
        FROM __THIS__
    """)

    pipeline = Pipeline(stages=[
        sql_transformer,
        assembler,
        feature_label_tf,
        LinearRegression()
    ])

    return pipeline
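
An illustrative call, assuming the bike-sharing CSV (e.g. day.csv from the UCI dataset) has been loaded into a DataFrame named df:

df = spark.read.csv('day.csv', header=True, inferSchema=True)  # path is an assumption
train, test = df.randomSplit([0.8, 0.2], seed=42)
model = construct_pipeline().fit(train)
model.transform(test).select('features', 'label', 'prediction').show(5)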
def main(input, model_file):
    tmax_schema = types.StructType([
        types.StructField('station', types.StringType()),
        types.StructField('date', types.DateType()),
        types.StructField('latitude', types.FloatType()),
        types.StructField('longitude', types.FloatType()),
        types.StructField('elevation', types.FloatType()),
        types.StructField('tmax', types.FloatType()),
    ])
    data = spark.read.csv(input, schema=tmax_schema)
    train, validation = data.randomSplit([0.75, 0.25], seed=123)
    train = train.cache()
    validation = validation.cache()
    y_tmax = SQLTransformer(
        statement=
        "SELECT today.station,today.latitude,today.longitude,today.elevation,today.date,today.tmax,yesterday.tmax AS yesterday_tmax FROM __THIS__ as today INNER JOIN __THIS__ as yesterday ON date_sub(today.date, 1) = yesterday.date AND today.station = yesterday.station"
    )
    getvalues = SQLTransformer(
        statement=
        "SELECT station,latitude,longitude,elevation,dayofyear(date) AS dayofyear,tmax,yesterday_tmax from __THIS__"
    )

    assemble_features = VectorAssembler(inputCols=[
        'latitude', 'longitude', 'elevation', 'dayofyear', 'yesterday_tmax'
    ],
                                        outputCol='features')
    classifier = GBTRegressor(featuresCol='features', labelCol='tmax')
    pipeline = Pipeline(
        stages=[y_tmax, getvalues, assemble_features, classifier])

    model = pipeline.fit(train)
    predictions = model.transform(validation)

    r2_evaluator = RegressionEvaluator(predictionCol='prediction',
                                       labelCol='tmax',
                                       metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)
    print('-----------------------------------')
    print('r2: %g' % (r2, ))
    print('-----------------------------------')
    rmse_evaluator = RegressionEvaluator(predictionCol='prediction',
                                         labelCol='tmax',
                                         metricName='rmse')
    rmse = rmse_evaluator.evaluate(predictions)
    print('rmse: %g' % (rmse, ))
    model.write().overwrite().save(model_file)
Example 23
def main(inputs):
    data = spark.read.csv(inputs, header=True, schema=colour_schema)
    lab_query = rgb2lab_query(passthrough_columns=['labelword'])

    # TODO: actually build the components for the pipelines, and the pipelines.
    indexer = StringIndexer(inputCol="labelword",
                            outputCol="labelCol",
                            handleInvalid='error')

    rgb_assembler = VectorAssembler(inputCols=['R', 'G', 'B'],
                                    outputCol="features")
    lab_assembler = VectorAssembler(inputCols=['lL', 'lA', 'lB'],
                                    outputCol="features")

    forest = RandomForestClassifier(numTrees=22,
                                    maxDepth=10,
                                    labelCol="labelCol",
                                    seed=42)
    mlp = MultilayerPerceptronClassifier(maxIter=400,
                                         layers=[3, 16, 11],
                                         blockSize=1,
                                         seed=123,
                                         labelCol="labelCol")

    sqlTrans = SQLTransformer(statement=lab_query)

    models = [
        ('RGB-forest', Pipeline(stages=[indexer, rgb_assembler, forest])),
        ('LAB-forest',
         Pipeline(stages=[sqlTrans, indexer, lab_assembler, forest])),
        ('RGB-MLP', Pipeline(stages=[indexer, rgb_assembler, mlp])),
        ('LAB-MLP', Pipeline(stages=[sqlTrans, indexer, lab_assembler, mlp])),
    ]

    # TODO: need an evaluator
    evaluator = MulticlassClassificationEvaluator(labelCol="labelCol",
                                                  predictionCol="prediction")

    # TODO: split data into training and testing
    train, test = data.randomSplit([0.75, 0.25])
    train = train.cache()
    test = test.cache()
    score_dict = dict()
    for label, pipeline in models:
        # TODO: fit the pipeline to create a model
        model = pipeline.fit(train)

        # Output a visual representation of the predictions we're
        # making: uncomment when you have a model working
        plot_predictions(model, label)

        # TODO: predict on the test data
        predictions = model.transform(test)

        # calculate a score
        score = evaluator.evaluate(predictions)
        score_dict[label] = score
    return score_dict
def main(inputs):

    # Read the CSV File
    df = spark.read.csv(inputs, schema=colour_schema)

    # Total label count
    label_num = df.select('word').distinct().count()

    # Split the dataset. Make 75% as training set and the remaining 25% as validation set
    train, validation = df.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    # Creating pipeline
    rgb_assembler = VectorAssembler(inputCols=["R", "G", "B"],
                                    outputCol="features")
    word_indexer = StringIndexer(inputCol="word",
                                 outputCol="label",
                                 handleInvalid="error")
    classifier_mpc = MultilayerPerceptronClassifier(layers=[3, 250, label_num])

    # Transformer for the lab pipeline
    rgb_to_lab_query = rgb2lab_query(passthrough_columns=['word'])
    sqlTrans = SQLTransformer(statement=rgb_to_lab_query)
    lab_assembler = VectorAssembler(inputCols=["labL", "labA", "labB"],
                                    outputCol="features")

    # TODO: create a pipeline to predict RGB colours -> word; train and evaluate.

    #  pipeline to predict RGB colours
    rgb_pipeline = Pipeline(
        stages=[rgb_assembler, word_indexer, classifier_mpc])
    lab_pipeline = Pipeline(
        stages=[sqlTrans, lab_assembler, word_indexer, classifier_mpc])

    # Train the model
    rgb_model = rgb_pipeline.fit(train)
    lab_model = lab_pipeline.fit(train)

    # Transform the validation set
    predictions_rgb = rgb_model.transform(validation)
    predictions_lab = lab_model.transform(validation)

    # TODO: create an evaluator and score the validation data

    # Create a Multiclass Classification Evaluator
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")

    # Evaluate it on validation data
    score_rgb = evaluator.evaluate(predictions_rgb)
    score_lab = evaluator.evaluate(predictions_lab)

    plot_predictions(rgb_model, 'RGB', labelCol='word')
    plot_predictions(lab_model, 'LAB', labelCol='word')

    # Print the validation scores
    print('Validation score for RGB model: %g' % (score_rgb, ))
    print('Validation score for LAB model: %g' % (score_lab, ))
def main(inputs, outputs):
    data = spark.read.csv(inputs, schema=tmax_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()
    query1 = """SELECT *,  dayofyear( date ) AS day
                FROM __THIS__ """
    # TODO: create a pipeline to predict Latitude, Longtitude, Elevation point -> tmax

    query = """SELECT today.station as station ,
                    today.latitude as latitude,
                    today.longitude as longitude,
                    today.elevation as elevation, 
                    dayofyear( today.date ) AS day, 
                    today.tmax, 
                    yesterday.tmax AS yesterday_tmax 
                FROM __THIS__ as today 
                    INNER JOIN __THIS__ as yesterday 
                        ON date_sub(today.date, 1) = yesterday.date AND today.station = yesterday.station"""
    sqlTrans = SQLTransformer(statement=query)

    # note: 'tmax' is the label, so it must not be included among the features
    lle_assembler = VectorAssembler(inputCols=[
        "latitude", "longitude", "elevation", "day", "yesterday_tmax"
    ],
                                    outputCol="features")
    tmax_indexer = StringIndexer(inputCol="station",
                                 outputCol="label",
                                 handleInvalid='error')
    regressor = GBTRegressor(featuresCol='features',
                             labelCol='tmax',
                             maxIter=100)

    pipeline = Pipeline(
        stages=[sqlTrans, lle_assembler, tmax_indexer, regressor])
    model = pipeline.fit(train)

    # TODO: create an evaluator and score the validation
    predictions = model.transform(validation)
    r2_evaluator = RegressionEvaluator(predictionCol='prediction',
                                       labelCol='tmax',
                                       metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)
    rmse_evaluator = RegressionEvaluator(predictionCol='prediction',
                                         labelCol='tmax',
                                         metricName='rmse')
    rmse = rmse_evaluator.evaluate(predictions)
    model.write().overwrite().save(outputs)

    print('r2 =', r2)
    print('rmse =', rmse)
Example 26
def add_label_maker(stages):
    '''
    INPUT:
    stages - (list) list of transformers to be used as the 'stages' argument
        of the pyspark Pipeline() constructor.
        It should be an output of the 'create_data_cleaner()' function.

    OUTPUT:
    stages - (list) list of transformers to be used as the 'stages' argument
        of the pyspark Pipeline() constructor

    DESCRIPTION:
    This is a subroutine of the create_preprocess_pipeline() function.
    Stages added by this function make the label column in the target pyspark
        dataframe.
    It also drops the rows that the label column directly depends on.
    '''
    # 'churn_event'
    # add a column to store churn event as integer
    sqlTrans = SQLTransformer(statement=" \
        SELECT *, \
            CASE WHEN page = 'Cancellation Confirmation'\
            THEN 1 ELSE 0 END AS churn_event \
        FROM __THIS__")
    stages.append(sqlTrans)

    # 'Churn'
    # add a column to store cumulative sum of churn flag
    sqlTrans = SQLTransformer(statement=" \
        SELECT *, \
            MAX(churn_event) OVER ( \
                PARTITION BY userId \
            ) AS Churn \
        FROM __THIS__")
    stages.append(sqlTrans)

    return stages
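
A sketch of chaining the cleaning and labelling helpers, under the assumption that they live in the same preprocessing module and event_log_df is the raw input DataFrame:

from pyspark.ml import Pipeline

stages = add_label_maker(add_data_cleaner())
labeled_df = Pipeline(stages=stages).fit(event_log_df).transform(event_log_df)  # event_log_df assumed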
Example 27
def deriveNewMethod(df):
    from pyspark.ml.feature import SQLTransformer

    # drop rows with null values
    df = df.filter(df['area'].isNotNull())
    df = df.filter(df['price'].isNotNull())
    df = df.filter(df['room_num'].isNotNull())

    #df = df.filter(df['area'] !='NULL')
    #df = df.filter(df['price'] !='NULL')
    #df = df.filter(df['room_num'] !='NULL')

    # drop rows where the value is 0
    df = df.filter(df['area'] != 0)
    df = df.filter(df['room_num'] != 0)

    df = df.select('*',
                   df['area'].cast('Float').alias('tmp_name')).drop('area')
    df = df.withColumnRenamed('tmp_name', 'area')
    df = df.select('*',
                   df['price'].cast('Float').alias('tmp_name')).drop('price')
    df = df.withColumnRenamed('tmp_name', 'price')
    df = df.select(
        '*', df['room_num'].cast('Float').alias('tmp_name')).drop('room_num')
    df = df.withColumnRenamed('tmp_name', 'room_num')

    print(df.dtypes)

    sqlTransform = SQLTransformer(
        statement=
        'SELECT *,(area/room_num) AS one_room_area, (price/area) AS one_area_price FROM __THIS__'
    )
    df = sqlTransform.transform(df)

    # spark.stop()

    return df
Example 28
def main(inputs, model_file):
    
    # get the data
    test_tmax = spark.read.csv(inputs, schema=tmax_schema)
    train, validation = test_tmax.randomSplit([0.75, 0.25])

    # with yesterday feature, the code is as following:
    sql_query = 'SELECT today.latitude as latitude, today.longitude as longitude, today.elevation as elevation, dayofyear(today.date) as dayofyear, today.tmax as tmax, yesterday.tmax AS y_tmax FROM __THIS__ as today INNER JOIN __THIS__ as yesterday ON date_sub(today.date, 1) = yesterday.date AND today.station = yesterday.station'
    sql_transformer = SQLTransformer(statement=sql_query)
  
  
    assembler = VectorAssembler(inputCols=['latitude', 'longitude', 'elevation', 'dayofyear', 'y_tmax'], outputCol='features')
    classifier = GBTRegressor(featuresCol='features', labelCol='tmax')
    pipelineModel = Pipeline(stages=[sql_transformer, assembler, classifier])


#     # without yesterday feature, the code is as following:
#     sql_query = 'SELECT today.latitude as latitude, today.longitude as longitude, today.elevation as elevation, dayofyear(today.date) as dayofyear, today.tmax as tmax FROM __THIS__ as today INNER JOIN __THIS__ as yesterday ON date_sub(today.date, 1) = yesterday.date AND today.station = yesterday.station'
#  
#     sql_transformer = SQLTransformer(statement=sql_query)
#     assembler = VectorAssembler(inputCols=['latitude', 'longitude', 'elevation', 'dayofyear'], outputCol='features')
#  
#     classifier = GBTRegressor(featuresCol='features', labelCol='tmax')
#     pipelineModel = Pipeline(stages=[sql_transformer, assembler, classifier])


    # train the model
    model = pipelineModel.fit(train)
   # model = PipelineModel.load(train)
    
    # use the model to make predictions
    predictions = model.transform(validation)
    
    #predictions.show()    
    
    # evaluate the predictions
    r2_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='tmax', metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)
   
    rmse_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='tmax', metricName='rmse')
    rmse = rmse_evaluator.evaluate(predictions)

    print('r2 =', r2)
    print('rmse =', rmse)

    # If you used a regressor that gives .featureImportances, maybe have a look...
    #print(model.stages[-1].featureImportances)

    model.write().overwrite().save(model_file)
Example 29
def main(inputs, output):
    tmax_schema = types.StructType([
        types.StructField('station', types.StringType()),
        types.StructField('date', types.DateType()),
        types.StructField('latitude', types.FloatType()),
        types.StructField('longitude', types.FloatType()),
        types.StructField('elevation', types.FloatType()),
        types.StructField('tmax', types.FloatType()),
    ])
    data = spark.read.csv(inputs, schema=tmax_schema)

    query = "SELECT t.station AS station, t.date AS date, t.day AS day, t.latitude AS latitude, t.longitude AS longitude, t.elevation AS elevation, t.tmax AS tmax, y.tmax AS tmax_yesterday FROM (SELECT station, date, latitude, longitude, elevation, tmax, DAYOFYEAR(date) AS day, date_sub(date,1) AS date_yesterday FROM __THIS__) t, (SELECT station, date, latitude, longitude, elevation, tmax, DAYOFYEAR(date) AS day, date_sub(date,1) AS date_yesterday FROM __THIS__) y WHERE t.date = y.date_yesterday AND t.station = y.station"
    sqlTrans = SQLTransformer(statement=query)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    # train.show()
    validation = validation.cache()
    assembler = VectorAssembler(inputCols=[
        "latitude", "longitude", "elevation", "day", "tmax_yesterday"
    ],
                                outputCol="features")
    classifier = GBTRegressor(featuresCol='features', labelCol='tmax')
    pipeline = Pipeline(stages=[sqlTrans, assembler, classifier])
    model = pipeline.fit(train)
    predictions = model.transform(validation)
    predictions.show()

    r2_evaluator = RegressionEvaluator(predictionCol='prediction',
                                       labelCol='tmax',
                                       metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)

    print("R-square for the validation data is: " + str(r2))
    model.write().overwrite().save(output)

    r2 = r2_evaluator.evaluate(model.transform(train))
    print("R-square for the training data is: " + str(r2))

    print(model.stages[-1].featureImportances)

    sfu_predict = [("sfu", datetime.date(2018, 11,
                                         12), 49.2771, -122.9146, 330.0, 12.0),
                   ("sfu", datetime.date(2018, 11,
                                         13), 49.2771, -122.9146, 330.0, 12.0)]
    sfu_predict_df = spark.createDataFrame(sfu_predict, schema=tmax_schema)
    sfu_predict_df.show()
    sfu_predictions = model.transform(sfu_predict_df).select(
        'station', 'date', 'prediction')
    sfu_predictions.show()
def lab_classify(type,train,validation,query,figName):
    sql_transformer = SQLTransformer(statement = query)
    lab_assembler = VectorAssembler(inputCols=['labL','labA','labB'],outputCol='features')
    word_indexer = StringIndexer(inputCol='word',outputCol='label',stringOrderType='alphabetAsc')
    if (type == "MLPC"):
        classifier = MultilayerPerceptronClassifier(layers=[3, 25, 25],seed=42)
    elif (type == "LogReg"):
        classifier = LogisticRegression()
    lab_pipe = Pipeline(stages=[sql_transformer,lab_assembler, word_indexer, classifier])
    lab_model = lab_pipe.fit(train)
    predictions = lab_model.transform(validation)
    evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='accuracy')
    score = evaluator.evaluate(predictions)
    plot_predictions(lab_model, 'LAB_'+figName, labelCol='word')
    return score
Example 31
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

from pyspark import SparkContext
# $example on$
from pyspark.ml.feature import SQLTransformer
# $example off$
from pyspark.sql import SQLContext

if __name__ == "__main__":
    sc = SparkContext(appName="SQLTransformerExample")
    sqlContext = SQLContext(sc)

    # $example on$
    df = sqlContext.createDataFrame([
        (0, 1.0, 3.0),
        (2, 2.0, 5.0)
    ], ["id", "v1", "v2"])
    sqlTrans = SQLTransformer(
        statement="SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
    sqlTrans.transform(df).show()
    # $example off$

    sc.stop()
# COMMAND ----------

from pyspark.ml.feature import RFormula

supervised = RFormula(formula="lab ~ . + color:value1 + color:value2")
supervised.fit(simpleDF).transform(simpleDF).show()


# COMMAND ----------

from pyspark.ml.feature import SQLTransformer

basicTransformation = SQLTransformer()\
  .setStatement("""
    SELECT sum(Quantity), count(*), CustomerID
    FROM __THIS__
    GROUP BY CustomerID
  """)

basicTransformation.transform(sales).show()


# COMMAND ----------

from pyspark.ml.feature import VectorAssembler
va = VectorAssembler().setInputCols(["int1", "int2", "int3"])
va.transform(fakeIntDF).show()


# COMMAND ----------