def process(spark, train_data, test_data):
    df_train = spark.read.parquet(train_data)
    df_test = spark.read.parquet(test_data)

    features = VectorAssembler(inputCols=df_train.columns[1:-1], outputCol='features')
    evaluator = RegressionEvaluator(labelCol='ctr', predictionCol='prediction', metricName='rmse')

    lr_model_base = LinearRegression(labelCol='ctr', **LR_PARAMS_BASE)
    lr_model_to_tune = LinearRegression(labelCol='ctr')

    lr_param_grid = ParamGridBuilder() \
        .addGrid(lr_model_to_tune.maxIter, [5, 10, 20, 40, 50]) \
        .addGrid(lr_model_to_tune.regParam, [0.4, 0.1, 0.01, 0.001]) \
        .addGrid(lr_model_to_tune.fitIntercept, [False, True]) \
        .addGrid(lr_model_to_tune.elasticNetParam, [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]) \
        .build()

    tvs = TrainValidationSplit(estimator=lr_model_to_tune,
                               estimatorParamMaps=lr_param_grid,
                               evaluator=evaluator,
                               trainRatio=0.8)

    pipeline_model_base = Pipeline(stages=[features, lr_model_base]).fit(df_train)
    prediction_base = pipeline_model_base.transform(df_test)
    rmse_base = evaluator.evaluate(prediction_base)
    print(f'Base lr model params: {LR_PARAMS_BASE}')
    print(f'RMSE at base lr model = {rmse_base}')

    print('Tuning lr model...')
    pipeline_model_tuned = Pipeline(stages=[features, tvs]).fit(df_train)
    prediction_tuned = pipeline_model_tuned.transform(df_test)
    rmse_tuned = evaluator.evaluate(prediction_tuned)

    model_java_obj = pipeline_model_tuned.stages[-1].bestModel._java_obj
    lr_params_tuned = {
        'maxIter': model_java_obj.getMaxIter(),
        'regParam': model_java_obj.getRegParam(),
        'elasticNetParam': model_java_obj.getElasticNetParam(),
        'fitIntercept': model_java_obj.getFitIntercept()
    }
    print(f'Tuned lr model params: {lr_params_tuned}')
    print(f'RMSE at tuned lr model = {rmse_tuned}')

    if rmse_tuned < rmse_base:
        pipeline_model_tuned.write().overwrite().save(MODEL_PATH)
        print('Tuned model has better RMSE value')
    else:
        pipeline_model_base.write().overwrite().save(MODEL_PATH)
        print('Base model has better RMSE value')
    print(f'Model saved at "{MODEL_PATH}"')

    spark.stop()
def transform_data_in_pipeline(df):
    """
    Assemble the raw feature columns of a Spark dataframe and scale them.

    :param df: input Spark dataframe with the raw feature columns
    :return: dataframe with the additional transformed features vector
    """
    # Initialise pipeline variables
    stages = []
    assembler_inputs = []

    # Assemble features vector from Spark dataframe fields
    assembler = VectorAssembler(
        inputCols=['x', 'y', 'star_rating_number', 'avg_adr'],
        outputCol='features')
    stages += [assembler]
    assembler_inputs += [assembler.getOutputCol()]

    # Apply standard scaling to unit standard deviation
    scaler = StandardScaler(inputCol=assembler.getOutputCol(),
                            outputCol='scaledFeatures')
    stages += [scaler]
    assembler_inputs += [scaler.getOutputCol()]

    # Execute the pipeline
    pipeline_model = Pipeline() \
        .setStages(stages) \
        .fit(df)

    # Return the dataframe with the additional transformed features vector
    return pipeline_model.transform(df)
def date_conversion():
    df = sql.read.csv("./run/date_test_res.csv", inferSchema=True, header=True)
    datetime_formatting = DatetimeFormatting()
    model = Pipeline(stages=[datetime_formatting]).fit(df)
    res = model.transform(df)
    print("resulted_df")
    res.show()
def remove_skewness():
    df = sql.read.csv("./run/file1.csv", inferSchema=True, header=True)
    min_skewness = MinimizeSkewness(['Purpose'])
    model = Pipeline(stages=[min_skewness]).fit(df)
    res = model.transform(df)
    res.show()
def process_df(df):
    time_seq.append(['start process-df', time.time()])
    model = Pipeline(stages=[
        RegexTokenizer(pattern=" ", inputCol="instruments",
                       outputCol="instruments_tokenized", minTokenLength=1),
        NGram(n=1, inputCol="instruments_tokenized", outputCol="instruments_ngrams"),
        HashingTF(inputCol="instruments_ngrams", outputCol="instruments_vectors"),
        MinHashLSH(inputCol="instruments_vectors", outputCol="instruments_lsh",
                   numHashTables=10)
    ]).fit(df)
    df_hashed = model.transform(df)
    df_matches = model.stages[-1].approxSimilarityJoin(df_hashed, df_hashed, 0.5, distCol="distance") \
        .filter("datasetA.filename != datasetB.filename AND datasetA.filename < datasetB.filename") \
        .select(f.col('datasetA.filename').alias('filename_A'),
                f.col('datasetB.filename').alias('filename_B'),
                f.col('distance'))
    time_seq.append(['process-df df_matches', time.time()])
    write_df_to_pgsql(df_matches, 'filepair_similarity_run3')
    time_seq.append(['write pgsql', time.time()])
    print('time_seq', time_seq)
def main(sc, spark):
    # Load and vectorize the corpus
    corpus = load_corpus(sc, spark)
    vector = make_vectorizer().fit(corpus)

    # Index the labels of the classification
    labelIndex = StringIndexer(inputCol="label", outputCol="indexedLabel")
    labelIndex = labelIndex.fit(corpus)

    # Split the data into training and test sets
    training, test = corpus.randomSplit([0.8, 0.2])

    # Create the classifier
    clf = LogisticRegression(
        maxIter=10, regParam=0.3, elasticNetParam=0.8,
        family="multinomial", labelCol="indexedLabel", featuresCol="tfidf")

    # Create the model
    model = Pipeline(stages=[vector, labelIndex, clf]).fit(training)

    # Make predictions
    predictions = model.transform(test)
    predictions.select("prediction", "indexedLabel", "tfidf").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    lrModel = model.stages[2]
    print(lrModel)  # summary only
def train_als(ratings_data, split_prop, max_iter, reg_param, rank, cold_start_strategy):
    seed = 42
    spark = pyspark.sql.SparkSession.builder.getOrCreate()

    ratings_df = spark.read.parquet(ratings_data)
    (training_df, test_df) = ratings_df.randomSplit([split_prop, 1 - split_prop], seed=seed)
    training_df.cache()
    test_df.cache()

    mlflow.log_metric("training_nrows", training_df.count())
    mlflow.log_metric("test_nrows", test_df.count())
    print("Training: {0}, test: {1}".format(training_df.count(), test_df.count()))

    als = (
        ALS()
        .setUserCol("userId")
        .setItemCol("movieId")
        .setRatingCol("rating")
        .setPredictionCol("predictions")
        .setMaxIter(max_iter)
        .setSeed(seed)
        .setRegParam(reg_param)
        .setColdStartStrategy(cold_start_strategy)
        .setRank(rank)
    )

    als_model = Pipeline(stages=[als]).fit(training_df)

    reg_eval = RegressionEvaluator(predictionCol="predictions", labelCol="rating", metricName="mse")
    predicted_test_df = als_model.transform(test_df)
    test_mse = reg_eval.evaluate(predicted_test_df)
    train_mse = reg_eval.evaluate(als_model.transform(training_df))

    print("The model had an MSE on the test set of {0}".format(test_mse))
    print("The model had an MSE on the training set of {0}".format(train_mse))
    mlflow.log_metric("test_mse", test_mse)
    mlflow.log_metric("train_mse", train_mse)
    mlflow.spark.log_model(als_model, "als-model")
def remove_url_duplication():
    df = sql.read.csv("./run/date_test_res.csv", inferSchema=True, header=True)
    url_duplication = RemovingDuplicationUrl()
    model = Pipeline(stages=[url_duplication]).fit(df)
    result = model.transform(df)
    result.toPandas().to_csv('./run/pipeline_url.csv')
    print("resulted_df")
    result.show()
def main():
    input_dataset = sys.argv[1]
    output_dir = sys.argv[2]
    start_time = time.time()

    # stackoverflow_df = sqlContext.read.csv("../Datasource/stackOverFlow_ID_Title_SMALL.csv", header=True).toDF('id', 'text')
    stackoverflow_df = sqlContext.read.csv(input_dataset, header=True).toDF('id', 'text')
    # stackoverflow_df.show()
    # stackoverflow_df.head(10).show()
    # stack_df = stack_rdd.toDF(['id', 'text'])
    # stackoverflow_df.printSchema()

    model = Pipeline(stages=[
        RegexTokenizer(pattern="", inputCol="text", outputCol="tokens", minTokenLength=1),
        NGram(n=3, inputCol="tokens", outputCol="ngrams"),
        HashingTF(inputCol="ngrams", outputCol="vectors"),
        MinHashLSH(inputCol="vectors", outputCol="lsh")
        # MinHashLSH(inputCol="vectors", outputCol="lsh", numHashTables=5)
    ]).fit(stackoverflow_df)

    db_hashed = model.transform(stackoverflow_df)
    # db_hashed.show()
    # query_hashed = model.transform(query)
    # query_hashed.show()

    # res = model.stages[-1].approxSimilarityJoin(db_hashed, db_hashed, 0.90).filter("datasetA.id < datasetB.id")
    res = model.stages[-1].approxSimilarityJoin(db_hashed, db_hashed, 0.70).filter("distCol > 0")
    # print(res)
    # print(res.count())
    res.show()

    elapsed_time = time.time() - start_time
    print('Elapsed Time ==> ', elapsed_time)
def model(self, pandas, column):
    rdd = self.sqlctx.createDataFrame(pandas.astype(str))
    model = Pipeline(stages=[
        Tokenizer(inputCol=column, outputCol="tokens"),
        StopWordsRemover(inputCol='tokens', outputCol="tokens_stop", stopWords=self.STOP_WORDS),
        HashingTF(inputCol="tokens_stop", outputCol="vectors")
    ]).fit(rdd)
    db_1 = model.transform(rdd)
    db_1.cache()
    return db_1
def test_serialize_to_bundle(self):
    string_map = StringMap(
        labels={'a': 1.0},
        inputCol='key_col',
        outputCol='value_col',
    )
    pipeline = Pipeline(stages=[string_map]).fit(self.input)
    serialization_dataset = pipeline.transform(self.input)
    jar_file_path = _serialize_to_file(pipeline, serialization_dataset)
    deserialized_pipeline = _deserialize_from_file(jar_file_path)
    result = deserialized_pipeline.transform(self.input)
    expected = StringMapTest.spark.createDataFrame([['a', 'b', 1.0]], OUTPUT_SCHEMA)
    assert_df(expected, result)
def columns_same_value():
    try:
        df = sql.read.csv("./run/rem_test.csv", inferSchema=True, header=True)
        columns_with_same_val = ColumnsDroppingSameValue()
        model = Pipeline(stages=[columns_with_same_val]).fit(df)
        result = model.transform(df)
        result.toPandas().to_csv('./run/pipeline_same_value.csv')
        df.show()
        print("#####################")
        print("resulted_df")
        result.show()
    except Exception as e:
        logger.error(e)
def Indexer(spark, train_address, val_address, tst_address, repartition_size=10000):
    df_train = spark.read.parquet(train_address)
    df_val = spark.read.parquet(val_address)
    df_test = spark.read.parquet(tst_address)

    # user_indexer = StringIndexer(inputCol="user_id", outputCol="user_id_numeric").fit(df_train)
    # track_indexer = StringIndexer(inputCol="track_id", outputCol="track_id_numeric").fit(df_train.union(df_val))
    # df_train = user_indexer.transform(df_train)
    # df_train = track_indexer.transform(df_train)
    # df_val = user_indexer.transform(df_val)
    # df_val = track_indexer.transform(df_val)
    # df_train = df_train.select("user_id_numeric", "track_id_numeric", "count")
    # df_val = df_val.select("user_id_numeric", "track_id_numeric", "count")
    # df_test = user_indexer.transform(df_test)
    # df_test = track_indexer.transform(df_test)

    user_indexer = StringIndexer(inputCol="user_id", outputCol="user_id_numeric")
    track_indexer = StringIndexer(inputCol="track_id", outputCol="track_id_numeric")

    # fit on df_train only
    model = Pipeline(stages=[user_indexer, track_indexer]).fit(df_train)
    df_train, df_val, df_test = [
        model.transform(x) for x in (df_train, df_val, df_test)
    ]
    df_train = df_train.select("user_id_numeric", "track_id_numeric", "count")
    df_val = df_val.select("user_id_numeric", "track_id_numeric", "count")
    df_test = df_test.select("user_id_numeric", "track_id_numeric", "count")

    # df_train = df_train.repartition(repartition_size, "user_id_numeric", "track_id_numeric")
    # df_val = df_val.repartition(repartition_size, "user_id_numeric", "track_id_numeric")
    # df_test = df_test.repartition(repartition_size, "user_id_numeric", "track_id_numeric")

    # df_train.write.parquet("./train_formatted.parquet", mode='overwrite')
    # df_val.write.parquet("./val_formatted.parquet", mode='overwrite')
    # df_test.write.parquet("./test_formatted.parquet", mode='overwrite')

    print('Indexer succeeded.')
    return df_train, df_val, df_test
def main():
    potential_clones = sys.argv[1]
    outDir = sys.argv[2]
    start_time = time.time()

    potential_clones = '../Datasource/pc.xml'
    output_csv = 'csvCodes.csv'
    df = convertAndSaveAsCSV(potential_clones, output_csv, True)

    # spark context
    sc = SparkContext.getOrCreate()
    sqlContext = SQLContext(sc)
    spark_df = sqlContext.createDataFrame(df)
    transformed_spark_df = spark_df.rdd.map(distributedSourceTransform)
    pysparkdf_transformedClones = transformed_spark_df.toDF(
        ['filepath', 'startline', 'endline', 'source'])
    # pysparkdf_transformedClones.show()

    model = Pipeline(stages=[
        RegexTokenizer(pattern=" ", inputCol="source", outputCol="tokens", minTokenLength=1),
        NGram(n=3, inputCol="tokens", outputCol="ngrams"),
        HashingTF(inputCol="ngrams", outputCol="vectors", numFeatures=262144),
        MinHashLSH(inputCol="vectors", outputCol="lsh", numHashTables=105)
        # MinHashLSH(inputCol="vectors", outputCol="lsh", numHashTables=5)
    ]).fit(pysparkdf_transformedClones)

    hashed_clones = model.transform(pysparkdf_transformedClones)
    clone_pairs = model.stages[-1].approxSimilarityJoin(
        hashed_clones, hashed_clones, 0.70).filter("distCol > 0")
    clone_pairs.show()

    elapsed_time = time.time() - start_time
    print('Elapsed Time ==> ', elapsed_time)
def main(argv):
    spark = SparkSession.builder \
        .master("local[*]") \
        .config("spark.driver.memory", "4g") \
        .config("spark.executor.memory", "1g") \
        .getOrCreate()

    features_df = ParquetDataFrame(f'data/processed/{Phase.train.name}/features', spark)

    test_data_frac = 0.1
    test_features_df, train_features_df = features_df.randomSplit(
        [test_data_frac, 1 - test_data_frac])

    label_col = 'duration_min'
    model = Pipeline(stages=[
        StringIndexer(inputCol='pickup_cell_6', handleInvalid='keep',
                      outputCol='pickup_cell_6_idx'),
        StringIndexer(inputCol='dropoff_cell_6', handleInvalid='keep',
                      outputCol='dropoff_cell_6_idx'),
        VectorAssembler(inputCols=[
            'pickup_cell_6_idx', 'dropoff_cell_6_idx', 'distance', 'month',
            'day_of_month', 'day_of_week', 'hour', 'requests_pickup_cell',
            'requests_dropoff_cell'
        ], outputCol="features"),
        DecisionTreeRegressor(maxDepth=7, featuresCol='features', labelCol=label_col)
    ]).fit(train_features_df)

    model_path = 'model/trip_duration_min'
    print(f'Saving model to {model_path}')
    model.write().overwrite().save(model_path)
    print('Model saved...')

    model = PipelineModel.load(model_path)
    predictions_df = model.transform(test_features_df)
    mae_cv = RegressionEvaluator(labelCol=label_col,
                                 metricName='mae').evaluate(predictions_df)
    print(f'Mean absolute error: {mae_cv}')

    spark.stop()
def remove_cols_containing_nan():
    try:
        logger.debug("this is debug")
        df = sql.read.csv("./run/column_rem.csv", inferSchema=True, header=True)
        col_contains_nan = ColumnsDroppingContainsNan()
        model = Pipeline(stages=[col_contains_nan]).fit(df)
        result = model.transform(df)
        result.toPandas().to_csv('./run/pipeline_nan_value.csv')
        df.show()
        print("#####################")
        print("resulted_df")
        result.show()
    except Exception as e:
        logger.error(e)
sw_filter = StopWordsRemover()\
    .setStopWords(stop_words)\
    .setCaseSensitive(False)\
    .setInputCol("words")\
    .setOutputCol("filtered")

from pyspark.ml.feature import CountVectorizer

# we will remove words that appear in fewer than 5 docs
cv = CountVectorizer(minTF=1., minDF=5., vocabSize=2**17)\
    .setInputCol("filtered")\
    .setOutputCol("tf")

# we now create a pipelined transformer
cv_pipeline = Pipeline(stages=[tokenizer, sw_filter, cv]).fit(review)
cv_pipeline.transform(review).show(5)

from pyspark.ml.feature import IDF

idf = IDF().\
    setInputCol('tf').\
    setOutputCol('tfidf')

idf_pipeline = Pipeline(stages=[cv_pipeline, idf]).fit(review)
tfidf_df = idf_pipeline.transform(review)
tfidf_df.show(10)

# training_df, validation_df, testing_df = review.randomSplit([0.6, 0.3, 0.1], seed=0)
# [training_df.count(), validation_df.count(), testing_df.count()]
# A linear regression object
regression = LinearRegression(labelCol='duration')

--------------------------------------------------
# Exercise_2

# Import class for creating a pipeline
from pyspark.ml import Pipeline

# Construct a pipeline
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])

# Train the pipeline on the training data
pipeline = pipeline.fit(flights_train)

# Make predictions on the testing data
predictions = pipeline.transform(flights_test)

--------------------------------------------------
# Exercise_3

from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

# Break text into tokens at non-word characters
tokenizer = Tokenizer(inputCol='text', outputCol='words')

# Remove stop words
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='terms')

# Apply the hashing trick and transform to TF-IDF
hasher = HashingTF(inputCol=remover.getOutputCol(), outputCol="hash")
idf = IDF(inputCol=hasher.getOutputCol(), outputCol="features")
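# A possible continuation (not part of the original exercise): the text stages above can
# be chained into a single pipeline with a classifier, mirroring Exercise_2. The
# LogisticRegression stage and the `sms_train`/`sms_test` dataframes are assumptions
# made for illustration only.
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

text_pipeline = Pipeline(stages=[
    tokenizer,   # text -> words
    remover,     # words -> terms (stop words removed)
    hasher,      # terms -> hashed term frequencies
    idf,         # term frequencies -> TF-IDF features
    LogisticRegression(labelCol='label', featuresCol='features')  # assumed label column
])
# model = text_pipeline.fit(sms_train)        # assumed training dataframe
# predictions = model.transform(sms_test)     # assumed test dataframe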
def Indexer(spark, train_address, val_address, tst_address):
    beg = time()
    df_train = spark.read.parquet(train_address)
    df_val = spark.read.parquet(val_address)
    df_test = spark.read.parquet(tst_address)
    print('File Reading Finished')

    # subsample
    # here we include the last 110K users (for validation/testing)
    # want to include all 110K users at the end and randomly draw 10% others
    subsample_frac = 0.1
    all_user_ids = [
        row['user_id']
        for row in df_train.select('user_id').distinct().collect()
    ]
    val_user_ids = [
        row['user_id']
        for row in df_val.select('user_id').distinct().collect()
    ]
    test_user_ids = [
        row['user_id']
        for row in df_test.select('user_id').distinct().collect()
    ]
    train_user_ids = list(
        set(all_user_ids) - set(val_user_ids) - set(test_user_ids))
    selected_train_ids = sample(train_user_ids,
                                round(len(train_user_ids) * 0.2))
    # >>> len(all_user_ids)
    # 1129318
    # >>> len(val_user_ids)
    # 10000
    # >>> len(test_user_ids)
    # 100000
    # >>> len(train_user_ids)
    # 1019318
    df_train = df_train.where(
        df_train.user_id.isin(selected_train_ids + val_user_ids + test_user_ids))
    print('Sampling Finished')

    user_indexer = StringIndexer(inputCol="user_id", outputCol="user_id_numeric")
    track_indexer = StringIndexer(inputCol="track_id", outputCol="track_id_numeric")
    model = Pipeline(stages=[user_indexer, track_indexer]).fit(
        df_train.union(df_val).union(df_test))
    df_train, df_val, df_test = [
        model.transform(x) for x in (df_train, df_val, df_test)
    ]
    df_train = df_train.select("user_id_numeric", "track_id_numeric", "count")
    df_val = df_val.select("user_id_numeric", "track_id_numeric", "count")
    df_test = df_test.select("user_id_numeric", "track_id_numeric", "count")
    print('Formatting Finished')

    df_train.write.parquet("./train_formatted.parquet", mode='overwrite')
    df_val.write.parquet("./val_formatted.parquet", mode='overwrite')
    df_test.write.parquet("./test_formatted.parquet", mode='overwrite')

    end = time()
    print('Indexer and Subsampler succeeded. Took %f s' % (end - beg))
    return
assembler = VectorAssembler(inputCols=['weight_kg', 'cyl', 'type_dummy'],
                            outputCol='features')

# Split the data into training and testing sets
kars_train, kars_test = kars.randomSplit([0.8, 0.2], seed=23)

# Fit a linear regression model to the training data
regression = LinearRegression(labelCol='consumption')

# Combine steps into a pipeline
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])

# Run fit on the training data
pipeline = pipeline.fit(kars_train)

# Make predictions on the testing data
prediction = pipeline.transform(kars_test)

# Compare predictions to known labels
prediction.groupBy("consumption", 'prediction').count().show(8)

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(pipeline.stages[REGRESSION_STAGE].coefficients))
print("Intercept: %s" % str(pipeline.stages[REGRESSION_STAGE].intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = pipeline.stages[REGRESSION_STAGE].summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

spark.stop()
mlSourceDF = featureeddf
mlSourceDF.printSchema()

mlSourceDF = mlSourceDF.fillna(0, subset=[x for x in mlSourceDF.columns if 'Lag' in x])

# after creating all the lag features, we can drop NA rows on the key columns
# drop NAs to avoid errors in StringIndexer
mlSourceDF = mlSourceDF.na.drop(subset=["ServerIP", "SessionStartHourTime"])

# indexing
columnsForIndex = ['dayofweek', 'ServerIP', 'year', 'month', 'weekofyear',
                   'dayofmonth', 'hourofday', 'Holiday', 'BusinessHour', 'Morning']

mlSourceDF = mlSourceDF.fillna(0, subset=[x for x in columnsForIndex])

sIndexers = [StringIndexer(inputCol=x, outputCol=x + '_indexed').setHandleInvalid("skip")
             for x in columnsForIndex]
indexModel = Pipeline(stages=sIndexers).fit(mlSourceDF)
mlSourceDF = indexModel.transform(mlSourceDF)

# save model for operationalization
indexModel.write().overwrite().save(stringIndexModelFile)

# encoding for categorical features
catVarNames = [x + '_indexed' for x in columnsForIndex]

columnOnlyIndexed = [catVarNames[i] for i in range(0, len(catVarNames))
                     if len(indexModel.stages[i].labels) < 2]
columnForEncode = [catVarNames[i] for i in range(0, len(catVarNames))
                   if len(indexModel.stages[i].labels) >= 2]

info['columnOnlyIndexed'] = columnOnlyIndexed
info['columnForEncode'] = columnForEncode

# save info to blob storage
write_blob(info, infoFile, storageContainer, storageAccount, storageKey)
outputCol="features") ############ Classifiers rfC = RandomForestClassifier(labelCol="Survived", featuresCol="features", numTrees=300, maxDepth=5) gbtC = GBTClassifier(labelCol="Survived", featuresCol="features", maxIter=50) pipeline = Pipeline().setStages([ sex_stringIndexer, age_discretizer, fare_discretizer, embarked_stringIndexer, embarked_encoder, VectorAssembler, rfC ]).fit(train_df) ##### Applying pipeline train_piped = pipeline.transform(train_df) test_piped = pipeline.transform(test_df) ############################################### Feature importances print("\n----------- Feature importances") rfCmodel = pipeline.stages[6] for feature_name, feature_importance in sorted(zip( features_column, rfCmodel.featureImportances), key=lambda x: -x[1]): print("%20s: %s" % (feature_name, feature_importance)) ############################################## Exporting df_predictions = test_piped.select("prediction").toPandas().reset_index() df_predictions['index'] = df_predictions['index'] + 892 df_predictions.columns = ['PassengerId', 'Survived']
tree = DecisionTreeClassifier(labelCol='Survived')
rf = RandomForestClassifier(labelCol='Survived')

# 4. Create pipeline
from pyspark.ml import Pipeline
# pipeline = Pipeline(stages=[indexer, onehot, assembler, tree])
pipeline = Pipeline(stages=[
    title_extractor, indexer1, indexer2, indexer3, onehot, assembler, rf
])

# 5. Fit the model
pipeline = pipeline.fit(passengers_train)

# 6. Make predictions
# from pyspark.ml.evaluation import BinaryClassificationEvaluator
prediction = pipeline.transform(passengers_train)
prediction.show(5)
prediction.select('Survived', 'prediction', 'probability').show(5, False)

# Create a confusion matrix
prediction.groupBy('Survived', 'prediction').count().show()

TP = prediction.filter('Survived == 1 AND prediction == 1').count()
TN = prediction.filter('Survived == 0 AND prediction == 0').count()
FP = prediction.filter('Survived == 0 AND prediction == 1').count()
FN = prediction.filter('Survived == 1 AND prediction == 0').count()

# Compute accuracy
accuracy = (TP + TN) / (TP + TN + FP + FN)
print('Accuracy is %f' % accuracy)
df.show()
df.cache()

from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, NGram, HashingTF, MinHashLSH
import pyspark.sql.functions as f

model = Pipeline(stages=[
    RegexTokenizer(pattern="", inputCol="title", outputCol="tokens", minTokenLength=1),
    NGram(n=3, inputCol="tokens", outputCol="ngrams"),
    HashingTF(inputCol="ngrams", outputCol="vectors"),
    MinHashLSH(inputCol="vectors", outputCol="lsh", numHashTables=10)
]).fit(df)

df_hashed = model.transform(df)
df_matches = model.stages[-1].approxSimilarityJoin(df_hashed, df_hashed, 0.9)

# show all matches (including duplicates)
df_matches.select(f.col('datasetA.id').alias('id_A'),
                  f.col('datasetB.id').alias('id_B'),
                  f.col('distCol')).show()

# show non-duplicate matches
df_matches.select(f.col('datasetA.id').alias('id_A'),
                  f.col('datasetB.id').alias('id_B'),
                  f.col('distCol')).filter('id_A < id_B').show()
rf_pipeline = Pipeline(stages=[va, rf]).fit(training_df)


# In[26]:

from pyspark.ml.evaluation import BinaryClassificationEvaluator


# In[27]:

bce = BinaryClassificationEvaluator()


# In[28]:

bce.evaluate(lr_pipeline.transform(validation_df))


# In[29]:

bce.evaluate(rf_pipeline.transform(validation_df))


# In[30]:

lr_model = lr_pipeline.stages[-1]


# In[31]:

pd.DataFrame(list(zip(airlineCleanDF.columns[12:19], lr_model.coefficients.toArray())),
def train(
    self,
    df: DataFrame,
    params_map: Optional[Dict[str, List[Any]]] = None,
    num_folds: Optional[int] = 10,
    collect_sub_models: Optional[bool] = False,
    return_cv: Optional[bool] = False
) -> Union[PipelineModel, Tuple[PipelineModel, CrossValidatorModel]]:
    """
    Train model.

    Params
    ------
    df: Spark DataFrame
        Input train data
    params_map: Optional[Dict[str, List[Any]]] (default=None)
        Parameters mapping to grid search over
    num_folds: Optional[int] (default=10)
        Number of cross-validation folds
    collect_sub_models: Optional[bool] (default=False)
        Collect models per fold per parameter combination
    return_cv: Optional[bool] (default=False)
        Additionally return the CrossValidatorModel object or not

    Returns
    -------
    best_model: PipelineModel
        The (best) model trained on df.
    cv_model: Optional[CrossValidatorModel]
        The CrossValidatorModel object.
    """
    # get input features
    binary, numeric, categorical = self._get_features(df)

    # convert categorical to numeric labels
    indexed_cols = [f'{c}_idx' for c in categorical]
    indexers = [
        StringIndexer(inputCol=orig_col, outputCol=idx_col)
        for orig_col, idx_col in zip(categorical, indexed_cols)
    ]
    self.features = binary + numeric + indexed_cols
    self.logger.info(f'Final model features list: {self.features}')

    # assemble features into feature vector
    assembler = VectorAssembler(inputCols=self.features,
                                outputCol=self.estimator.getFeaturesCol())
    p = Pipeline(stages=indexers + [assembler]).fit(df)
    self.logger.info('Index and vector assemble features')
    df = p.transform(df)\
        .select(self.estimator.getFeaturesCol(), self.estimator.getLabelCol())

    # if provided, set estimator params map
    if params_map:
        self.params_map = params_map

    # run cross-validation and choose the best set of parameters
    self.logger.info('Start Cross Validation')
    cv_params = {
        'estimator': self.estimator,
        'estimatorParamMaps': self.__params_grid,
        'evaluator': self.evaluator,
        'numFolds': num_folds,
        'collectSubModels': collect_sub_models
    }
    cv_model = CrossValidator(**cv_params).fit(df)

    # set the best model
    p.stages.append(cv_model.bestModel)
    self.best_model = p
    self.logger.info(f'Set the best model with best params: {self.best_params}')

    if return_cv:
        return self.best_model, cv_model
    else:
        return self.best_model
# define numerical assembler first for scaling
numericalAssembler = VectorAssembler(inputCols=numericalColumnsImputed,
                                     outputCol='numerical_cols_imputed')
stages += [numericalAssembler]

# define the standard scaler stage for the numerical columns
scaler = StandardScaler(inputCol='numerical_cols_imputed',
                        outputCol="numerical_cols_imputed_scaled")
stages += [scaler]

# Perform assembly stage to bring together features
assemblerInputs = [c + "classVec" for c in categoricalColumns] + ["numerical_cols_imputed_scaled"]

# features contains everything, one-hot encoded and numerical
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

# define the model stage at the end of the pipeline
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)
stages += [lr]

# Random train test split with seed
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=100)

# Define the entire pipeline, fit it on the train data and transform the test data
clfPipeline = Pipeline().setStages(stages).fit(trainingData)
clfPipeline.transform(testData)
"select * from cours_spark.meteoMensuelle order by 1").cache() modelA = VectorAssembler().\ setInputCols( ['Janvier', 'Fevrier', 'Mars', 'Avril', 'Mai', 'Juin', 'Juillet', 'Aout', 'Septembre', 'Octobre', 'Novembre', 'Decembre']).\ setOutputCol('variables') modelN = StandardScaler().\ setInputCol("variables").\ setOutputCol("vNormalisees").\ setWithStd(True).\ setWithMean(False) modelACP = PCA().\ setInputCol("vNormalisees").\ setOutputCol("vACP").\ setK(2) modelKM = KMeans().setK(7).\ setFeaturesCol("vACP").\ setPredictionCol("vKM") modelPipe = Pipeline(stages=[modelA, modelN, modelACP, modelKM]).fit(donnees) donneesKM = modelPipe.transform(donnees) donneesKM.select("Ville", "vKM").show(5)
class Pipeline:

    def __init__(self, spark, train_date, score_date, train_pct, validate_pct=0.):
        assert (0. <= train_pct <= 1.)
        assert (0. <= validate_pct <= 1.)
        assert (train_pct + validate_pct <= 1.)

        self.spark = spark
        self.train_date = train_date
        self.score_date = score_date
        self.train_pct = train_pct
        self.validate_pct = validate_pct

    def _load_feature_df(self, feature_date, training):
        phase = 'training' if training else 'scoring'
        print('[ ' + str(datetime.utcnow()) + ' ] : Loading ' + phase + ' feature data')

        table_suffix = 't365d' if training else 'scoring'
        features_df = self.spark.sql('select * from grp_gdoop_clv_db.keep_cdf_final_features_' + table_suffix) \
            .filter(F.col('record_date') == feature_date) \
            .drop('record_date', 'zip_code_cat_x')

        if training:
            target_df = self.spark.sql('select * from grp_gdoop_clv_db.ce_keep_deact_target') \
                .filter(F.col('record_date') == feature_date) \
                .select('consumer_id', 'deactivated')

            final = features_df \
                .join(target_df, features_df.consumer_id == target_df.consumer_id, how='left') \
                .drop(target_df.consumer_id)
            return final
        else:
            return features_df

    def _train_validate_split(self, features_df):
        print('[ ' + str(datetime.utcnow()) + ' ] : Splitting training data into model training and validation data')
        splits = features_df.randomSplit([self.train_pct, self.validate_pct,
                                          1 - self.train_pct - self.validate_pct])
        return splits[0], splits[1]

    def _make_feature_list(self, all_cols, cat_cols, indexers):
        features = list(filter(lambda x: x.endswith('_x') and not x.endswith('_cat_x'), all_cols))
        for i, col in enumerate(cat_cols):
            for label in indexers[i].labels:
                features.extend([col + '_' + re.sub(r'\W+', '_', str(label).strip())])
        self.feature_list = features

    def _one_hot_encode_pl(self, train_raw):
        print('[ ' + str(datetime.utcnow()) + ' ] : Creating feature engineering pipeline')
        all_cols = train_raw.columns
        cat_cols = list(filter(lambda x: x.endswith('_cat_x'), all_cols))
        indexers = [StringIndexer(inputCol=c, outputCol=c.replace('_cat_x', '_index'),
                                  handleInvalid='keep') for c in cat_cols]
        one_hots = [OneHotEncoderEstimator(inputCols=[c.replace('_cat_x', '_index')],
                                           outputCols=[c.replace('_cat_x', '_vec_x')],
                                           handleInvalid='keep', dropLast=False) for c in cat_cols]
        self.one_hot_plm = MLPipeline(stages=indexers + one_hots).fit(train_raw)
        self._make_feature_list(all_cols, cat_cols, self.one_hot_plm.stages[:len(cat_cols)])

    def _assemble_features(self, raw_df, data_type):
        print('[ ' + str(datetime.utcnow()) + ' ] : Feature engineering ' + data_type + ' data')
        cat_cols = list(filter(lambda x: x.endswith('_cat_x'), raw_df.columns))
        df = self.one_hot_plm.transform(raw_df) \
            .drop(*cat_cols)
        features = list(filter(lambda x: x.endswith('_x'), df.columns))
        assembler = VectorAssembler(inputCols=features, outputCol='features', handleInvalid='keep')
        return assembler.transform(df)

    def _training_data(self, validate_model):
        train_features_df = self._load_feature_df(self.train_date, True)
        train_raw, validate_raw = self._train_validate_split(train_features_df)
        train_raw.cache()

        # Create one-hot encoding pipeline that will be applied to all DFs
        self._one_hot_encode_pl(train_raw)

        train_df = self._assemble_features(train_raw, 'model training').cache()
        train_raw.unpersist()

        if validate_model:
            validate_raw.cache()
            validate_df = self._assemble_features(validate_raw, 'model validation').cache()
            validate_raw.unpersist()
        else:
            validate_df = None

        return train_df, validate_df

    def _scoring_data(self):
        score_features_df = self._load_feature_df(self.score_date, False).cache()
        score_df = self._assemble_features(score_features_df, 'scoring').cache()
        score_features_df.unpersist()
        return score_df

    def run(self, validate_model, score_active_users):
        print('\nDATA PIPELINE\n')
        train_df, validate_df = self._training_data(validate_model)

        if score_active_users:
            score_df = self._scoring_data()
        else:
            score_df = None

        return {'training': train_df,
                'validation': validate_df,
                'scoring': score_df,
                'features': self.feature_list}

    def __repr__(self):
        return '<Pipeline(train_date={0}, score_date={1})>'.format(self.train_date, self.score_date)

    def __str__(self):
        return '<Pipeline(train_date={0}, score_date={1})>'.format(self.train_date, self.score_date)
onehot = onehot.fit(cars_train)
cars_train = onehot.transform(cars_train)
cars_train = assemble.transform(cars_train)

# Fit model to training data
regression = regression.fit(cars_train)

# Testing data
cars_test = indexer.transform(cars_test)
cars_test = onehot.transform(cars_test)
cars_test = assemble.transform(cars_test)

# Make predictions on testing data
predictions = regression.transform(cars_test)

# Cars model: Pipeline
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[indexer, onehot, assemble, regression])
pipeline = pipeline.fit(cars_train)
predictions = pipeline.transform(cars_test)

# Cars model: stages
# The LinearRegression object (fourth stage -> index 3)
print(pipeline.stages[3].intercept)
print(pipeline.stages[3].coefficients)

# Convert categorical strings to index values
indexer = StringIndexer(inputCol='org', outputCol='org_idx')

# One-hot encode index values
onehot = OneHotEncoderEstimator(
    inputCols=['org_idx', 'dow'],
    outputCols=['org_dummy', 'dow_dummy']
)

# Assemble predictors into a single column
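# A possible continuation, not part of the original notes: the assembler announced by the
# last comment would typically look like the sketch below. The input column names other
# than the dummy vectors created above are assumptions for illustration only.
from pyspark.ml.feature import VectorAssembler

assemble = VectorAssembler(
    inputCols=['km', 'org_dummy', 'dow_dummy'],  # 'km' is an assumed numeric predictor
    outputCol='features'
)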