def transform_input(input_text):
    """Build a one-row DataFrame from ``input_text`` and featurize it.

    The text is lower-cased, trimmed and stripped of every character that is
    not an ASCII letter, digit or space, then tokenized.  Empty tokens are
    removed before the row is passed through a stop-word remover, the
    module-level ``w2v_model`` and a ``Normalizer`` that reads ``w2v`` and
    writes ``w2v_norm``.

    Returns the transformed DataFrame.
    """

    def _strip_punctuation(raw):
        # Normalize case/whitespace first, then drop all other characters.
        return re.sub('[^0-9a-zA-Z ]', '', raw.lower().strip())

    def _drop_empty_tokens(tokens):
        return [tok for tok in tokens if tok != '']

    strip_punctuation_udf = udf(_strip_punctuation, StringType())
    drop_empty_tokens_udf = udf(_drop_empty_tokens, ArrayType(StringType()))

    single_row = spark.createDataFrame([(input_text, )], ['text'])
    cleaned = single_row.withColumn('text_noPunct',
                                    strip_punctuation_udf('text'))

    word_splitter = Tokenizer(inputCol='text_noPunct', outputCol='token_text')
    tokenized = word_splitter.transform(cleaned)
    tokenized = tokenized.withColumn('token_text',
                                     drop_empty_tokens_udf('token_text'))

    # NOTE(review): the Normalizer reads 'w2v', presumably the output column
    # of w2v_model -- confirm against where w2v_model is defined.
    feature_stages = (
        StopWordsRemover(inputCol='token_text', outputCol='stop_token'),
        w2v_model,
        Normalizer(inputCol='w2v', outputCol='w2v_norm'),
    )
    return PipelineModel(stages=feature_stages).transform(tokenized)
def process(rdd):
    """Score one batch of records with the saved pipeline and print metrics.

    Each element of ``rdd`` is indexed with ``[1]`` to get a string.  The
    third ``:``-separated field of that string is split on ``,`` into four
    float measurements (``sl``, ``sw``, ``pl``, ``pw``) and a string label.
    The resulting DataFrame is shown, transformed by the pipeline stored at
    ``gs://suryasuresh/lab8output``, and accuracy, F1, weighted recall and
    weighted precision are printed.

    Batches with no records are skipped and the function returns ``None``.
    """
    # createDataFrame infers the schema from the data, which fails on an
    # empty RDD; a batch with nothing in it has nothing to score.
    if rdd.isEmpty():
        return

    spark = getSparkSessionInstance(rdd.context.getConf())

    payloads = rdd.map(lambda record: record[1])
    fields = payloads.map(lambda text: text.split(':')[2].split(','))

    # The first field has one leading character and the label has four
    # trailing characters that are sliced off before conversion.
    rows = fields.map(lambda x: Row(sl=float(x[0][1:]),
                                    sw=float(x[1]),
                                    pl=float(x[2]),
                                    pw=float(x[3]),
                                    stringlabel=x[4][:-4]))
    features = spark.createDataFrame(rows)
    features.show()

    # load() is a classmethod that restores all stages from storage, so no
    # placeholder stage instances have to be constructed first.
    pipe = PipelineModel.load('gs://suryasuresh/lab8output')
    result = pipe.transform(features)

    f1score = MulticlassClassificationEvaluator(metricName='f1')
    precision = MulticlassClassificationEvaluator(metricName='weightedPrecision')
    recall = MulticlassClassificationEvaluator(metricName='weightedRecall')
    accuracy = MulticlassClassificationEvaluator(metricName='accuracy')

    # NOTE(review): attribute access on a DataFrame resolves column names;
    # confirm the pipeline output really has a `values` column, otherwise
    # this line raises AttributeError.
    print(result.values)
    print("Accuracy:\t", accuracy.evaluate(result),
          "\nF1score:\t", f1score.evaluate(result),
          "\nWeighted Recall:\t", recall.evaluate(result),
          "\nWeighted Precision:\t", precision.evaluate(result))
def process_data(df: DataFrame, ml_model: PipelineModel = model) -> DataFrame:
    """Prepare ``df``, score it with ``ml_model`` and convert it for Kafka.

    Applies, in order: ``convert_types_for_ml``, ``convert_heroes_to_lineup``,
    ``ml_model.transform`` and ``convert_types_for_kafka``.  The default
    ``ml_model`` is the module-level ``model``.
    """
    lineup_df = convert_heroes_to_lineup(convert_types_for_ml(df))
    scored_df = ml_model.transform(lineup_df)
    return convert_types_for_kafka(scored_df)
def process_data(df: DataFrame, ml_model: PipelineModel = model) -> DataFrame:
    """Prepare ``df``, score it with ``ml_model`` and convert it for ES.

    Applies, in order: ``convert_types_for_ml``, ``convert_heroes_to_lineup``,
    ``ml_model.transform`` and ``convert_types_for_es``.  Only the
    ``probability_arr``, ``radiant_win_prediction`` and ``match_seq_num``
    columns are returned.  The default ``ml_model`` is the module-level
    ``model``.
    """
    lineup_df = convert_heroes_to_lineup(convert_types_for_ml(df))
    scored_df = convert_types_for_es(ml_model.transform(lineup_df))
    output_columns = ("probability_arr", "radiant_win_prediction",
                      "match_seq_num")
    return scored_df.select(*output_columns)
def predicate(self, featurizer_name, classifier, test_df):
    """Featurize ``test_df`` images and run ``classifier`` on the result.

    A ``DeepImageFeaturizer`` using the model named ``featurizer_name`` reads
    the ``image`` column and writes ``features``; ``classifier`` is applied
    as the second stage.  Returns the transformed DataFrame.
    """
    stages = [
        DeepImageFeaturizer(inputCol="image",
                            outputCol="features",
                            modelName=featurizer_name),
        classifier,
    ]
    return PipelineModel(stages=stages).transform(test_df)
class Pipe(Transformer):
    """Transformer that runs a fixed list of transformers as one unit.

    The given stages are wrapped in a ``PipelineModel`` and every
    ``transform`` call is delegated to it.  No condition is evaluated here:
    the same stages always run.
    """

    def __init__(self, stages: List[Transformer]):
        """Wrap ``stages`` in an internal ``PipelineModel``."""
        super().__init__()
        # Copy the stages so that later mutation of the caller's list cannot
        # silently change what this transformer runs.
        self._pipeline = PipelineModel(list(stages))

    def _transform(self, dataset: DataFrame) -> DataFrame:
        """Apply the wrapped pipeline to ``dataset`` and return the result."""
        return self._pipeline.transform(dataset)
def get_predictions():
    """Score the test data with the best stored model.

    Returns:
        A ``(df_pred, s3_name)`` tuple.  ``df_pred`` holds whichever of the
        columns ``flight_number_reporting_airline``, ``prediction``,
        ``distance``, ``s3_name`` and ``fecha`` are present; ``s3_name`` is
        the value returned by ``get_best_model()``.
    """
    s3_name = get_best_model()
    model = get_model(s3_name)
    df = get_data(test=True)
    df, stage_pca, first_stages = rebuild_pipeline(s3_name, df)
    print("Modelo evaluado: ", model, "con params: ", model.explainParams())

    # The PCA stage is fitted on the output of the first stages before the
    # full pipeline is assembled.
    df_assem = first_stages.transform(df)
    model_pca = stage_pca.fit(df_assem)

    pipeline = PipelineModel(stages=[first_stages, model_pca, model])
    prediction = pipeline.transform(df)

    # Other available outputs: 'rawPrediction', 'probability'.
    vars_pred = [
        'dayofmonth', 'prediction', 'distance',
        'flight_number_reporting_airline'
    ]
    df_pred = prediction.select(
        [c for c in prediction.columns if c in vars_pred])
    df_pred = df_pred.withColumn('s3_name', lit(s3_name))

    # Zero-pad every single-digit day (1-9) so 'fecha' always has a two-digit
    # day; the previous `< 9` bound left day 9 unpadded ("2020029").
    df_pred = df_pred.withColumn(
        'auxi',
        f.when(f.col('dayofmonth') < 10, "0").otherwise(""))
    # Year and month are hard-coded literals here.
    df_pred = df_pred.withColumn(
        'fecha',
        concat(lit("2020"), lit("02"), col('auxi'), col('dayofmonth')))

    vars_pred = [
        'flight_number_reporting_airline', 'prediction', 'distance',
        's3_name', 'fecha'
    ]
    df_pred = df_pred.select([c for c in df_pred.columns if c in vars_pred])
    return df_pred, s3_name
.setSubscriptionKey(TEXT_API_KEY)\
    .setOutputCol("sentiment")

# Extract the `sentiment` field of the first element of the API response
# column and expose it as `sentimentLabel`.
getSentiment = SQLTransformer(
    statement="SELECT *, sentiment[0].sentiment as sentimentLabel FROM __THIS__"
)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Tying it all together
# MAGIC
# MAGIC Now that we have built the stages of our pipeline, it's time to chain them together into a single model that can be used to process batches of incoming data.
# MAGIC
# MAGIC <img src="https://mmlspark.blob.core.windows.net/graphics/Cog%20Service%20NB/full_pipe_2.jpg" width="800" style="float: center;"/>

# COMMAND ----------

from mmlspark.stages import SelectColumns

# Select the final columns
cleanupColumns = SelectColumns().setCols(
    ["url", "firstCeleb", "text", "sentimentLabel"])

# Chain every stage, ending with the column clean-up, into one model.
celebrityQuoteAnalysis = PipelineModel(stages=[
    bingSearch, getUrls, celebs, firstCeleb, recognizeText, getText,
    sentimentTransformer, getSentiment, cleanupColumns
])

# Run the whole pipeline on the search parameters and print five rows.
celebrityQuoteAnalysis.transform(bingParameters).show(5)
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
import sparkdl as dl
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml import Pipeline, PipelineModel

# Spark entry points for the "image_testset" application, run against YARN.
conf = SparkConf().setAppName("image_testset").setMaster("yarn")
sc = SparkContext(conf=conf)
sql_sc = SQLContext(sc)

# Scoring pipeline: InceptionV3 image features followed by the
# logistic-regression model previously saved at hdfs:///lr.
featurizer_test = dl.DeepImageFeaturizer(
    inputCol="image",
    outputCol="features",
    modelName="InceptionV3",
)
lr_test = LogisticRegressionModel.load('hdfs:///lr')
p_lr_test = PipelineModel(stages=[featurizer_test, lr_test])

# Load the test images and preview ten rows.
image_path = "hdfs:///project_data/pets/test_images/"
image_DF = dl.readImages(image_path)
image_DF.show(10)

# Score every image, then display a 10% sample drawn without replacement.
tested_lr_test = p_lr_test.transform(image_DF)
tested_lr_test.sample(withReplacement=False, fraction=0.1).show()
.setTextCol("checkin_comment")\
    .setUrl("https://{}.api.cognitive.microsoft.com/text/analytics/v3.0/sentiment".format(cognitive_location))\
    .setSubscriptionKey(TEXT_API_KEY)\
    .setOutputCol("sentiment")

# Extract the `sentiment` field of the first element of the API response
# column and expose it as `sentimentLabel`.
getSentiment = SQLTransformer(
    statement="SELECT *, sentiment[0].sentiment as sentimentLabel FROM __THIS__"
)

# COMMAND ----------

# Chain the API call and the label extraction, then render the result of
# running them over the `facts` table.
celebrityQuoteAnalysis = PipelineModel(
    stages=[sentimentTransformer, getSentiment])
display(celebrityQuoteAnalysis.transform(spark.table('facts')))

# COMMAND ----------

# NOTE(review): this builds a TextSentiment transformer but points it at the
# keyPhrases endpoint -- confirm that is intended.
keyPhrasesTransformer = TextSentiment()\
    .setTextCol("description")\
    .setUrl("https://{}.api.cognitive.microsoft.com/text/analytics/v3.0/keyPhrases".format(cognitive_location))\
    .setSubscriptionKey(TEXT_API_KEY)\
    .setOutputCol("keyPhrases")

# Label extraction is disabled for the key-phrase pipeline:
# getSentiment = SQLTransformer(statement="SELECT *, sentiment[0].sentiment as sentimentLabel FROM __THIS__")

# COMMAND ----------

# Single-stage model wrapping the key-phrase transformer.
keyPhrasesAnalysis = PipelineModel(stages=[keyPhrasesTransformer])
# Transformer that sends `checkin_comment` to the sentiment endpoint and
# stores the response in `sentiment`.
sentimentTransformer = (
    TextSentiment()
    .setTextCol("checkin_comment")
    .setUrl("https://{}.api.cognitive.microsoft.com/text/analytics/v3.0/sentiment".format(cognitive_location))
    .setSubscriptionKey(TEXT_API_KEY)
    .setOutputCol("sentiment")
)

# Label extraction is not needed when capturing the raw response:
# getSentiment = SQLTransformer(statement="SELECT *, sentiment[0].sentiment as sentimentLabel FROM __THIS__")

# COMMAND ----------

commentSentimentAnalysis = PipelineModel(stages=[sentimentTransformer])

# Score only the comment and its id, then discard the comment text.
df_checkin_comments = df.select('checkin_comment', 'checkin_id')
df_sentiment = (
    commentSentimentAnalysis
    .transform(df_checkin_comments)
    .drop('checkin_comment')
)

# The second remaining column is renamed to a fixed name.
# NOTE(review): presumably this is the transformer's generated error
# column -- the lookup is positional, so confirm the column order.
error_column_name = df_sentiment.columns[1]
df_sentiment_renamed = df_sentiment.withColumnRenamed(
    error_column_name, 'TextSentiment_error')
df = df_sentiment_renamed

# COMMAND ----------

# Output locations derived from `base_path` and the current `date`.
sentiment_raw_path = base_path + 'raw/sentiment/{}/{}/{}/untappd.json'.format(
    date.year, date.month, date.day)
sentiment_raw_delta_path = base_path + 'raw/sentiment/delta'
sentiment_query_path = base_path + 'query/sentiment'

# COMMAND ----------

# df_sentiment_renamed.write.format('delta').mode("append").save(sentiment_raw_delta_path)
# Read the two image sets and label them: 1 for "/b" (benign), 0 for "/m"
# (malignant).
b_df = dl.readImages(img_dir + "/b" + m).withColumn("label", lit(1))
m_df = dl.readImages(img_dir + "/m" + m).withColumn("label", lit(0))

# Split each class 80% / 20% into training and test with a fixed seed.
split_weights = [80.00, 20.00]
trainb, testb = b_df.randomSplit(split_weights, seed=42)
trainm, testm = m_df.randomSplit(split_weights, seed=42)

# Combine the benign and malignant parts into one training and one test set.
trainDF = trainb.unionAll(trainm)
testDF = testb.unionAll(testm)

# Scoring pipeline: InceptionV3 features from an existing model followed by
# the logistic-regression model saved under './test-' + m.
featurizer_test = dl.DeepImageFeaturizer(
    inputCol="image",
    outputCol="features",
    modelName="InceptionV3",
)
lr_test = LogisticRegressionModel.load('./test-' + m)
p_lr_test = PipelineModel(stages=[featurizer_test, lr_test])

# Score the test set and report accuracy.
tested_lr_test = p_lr_test.transform(testDF)
evaluator_lr_test = MulticlassClassificationEvaluator(metricName="accuracy")
scored_labels = tested_lr_test.select("prediction", "label")
print("Logistic Regression Model: Test set accuracy = " +
      str(evaluator_lr_test.evaluate(scored_labels)))

# Show twenty untruncated rows of labels, probabilities and predictions.
tested_lr_test.select("label", "probability", "prediction").show(
    20, truncate=False)
def score_model(data: pyspark.sql.DataFrame,
                model: PipelineModel) -> pyspark.sql.DataFrame:
    """Return the result of applying ``model.transform`` to ``data``."""
    return model.transform(data)
TEXT_API_KEY = cognitive_key
# VISION_API_KEY = os.environ["VISION_API_KEY"]
# BING_IMAGE_SEARCH_KEY = os.environ["BING_IMAGE_SEARCH_KEY"]

# COMMAND ----------

# MAGIC %sql
# MAGIC SELECT * from facts

# COMMAND ----------

# Transformer that sends `checkin_comment` to the sentiment endpoint and
# stores the response in `sentiment`.
sentimentTransformer = (
    TextSentiment()
    .setTextCol("checkin_comment")
    .setUrl("https://{}.api.cognitive.microsoft.com/text/analytics/v3.0/sentiment".format(cognitive_location))
    .setSubscriptionKey(TEXT_API_KEY)
    .setOutputCol("sentiment")
)

# Expose the `sentiment` field of the first response element as
# `sentimentLabel`.
getSentiment = SQLTransformer(
    statement="SELECT *, sentiment[0].sentiment as sentimentLabel FROM __THIS__"
)

# COMMAND ----------

# Chain both stages and render the result of running them over `facts`.
celebrityQuoteAnalysis = PipelineModel(
    stages=[sentimentTransformer, getSentiment])
facts_with_sentiment = celebrityQuoteAnalysis.transform(spark.table('facts'))
display(facts_with_sentiment)

# COMMAND ----------
# NOTE(review): the next two statements are the tail of a definition that
# begins outside this view; they are kept verbatim.
img_rescaled = resizeimage.resize_cover(new_im, [width, width])
img_rescaled.save("{}/rescaled/{}".format(root, img))


if __name__ == "__main__":
    sc = SparkContext()
    # First element of the stored object; indexed below by the integer
    # prediction to obtain the text written for each image.
    img_dic = joblib.load("dictionary.pkl")[0]

    # Scoring pipeline: InceptionV3 features followed by the saved
    # logistic-regression model.
    featurizer = DeepImageFeaturizer(inputCol="image",
                                     outputCol="features",
                                     modelName="InceptionV3")
    lr = LogisticRegressionModel.load('./lrModel')
    p_model = PipelineModel(stages=[featurizer, lr])

    # Rescale the images in ./media into ./media/rescaled and score them.
    directory = "./media"
    rescaled_dir = "{}/rescaled".format(directory)
    rescale_image(directory, rescaled_dir)
    temp_df = ImageSchema.readImages(rescaled_dir)
    df = p_model.transform(temp_df)

    # "w" creates the file when it is missing and truncates it otherwise
    # ("r+" raised when the file did not exist); the context manager closes
    # the handle even if a row fails to format.
    with open("predict_output.txt", "w") as f:
        for row in df.select(['image', 'prediction']).collect():
            # row[0][0] is split on '/' and its last component (the file
            # name) is paired with the entry for the integer prediction.
            line = "{} = {}".format(row[0][0].split('/')[-1],
                                    img_dic[int(row[1])])
            print(line)
            f.write(line + "\n")

    # Remove the temporary rescaled copies.
    shutil.rmtree(rescaled_dir)

# spark-submit --packages databricks:spark-deep-learning:1.5.0-spark2.4-s_2.11 predict.py
def run_pipeline(df: pyspark.sql.DataFrame, pipeline: PipelineModel) -> int:
    """Count rows whose ``imputed_age`` is null after ``pipeline`` is applied.

    ``df`` is transformed by ``pipeline`` and the number of rows with a null
    ``imputed_age`` column is returned.
    """
    transformed = pipeline.transform(df)
    age_is_missing = transformed["imputed_age"].isNull()
    return transformed.where(age_is_missing).count()
trainingDF.show(10)
print("show over")

# Training pipeline: InceptionV3 image features followed by logistic
# regression.
vectorizer = DeepImageFeaturizer(
    inputCol="image",
    outputCol="features",
    modelName='InceptionV3',
)
logreg = LogisticRegression(
    maxIter=30,
    regParam=0.05,
    elasticNetParam=0.3,
    labelCol="label",
    featuresCol="features",
)
pipeline = Pipeline(stages=[vectorizer, logreg])
lrModel = pipeline_model = pipeline.fit(trainingDF)

# Persist only the second fitted stage (the logistic-regression model).
fitted_logreg = lrModel.stages[1]
fitted_logreg.write().overwrite().save('hdfs:///lr')

from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Reload the saved model and pair it with a fresh featurizer that reuses the
# trained features of an existing model.
lr_test = LogisticRegressionModel.load('hdfs:///lr')
featurizer_test = dl.DeepImageFeaturizer(
    inputCol="image",
    outputCol="features",
    modelName="InceptionV3",
)
p_lr_test = PipelineModel(stages=[featurizer_test, lr_test])

# Score the validation set and report accuracy.
tested_lr_test = p_lr_test.transform(validationDF)
evaluator_lr_test = MulticlassClassificationEvaluator(metricName="accuracy")
scored_labels = tested_lr_test.select("prediction", "label")
print("Logistic Regression Model: Test set accuracy = " +
      str(evaluator_lr_test.evaluate(scored_labels)))
tested_lr_test.select("label", "probability", "prediction").show(10)