from pyspark import SparkContext
from pyspark.sql import SQLContext, SparkSession
from pyspark.ml.feature import StringIndexer, IndexToString

sc = SparkContext("local", "Features - IndexToString")
sqlContext = SQLContext(sc)
spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
    ["id", "category"])

indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
model = indexer.fit(df)
indexed = model.transform(df)
print("Transformed string column '%s' to indexed column '%s'"
      % (indexer.getInputCol(), indexer.getOutputCol()))
indexed.show()
print("StringIndexer will store labels in output column metadata\n")

converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
converted = converter.transform(indexed)
print("Transformed indexed column '%s' back to original string column '%s' using "
      "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
converted.select("id", "categoryIndex", "originalCategory").show()

spark.stop()
def test_list_string(self):
    # Label lists of str/unicode and numpy string arrays should all coerce to ['a', 'b'].
    for labels in [np.array(['a', u'b']), ['a', u'b'], np.array(['a', 'b'])]:
        idx_to_string = IndexToString(labels=labels)
        self.assertListEqual(idx_to_string.getLabels(), ['a', 'b'])
    # Mixed-type label lists are rejected.
    self.assertRaises(TypeError, lambda: IndexToString(labels=['a', 2]))
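# A minimal usage sketch of the behavior tested above: an explicit labels list
# is normalized to a plain list of strings and returned by getLabels()
# (assumes IndexToString is imported from pyspark.ml.feature).
idx_to_string = IndexToString(inputCol="categoryIndex",
                              outputCol="originalCategory",
                              labels=['a', 'b'])
assert idx_to_string.getLabels() == ['a', 'b']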
## creating the pipeline
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx] + [lr])

# Fit the pipeline to training documents.
df = data_filtered.select('trend', 'creation_time', "twid", "text_words")
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)
# dataset.show(5)
# dataset.count()

predictions = model.transform(dataset)
labeler = IndexToString(inputCol="prediction", outputCol="topic",
                        labels=['event', 'sports', 'politics', 'news',
                                'technology', 'business', 'entertainment', 'health'])
# print(predictions)
prediciton_with_label = labeler.transform(predictions)
# prediciton_with_label.show(5)
# print(prediciton_with_label.count())

# ta = data.alias('ta')
# tb = prediciton_with_label.select('trend', 'creation_time', 'twid', 'topic').alias('tb')
prediciton_with_label.write.mode('append').format('org.apache.spark.sql.cassandra') \
    .options(table='tweet', keyspace='graphy').save()
# final_df = ta.join(tb, (ta.twid == tb.twid) & (ta.creation_time == tb.creation_time) & (ta.trend == tb.trend), how="left").select(ta.trend, ta.creation_time, ta.twid, ta.body, ta.location, ta.topic, ta.user, tb.predictedLabel)
# final_df.show()
def fit(self, data):
    '''Fits a classifier. The dataset must contain at least the following
    two columns:
        label: the class labels
        features: feature vector

    Parameters
    ----------
    data (Dataset<Row>): input data

    Returns
    -------
    map with metrics
    '''
    classCount = int(data.select(self.label).distinct().count())

    labelIndexer = StringIndexer() \
        .setInputCol(self.label) \
        .setOutputCol("indexedLabel") \
        .fit(data)

    # Split the data into training and test sets (30% held out for testing)
    splits = data.randomSplit([1.0 - self.testFraction, self.testFraction], self.seed)
    trainingData = splits[0]
    testData = splits[1]

    labels = labelIndexer.labels
    print("\n Class\tTrain\tTest")
    for l in labels:
        print("%s\t%i\t%i" % (
            l,
            trainingData.filter(trainingData[self.label] == l).count(),
            testData.filter(testData[self.label] == l).count()))

    # Set input columns
    self.predictor.setLabelCol("indexedLabel").setFeaturesCol("features")

    # Convert indexed labels back to original labels
    labelConverter = IndexToString() \
        .setInputCol("prediction") \
        .setOutputCol("predictedLabel") \
        .setLabels(labelIndexer.labels)

    # Chain indexer and classifier in a Pipeline
    pipeline = Pipeline().setStages([labelIndexer, self.predictor, labelConverter])

    # Train model. This also runs the indexer.
    model = pipeline.fit(trainingData)

    # Make predictions
    predictions = model.transform(testData).cache()

    # Display some sample predictions
    print(f"\nSample predictions: {str(self.predictor).split('_')[0]}")  # TODO predictor.getClass().getSimpleName()
    predictions.sample(False, 0.1, self.seed).show(25)

    predictions = predictions.withColumnRenamed(self.label, "stringLabel")
    predictions = predictions.withColumnRenamed("indexedLabel", self.label)

    # Collect metrics
    pred = predictions.select("prediction", self.label)

    metrics = OrderedDict()
    metrics["Method"] = str(self.predictor).split('_')[0]

    if classCount == 2:
        b = BinaryClassificationMetrics(pred.rdd)
        metrics["AUC"] = str(b.areaUnderROC)

    m = MulticlassMetrics(pred.rdd)
    metrics["F"] = str(m.weightedFMeasure())
    metrics["Accuracy"] = str(m.accuracy)
    metrics["Precision"] = str(m.weightedPrecision)
    metrics["Recall"] = str(m.weightedRecall)
    metrics["False Positive Rate"] = str(m.weightedFalsePositiveRate)
    metrics["True Positive Rate"] = str(m.weightedTruePositiveRate)
    metrics[""] = f"\nConfusion Matrix\n{labels}\n{m.confusionMatrix()}"
    return metrics
model = indexer.fit(df)
indexed = model.transform(df)
indexed.show()

# # IndexToString
# IndexToString is the counterpart of StringIndexer: it maps a column of label
# indices back to the original string labels.
#
# It is typically used together with StringIndexer: first convert the string
# labels into label indices with StringIndexer, train a model, and then map
# the predicted label indices back to the original string labels. You can also
# supply your own set of labels instead.
#
# First, as in the StringIndexer experiment, we use StringIndexer to read the
# "category" column of the dataset, convert the string labels into label
# indices output to a "categoryIndex" column, and build a new DataFrame.

# In[35]:

converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
converted = converter.transform(indexed)
converted.select("id", "categoryIndex", "originalCategory").show()

# # VectorIndexer
# StringIndexer, introduced above, transforms a single categorical feature.
# When all features are already assembled into one vector and you want to
# transform some of its components, Spark ML provides VectorIndexer to handle
# categorical features inside a vector dataset.
#
# Given the maxCategories hyperparameter, it automatically identifies which
# features are categorical and converts their original values into category
# indices. It decides based on the number of distinct feature values: a
# feature whose number of distinct values does not exceed maxCategories is
# treated as categorical.
#
# In the example below we read a dataset, fit a VectorIndexer model to decide
# which features should be treated as categorical, and convert those features
# into indices. Here maxCategories is set to 10, i.e. only features with fewer
# than 10 distinct values are treated as categorical; the rest are treated as
# continuous:

# In[42]:

from pyspark.ml.feature import VectorIndexer
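# A minimal sketch of the VectorIndexer step just described; the sample file
# path and the existing SparkSession are assumptions for illustration:
data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
vectorIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                              maxCategories=10)
indexerModel = vectorIndexer.fit(data)
indexedData = indexerModel.transform(data)
indexedData.show(5)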
encoder = OneHotEncoderEstimator(
    inputCols=["UniqueCarrierInd", "OriginInd", "DestInd"],
    outputCols=["UniqueCarrierOHE", "OriginOHE", "DestOHE"])

assembler = VectorAssembler(
    inputCols=["Month", "Day", "DayOfWeek", "CRSDepHour",
               "UniqueCarrierOHE", "OriginOHE", "DestOHE"],
    outputCol="features")

classifier = RandomForestClassifier(labelCol='delayCatInd',
                                    featuresCol='features',
                                    numTrees=10,
                                    maxDepth=10,
                                    maxBins=500,
                                    predictionCol="prediction")

labelConv = IndexToString(inputCol="prediction",
                          outputCol="predictedLabel",
                          labels=labelInd.labels)

pipeline = Pipeline(stages=[
    ucInd, oInd, dInd, labelInd, encoder, assembler, classifier, labelConv
])

(train, test) = df2.randomSplit([0.7, 0.3])
model = pipeline.fit(train)
predictions = model.transform(test)
predictions.head()

# ## Model Evaluation
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="delayCatInd",
                                              predictionCol="prediction",
                                              metricName="accuracy")
def main(args):
    fields = [StructField("hashcodefile", StringType(), True),
              StructField("label", StringType(), True),
              StructField("n-grams", ArrayType(StringType(), True), True)]
    schema = StructType(fields)

    fieldsTest = [StructField("hashcodefile", StringType(), True),
                  StructField("n-grams", ArrayType(StringType(), True), True)]
    schemaTest = StructType(fieldsTest)

    ## args[0]: preprocessed training Parquet file of bytes or opcodes
    trainingParque = spark.read.parquet(args[0])
    print("Parquet file read completed")

    # Create N-grams for training and testing.
    # args[1]: number of grams: 1, 2, 3, 4, ..., N (cast in case it arrives as a string)
    ngram = NGram(n=int(args[1]), inputCol="content", outputCol="n-grams")
    ngramDataFrame = ngram.transform(trainingParque).select("hashcodefile", "label", "n-grams")
    ngramRDD = ngramDataFrame.rdd

    # args[2]: preprocessed testing Parquet file of bytes or opcodes
    testingParqueTemp = spark.read.parquet(args[2])
    ngramTestData = NGram(n=int(args[1]), inputCol="content", outputCol="n-grams")
    ngramTestDataFrame = ngramTestData.transform(testingParqueTemp).select("hashcodefile", "n-grams")
    ngramTestDataRDD = ngramTestDataFrame.rdd

    inputNgram = spark.createDataFrame(ngramRDD, schema)
    inputTestNgram = spark.createDataFrame(ngramTestDataRDD, schemaTest)
    print("N-gram completed for testing & training")

    ################################################################################
    # Count Vectorizer for training data set
    ################################################################################
    cv = CountVectorizer(inputCol="n-grams", outputCol="features",
                         vocabSize=1000, minDF=1.0, minTF=2.0)
    model = cv.fit(inputNgram)
    featurizedData = model.transform(inputNgram).select("hashcodefile", "label", "features")
    print("Term frequency completed for training data set")

    ########################################
    # Count Vectorizer for testing data set
    ########################################
    # Note: fitting a separate vectorizer on the test set means the test
    # vocabulary can differ from the training vocabulary.
    cvTest = CountVectorizer(inputCol="n-grams", outputCol="features",
                             vocabSize=1000, minDF=1.0, minTF=2.0)
    modelTest = cvTest.fit(inputTestNgram)
    featurizedTestData = modelTest.transform(inputTestNgram).select("hashcodefile", "features")
    featurizedTestData.write.parquet("opcodeFeaturesTesting.parquet")
    print("Term frequency completed for testing data set")

    ###################################################################
    # Random Forest classifier
    ###################################################################
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(featurizedData)

    # Train a RandomForest model.
    randomforest = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features",
                                          numTrees=600, maxDepth=10)

    # Convert indexed labels back to original labels.
    labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                                   labels=labelIndexer.labels)

    # Chain indexer and forest in a Pipeline.
    pipeline = Pipeline(stages=[labelIndexer, randomforest, labelConverter])

    # Train model. This also runs the indexer.
    model = pipeline.fit(featurizedData)
    predictions = model.transform(featurizedTestData)
    filterPredictions = predictions.select("predictedLabel", "hashcodefile")
    predictionsRDD = filterPredictions.rdd
    predictionsRDD.saveAsTextFile("output.text")
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

# $example on$
from pyspark.ml.feature import IndexToString, StringIndexer
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("IndexToStringExample").getOrCreate()

    # $example on$
    df = spark.createDataFrame(
        [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
        ["id", "category"])

    stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    model = stringIndexer.fit(df)
    indexed = model.transform(df)

    converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
    converted = converter.transform(indexed)

    converted.select("id", "originalCategory").show()
    # $example off$

    spark.stop()
        outputCol=cat_col + 'index',
        handleInvalid='keep').fit(df_data)
    for cat_col in cat_cols
]

# ## categorical target column and use of StringIndexer and IndexToString

# In[11]:

eval_indexer = StringIndexer(inputCol='Eval',
                             outputCol='EvalIndex',
                             handleInvalid='keep').fit(df_data)

# In[12]:

indexer_to_eval = IndexToString(inputCol='prediction',
                                outputCol='Evaluted_Class',
                                labels=eval_indexer.labels)

# ## VectorAssembler and Pipeline and CrossValidator

# In[13]:

feature_set = [cat_col + 'index' for cat_col in cat_cols]

# In[14]:

assembler = VectorAssembler(inputCols=feature_set, outputCol='features')

# In[15]:

random_forest_dt = RandomForestClassifier(featuresCol='features',
def my_transform(rdd):
    # Load the index -> whiskey-name mapping produced during training.
    with open("./index2whiskey1.json", mode="r", encoding="utf-8") as f:
        whiskey_list = list(json.loads(f.read()).values())

    model = ALSModel.load("hdfs://master/ALSModel1/")
    spark = SparkSession.builder.appName('sql coming~').getOrCreate()

    whiskey = rdd.map(lambda x: Row(whiskeyId=int(x[1]), user_name=x[0]))
    whiskey_df = spark.createDataFrame(whiskey)

    predict = model.recommendForItemSubset(whiskey_df, 1)
    df_user = predict.select(
        predict.whiskeyId,
        predict.recommendations[0].userId.alias("userId"),
    )
    df_whiskey = model.recommendForUserSubset(df_user, 5)

    result_df = df_user.join(df_whiskey, on=['userId'], how='left')
    result_df = result_df.join(whiskey_df, on=['whiskeyId'], how='left')
    result_df = result_df.select(
        "user_name",
        result_df["recommendations"][0].whiskeyId.alias("whiskeyId1"),
        result_df["recommendations"][1].whiskeyId.alias("whiskeyId2"),
        result_df["recommendations"][2].whiskeyId.alias("whiskeyId3"),
        result_df["recommendations"][3].whiskeyId.alias("whiskeyId4"),
        result_df["recommendations"][4].whiskeyId.alias("whiskeyId5"),
    )

    # Map each recommended whiskey index back to its original name.
    whiskeyId1converter = IndexToString(inputCol="whiskeyId1", outputCol="whiskey1", labels=whiskey_list)
    whiskeyId2converter = IndexToString(inputCol="whiskeyId2", outputCol="whiskey2", labels=whiskey_list)
    whiskeyId3converter = IndexToString(inputCol="whiskeyId3", outputCol="whiskey3", labels=whiskey_list)
    whiskeyId4converter = IndexToString(inputCol="whiskeyId4", outputCol="whiskey4", labels=whiskey_list)
    whiskeyId5converter = IndexToString(inputCol="whiskeyId5", outputCol="whiskey5", labels=whiskey_list)

    result_df = whiskeyId1converter.transform(result_df)
    result_df = whiskeyId2converter.transform(result_df)
    result_df = whiskeyId3converter.transform(result_df)
    result_df = whiskeyId4converter.transform(result_df)
    result_df = whiskeyId5converter.transform(result_df)
    return result_df.rdd
def Train(self):
    st = time.time()
    categorical_columns = self._dataframe_helper.get_string_columns()
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    result_column = self._dataframe_context.get_result_column()
    categorical_columns = [x for x in categorical_columns if x != result_column]

    model_path = self._dataframe_context.get_model_path()
    pipeline_filepath = model_path + "/LogisticRegression/TrainedModels/pipeline"
    model_filepath = model_path + "/LogisticRegression/TrainedModels/model"
    summary_filepath = model_path + "/LogisticRegression/ModelSummary/summary.json"

    df = self._data_frame
    pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns, categorical_columns, result_column)
    pipelineModel = pipeline.fit(df)
    indexed = pipelineModel.transform(df)
    MLUtils.save_pipeline_or_model(pipelineModel, pipeline_filepath)

    trainingData, validationData = MLUtils.get_training_and_validation_data(indexed, result_column, 0.8)
    OriginalTargetconverter = IndexToString(inputCol="label", outputCol="originalTargetColumn")
    levels = trainingData.select("label").distinct().collect()

    if self._classifier == "lr":
        if len(levels) == 2:
            lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
        elif len(levels) > 2:
            lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8,
                                    family="multinomial")
        fit = lr.fit(trainingData)
    elif self._classifier == "OneVsRest":
        lr = LogisticRegression()
        ovr = OneVsRest(classifier=lr)
        fit = ovr.fit(trainingData)

    transformed = fit.transform(validationData)
    MLUtils.save_pipeline_or_model(fit, model_filepath)
    print(fit.coefficientMatrix)
    print(fit.interceptVector)

    # feature_importance = MLUtils.calculate_sparkml_feature_importance(indexed, fit, categorical_columns, numerical_columns)
    label_classes = transformed.select("label").distinct().collect()
    results = transformed.select(["prediction", "label"])
    if len(label_classes) > 2:
        evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
        evaluator.evaluate(results)
        self._model_summary["model_accuracy"] = evaluator.evaluate(
            results, {evaluator.metricName: "accuracy"})  # accuracy of the model
    else:
        evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
        evaluator.evaluate(results)
        # print(evaluator.evaluate(results, {evaluator.metricName: "areaUnderROC"}))
        # print(evaluator.evaluate(results, {evaluator.metricName: "areaUnderPR"}))
        self._model_summary["model_accuracy"] = evaluator.evaluate(
            results, {evaluator.metricName: "areaUnderPR"})  # accuracy of the model

    # self._model_summary["feature_importance"] = MLUtils.transform_feature_importance(feature_importance)
    self._model_summary["runtime_in_seconds"] = round((time.time() - st), 2)

    transformed = OriginalTargetconverter.transform(transformed)
    label_indexer_dict = [
        dict(enumerate(field.metadata["ml_attr"]["vals"]))
        for field in transformed.schema.fields if field.name == "label"
    ][0]
    prediction_to_levels = udf(lambda x: label_indexer_dict[x], StringType())
    transformed = transformed.withColumn("predictedClass",
                                         prediction_to_levels(transformed.prediction))

    prediction_df = transformed.select(["originalTargetColumn", "predictedClass"]).toPandas()
    objs = {
        "actual": prediction_df["originalTargetColumn"],
        "predicted": prediction_df["predictedClass"]
    }
    self._model_summary["confusion_matrix"] = MLUtils.calculate_confusion_matrix(
        objs["actual"], objs["predicted"])
    overall_precision_recall = MLUtils.calculate_overall_precision_recall(
        objs["actual"], objs["predicted"])
    self._model_summary["precision_recall_stats"] = overall_precision_recall["classwise_stats"]
    self._model_summary["model_precision"] = overall_precision_recall["precision"]
    self._model_summary["model_recall"] = overall_precision_recall["recall"]
    self._model_summary["target_variable"] = result_column
    self._model_summary["test_sample_prediction"] = overall_precision_recall["prediction_split"]
    self._model_summary["algorithm_name"] = "Random Forest"
    self._model_summary["validation_method"] = "Train and Test"
    self._model_summary["independent_variables"] = len(categorical_columns) + len(numerical_columns)
    self._model_summary["level_counts"] = CommonUtils.get_level_count_dict(
        trainingData,
        categorical_columns,
        self._dataframe_context.get_column_separator(),
        dataType="spark")
    # print(json.dumps(self._model_summary, indent=2))
    self._model_summary["total_trees"] = 100
    self._model_summary["total_rules"] = 300
    CommonUtils.write_to_file(summary_filepath,
                              json.dumps({"modelSummary": self._model_summary}))
hashtf = HashingTF(numFeatures=2**16, inputCol="ngrams", outputCol="tf")
idf = IDF(inputCol="tf", outputCol="features", minDocFreq=5)
labels = StringIndexer(inputCol="original", outputCol="label")

lines = Pipeline(stages=[tokenizer, ngrams, hashtf, idf, labels])
linesFit = lines.fit(trainSet)
trainModel = linesFit.transform(trainSet)
validationModel = linesFit.transform(valSet)

lr = LogisticRegression(maxIter=100)
model = lr.fit(trainModel)
predictions = model.transform(validationModel)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
predictions.show(30)

# With no explicit labels, IndexToString reads them from the "label" column
# metadata written by StringIndexer.
converter = IndexToString(inputCol="label", outputCol="label meaning")
converted = converter.transform(predictions.select("label").distinct())
converted.select("label", "label meaning").distinct().show()

truePositive = predictions[(predictions.label == 0) & (predictions.prediction == 0)].count()
trueNegative = predictions[(predictions.label == 1) & (predictions.prediction == 1)].count()
falsePositive = predictions[(predictions.label == 1) & (predictions.prediction == 0)].count()
falseNegative = predictions[(predictions.label == 0) & (predictions.prediction == 1)].count()
recall = float(truePositive) / (truePositive + falseNegative)
precision = float(truePositive) / (truePositive + falsePositive)
print("True Positive", truePositive)
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 2 distinct values are treated as continuous.
featureIndexer = \
    VectorIndexer(inputCol='features', outputCol='indexedFeatures',
                  maxCategories=2).fit(dataset)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3])

# Train a RandomForest model.
rf = RandomForestClassifier(labelCol='indexedLabel', featuresCol='indexedFeatures', numTrees=10)

# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol='prediction', outputCol='predictedLabel',
                               labels=labelIndexer.labels)

# Chain indexers and forest in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter])

# Train model. This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select('predictedLabel', 'label', 'features').show(5)
def make_class_model(data, sc, model_path, model_name, target, ml_model='default', save=True):
    t0 = time()
    # Stages for pipeline
    stages = []

    # Index labels, adding metadata to the label column.
    # Fit on whole dataset to include all labels in index.
    targetIndexer = StringIndexer(inputCol="target",
                                  outputCol="indexedTarget",
                                  handleInvalid="keep").fit(data)
    stages += [targetIndexer]

    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Identify categorical and numerical variables
    catCols = [
        x for (x, dataType) in trainingData.dtypes
        if (((dataType == "string") | (dataType == "boolean")) & (x != "target"))
    ]
    numCols = [
        x for (x, dataType) in trainingData.dtypes
        if ((dataType == "int") | (dataType == "bigint")
            | (dataType == "float") | (dataType == "double"))
    ]

    # One-hot encode categorical variables
    indexers = [
        StringIndexer(inputCol=column, outputCol=column + "-index", handleInvalid="keep")
        for column in catCols
    ]
    encoder = OneHotEncoder(
        inputCols=[indexer.getOutputCol() for indexer in indexers],
        outputCols=["{0}-encoded".format(indexer.getOutputCol()) for indexer in indexers])
    assembler_cat = VectorAssembler(inputCols=encoder.getOutputCols(),
                                    outputCol="categorical-features",
                                    handleInvalid="skip")
    stages += indexers
    stages += [encoder, assembler_cat]

    assembler_num = VectorAssembler(inputCols=numCols,
                                    outputCol="numerical-features",
                                    handleInvalid="skip")

    # Standardize numerical variables
    scaler = StandardScaler(inputCol="numerical-features",
                            outputCol="numerical-features_scaled")

    # Combine all features in one vector
    assembler_all = VectorAssembler(
        inputCols=['categorical-features', 'numerical-features_scaled'],
        outputCol='features',
        handleInvalid="skip")
    stages += [assembler_num, scaler, assembler_all]

    # Train a RandomForest model by default or another specified model.
    if ml_model == 'default':
        rf = RandomForestClassifier(labelCol="indexedTarget",
                                    featuresCol="features",
                                    numTrees=10)
    else:
        rf = ml_model

    # Convert indexed labels back to original labels.
    labelConverter = IndexToString(inputCol="prediction",
                                   outputCol="predictedLabel",
                                   labels=targetIndexer.labels)
    stages += [rf, labelConverter]

    # Chain indexers and forest in a Pipeline
    pipeline = Pipeline(stages=stages)

    # Train model. This also runs the indexers.
    model = pipeline.fit(trainingData)

    # Make predictions.
    predictions = model.transform(testData)

    # Select example rows to display.
    # predictions.select("predictedLabel", "target", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedTarget",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Accuracy = %g" % accuracy)

    if save:
        # Final model saving and statistics writing
        tt = time() - t0
        timestamp = int(time())
        model.write().overwrite().save(model_path)
        cluster = Cluster(['127.0.0.1'], port=9042)
        session = cluster.connect("models")
        query = ("INSERT INTO %s (model_name, timestamp, target, learning_time, model_path, stat)"
                 % "models_statistics")
        query = query + " VALUES (%s, %s, %s, %s, %s, %s)"
        session.execute(query, (model_name, timestamp, target, tt, model_path, accuracy))
        session.shutdown()
        cluster.shutdown()

    # Stop spark session
    sc.stop()
    if not save:
        return model, sc
# test_rdd = test_transformed.map(lambda data: Vectors.dense([float(c) for c in data]))
data_transformed = test_transformed.select(col("Id").alias("label"), col("features")) \
    .map(lambda row: LabeledPoint(row.label, row.features))

# Evaluate the model on the test data - output "ID", "prediction"
realTest_labelsAndPreds = data_transformed.map(lambda p: (p.label, float(nb_model.predict(p.features))))
output = sqlContext.createDataFrame(realTest_labelsAndPreds, ['id', 'Category_Index'])

# Convert back to categories.
# You need Spark 1.6 for this; in a cmd prompt, type:
#   sudo yum install spark-core spark-master spark-worker spark-python
from pyspark.ml.feature import IndexToString

converter = IndexToString(inputCol="Category_Index",
                          outputCol="originalCategory",
                          labels=classifymodel.labels)
converted = converter.transform(output)
# converted.write.format('com.databricks.spark.csv').save('submission1.csv')

def toCSVLine(data):
    return ','.join(str(d) for d in data)

lines = converted.map(toCSVLine)
lines.saveAsTextFile('submission1.csv')

# View error rates
# realTest_trainErr = realTest_labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(test_transformed.count())
# print("Training Error = " + str(realTest_trainErr))
    result = []
    for rec in x:
        result.append(url2domain(rec[0]))
    return result

udfFunc = udf(lambda y: array2domain(y), ArrayType(StringType()))
domains_df = st.select('uid', udfFunc('visits').alias("urls"))

model = PipelineModel.load("lab04/model")
indexed = model.transform(domains_df)

# Recover the gender/age labels from the fitted StringIndexer (second pipeline stage).
labels = model.stages[1].labels
converter = IndexToString(inputCol="prediction", outputCol="gender_age", labels=labels)
converted = converter.transform(indexed)
out_df = converted.select("uid", "gender_age")

out_columns = list(out_df.columns)
query = out_df \
    .select(to_json(struct(*out_columns)).alias("value")) \
    .writeStream \
    .outputMode("update") \
    .format("kafka") \
    .option("checkpointLocation", "chk_12") \
    .option("kafka.bootstrap.servers", kafka_bootstrap) \
    .option("topic", topic_out) \
# One-hot encode categorical features
encoder = OneHotEncoder(
    inputCols=["{}_index".format(col) for col in string_cols],
    outputCols=one_hot_encoded_features)

# Assemble all features into a feature vector
features_assembler = VectorAssembler(inputCols=num_bool_features, outputCol="features")

# Index labels, adding metadata to the label column.
label_indexer = StringIndexer(inputCol="has_over_50k",
                              outputCol="label").fit(processed_train_set)

# Convert indexed labels back to original labels.
label_converter = IndexToString(inputCol="prediction",
                                outputCol="predicted_label",
                                labels=label_indexer.labels)

# Chi-squared feature selection
selector = ChiSqSelector(numTopFeatures=20,
                         featuresCol="features",
                         outputCol="featuresSel",
                         labelCol="label")

# RandomForest model with parameter tuning using cross-validation
rf = RandomForestClassifier(labelCol="label", featuresCol="featuresSel", numTrees=20)

# Create ParamGrid for cross-validation
rf_param_grid = (ParamGridBuilder().addGrid(
def test_attr_spark(self):
    conf = SparkConf().setAppName("toy_test").setMaster('local[2]')
    num_partitions = 2
    enumerator = "join"
    model_type = "regression"
    label = 'target'
    sparkContext = SparkContext(conf=conf)
    sqlContext = SQLContext(sparkContext)
    train_df = sqlContext.read.csv("toy_train.csv", header='true', inferSchema='true')
    test_df = sqlContext.read.csv("toy.csv", header='true', inferSchema='true')

    # initializing stages of main transformation pipeline
    stages = []
    # list of categorical features for further one-hot encoding
    cat_features = ['a', 'b', 'c']
    for feature in cat_features:
        string_indexer = StringIndexer(inputCol=feature,
                                       outputCol=feature + "_index").setHandleInvalid("skip")
        encoder = OneHotEncoderEstimator(inputCols=[string_indexer.getOutputCol()],
                                         outputCols=[feature + "_vec"])
        encoder.setDropLast(False)
        stages += [string_indexer, encoder]

    assembler_inputs = [feature + "_vec" for feature in cat_features]
    assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="assembled_inputs")
    stages += [assembler]
    assembler_final = VectorAssembler(inputCols=["assembled_inputs"], outputCol="features")
    stages += [assembler_final]

    pipeline = Pipeline(stages=stages)
    train_pipeline_model = pipeline.fit(train_df)
    test_pipeline_model = pipeline.fit(test_df)
    train_df_transformed = train_pipeline_model.transform(train_df)
    test_df_transformed = test_pipeline_model.transform(test_df)
    train_df_transformed = train_df_transformed.withColumn('model_type', sf.lit(0))
    test_df_transformed = test_df_transformed.withColumn('model_type', sf.lit(0))

    decode_dict = {}
    counter = 0
    cat = 0
    for feature in cat_features:
        colIdx = test_df_transformed.select(feature, feature + "_index").distinct().rdd.collectAsMap()
        colIdx = {k: v for k, v in sorted(colIdx.items(), key=lambda item: item[1])}
        for item in colIdx:
            decode_dict[counter] = (cat, item, colIdx[item], counter)
            counter = counter + 1
        cat = cat + 1

    train_df_transform_fin = train_df_transformed.select('features', label, 'model_type')
    test_df_transform_fin = test_df_transformed.select('features', label, 'model_type')

    lr = LinearRegression(featuresCol='features', labelCol=label, maxIter=10,
                          regParam=0.0, elasticNetParam=0.8)
    lr_model = lr.fit(train_df_transform_fin)
    eval = lr_model.evaluate(test_df_transform_fin)
    f_l2 = eval.meanSquaredError
    pred = eval.predictions
    pred_df_fin = pred.withColumn(
        'error',
        spark_utils.calc_loss(pred[label], pred['prediction'], pred['model_type']))
    predictions = pred_df_fin.select('features', 'error').repartition(num_partitions)
    converter = IndexToString(inputCol='features', outputCol='cats')
    all_features = list(decode_dict)
    predictions = predictions.collect()

    spark_join = spark_slicer.parallel_process(all_features, predictions, f_l2, sparkContext,
                                               debug=self.debug, alpha=self.alpha, k=self.k,
                                               w=self.w, loss_type=self.loss_type,
                                               enumerator="join")
    spark_union = spark_union_slicer.process(all_features, predictions, f_l2, sparkContext,
                                             debug=self.debug, alpha=self.alpha, k=self.k,
                                             w=self.w, loss_type=self.loss_type,
                                             enumerator="union")

    self.assertEqual(3, len(spark_join.slices))
    print("check1")
    self.assertEqual(spark_join.min_score, spark_union.min_score)
    print("check2")
    self.assertEqual(spark_join.keys, spark_union.keys)
    print("check3")
    self.assertEqual(len(spark_join.slices), len(spark_union.slices))
    print("check4")
    idx = -1
    for sliced in spark_join.slices:
        idx += 1
        self.assertEqual(sliced.score, spark_union.slices[idx].score)
    print("check5")
# for item in rel:
#     print(item)

# Build the ML pipeline.
# Index the label column and the feature column respectively, renaming each.
label_indexer = StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(df)
feature_indexer = VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").fit(df)

# Split the dataset into training and test sets.
training_data, test_data = df.randomSplit([0.7, 0.3])

mlr = LogisticRegression() \
    .setLabelCol("indexedLabel") \
    .setFeaturesCol("indexedFeatures") \
    .setMaxIter(10) \
    .setRegParam(0.3) \
    .setElasticNetParam(0.8) \
    .setFamily("multinomial")
# print("LogisticRegression parameters:\n" + mlr.explainParams())

# Set up the label converter, which maps the predicted classes back to strings.
label_converter = IndexToString().setInputCol("prediction").setOutputCol(
    "predictionLabel").setLabels(label_indexer.labels)

mlr_pipeline = Pipeline().setStages([label_indexer, feature_indexer, mlr, label_converter])
mlr_pipeline_model = mlr_pipeline.fit(training_data)

# A Pipeline is essentially an Estimator: calling fit() on it produces a
# PipelineModel, which is essentially a Transformer. The PipelineModel's
# transform() can then be used for prediction, producing a new DataFrame,
# i.e. validating the trained model on the test set.
mlr_predictions = mlr_pipeline_model.transform(test_data)

pre_rel = mlr_predictions.select("predictionLabel", "label", "features", "probability").collect()
for item in pre_rel:
    print(str(item['label']) + "," + str(item['features']) + "-->prob" +
          str(item['probability']) + ",predictedLabel " + str(item['predictionLabel']))

# Create a MulticlassClassificationEvaluator instance, set the prediction and
# true-label column names with its setter methods, then compute the accuracy
# and error rate, as sketched below.
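# A minimal sketch of the evaluation step just described (assumes
# MulticlassClassificationEvaluator is imported from pyspark.ml.evaluation):
evaluator = MulticlassClassificationEvaluator() \
    .setLabelCol("indexedLabel") \
    .setPredictionCol("prediction") \
    .setMetricName("accuracy")
accuracy = evaluator.evaluate(mlr_predictions)
print("accuracy: " + str(accuracy))
print("test error: " + str(1.0 - accuracy))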
assembler = VectorAssembler(
    inputCols=['latitude', 'longitude', 'gps_height', 'construction_year',
               'population', 'payment_indexed', 'scheme_management_indexed',
               'basin_indexed', 'management_indexed', 'water_quality_indexed',
               'quantity_indexed', 'source_indexed', 'extraction_type_indexed',
               'waterpoint_type_indexed'],
    outputCol="features")

scaler = StandardScaler(inputCol='features', outputCol='features_scaled',
                        withStd=True, withMean=False)

labelIndexer = StringIndexer(inputCol="status_group", outputCol="label").fit(df_train)

rf = RandomForestClassifier(labelCol='label', featuresCol='features_scaled',
                            seed=42, maxMemoryInMB=2048)

evaluator = MulticlassClassificationEvaluator(metricName='accuracy')

# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction",
                               outputCol="status_group_prediction",
                               labels=labelIndexer.labels)

param_grid = ParamGridBuilder()\
    .addGrid(assembler.outputCol, ['features_scaled'])\
    .addGrid(rf.maxDepth, [10])\
    .addGrid(rf.maxBins, [20])\
    .addGrid(rf.minInstancesPerNode, [1])\
    .addGrid(rf.minInfoGain, [0.0])\
    .addGrid(rf.impurity, ['gini'])\
    .addGrid(rf.numTrees, [30])\
    .addGrid(rf.featureSubsetStrategy, ['all'])\
    .build()

pipeline = Pipeline(stages=[paymentIndexer, schemeManagementIndexer, basinIndexer,
                            qualityIndexer, managementIndexer, quantityIndexer,
                            sourceIndexer, extractionTypeIndexer, waterpointTypeIndexer,
                            assembler, labelIndexer, rf, labelConverter])

cross_val = CrossValidator(
def index_to_string(dataset, inputCol):
    from pyspark.ml.feature import IndexToString
    # With no explicit labels, IndexToString falls back to the labels stored in
    # the input column's ML attribute metadata (e.g. written by StringIndexer).
    return IndexToString(inputCol=inputCol, outputCol=inputCol + '_i2s').transform(dataset)
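# A hypothetical usage sketch of the helper above: the input column must carry
# StringIndexer metadata for the conversion to succeed (raw_df and the
# StringIndexer import are assumptions for illustration).
from pyspark.ml.feature import StringIndexer
indexed_df = StringIndexer(inputCol="category", outputCol="categoryIndex") \
    .fit(raw_df).transform(raw_df)
restored_df = index_to_string(indexed_df, "categoryIndex")
restored_df.select("categoryIndex", "categoryIndex_i2s").show()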
# (trainingData, testData) = fullData.randomSplit([0.9, 0.1])
# trainingData = trainingData.dropna()
# testData = testData.dropna()

indexer = StringIndexer(inputCol="category", outputCol="label").fit(fullData)
tokenizer = RegexTokenizer(pattern=r'\W+', inputCol="TEXT", outputCol="words", toLowercase=False)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")
lr = LogisticRegression(maxIter=20, regParam=0.001)
labelConverter = IndexToString(inputCol="prediction", outputCol="originalcategory",
                               labels=indexer.labels)

pipeline = Pipeline(stages=[indexer, tokenizer, hashingTF, idf, lr, labelConverter])
model = pipeline.fit(fullData)
print("Done training classifier")
model.save("/home/jys308/weights")

# pred = model.transform(testData)
# pl = pred.select("label", "prediction").rdd.cache()
# metrics = MulticlassMetrics(pl)
# metrics.fMeasure()
print("The deserialized model stages are", model_deserialized.stages) ############################################################################## ## export the final model with mleap ## remove the stringIndexer for the label column so it won't be required for prediction model_final = model.copy() si_label_index = -3 model_final.stages.pop(si_label_index) #si_label ## append an IndexToString transformer to the model pipeline to get the original labels #labelReverse = IndexToString(inputCol = "label", outputCol = "predIncome") #no need to provide labels labelReverse = IndexToString( inputCol="prediction", outputCol="predictedIncome", labels=model.stages[si_label_index].labels ) #must provide labels (from si_label) otherwise will fail model_final.stages.append(labelReverse) pred_final = model_final.transform(test) pred_final.printSchema() pred_final.show(5) # remove an old model file, if needed. if os.path.isfile(model_file): os.remove(model_file) model_final.serializeToBundle(model_file_path, model_final.transform(train)) print("persist the mleap bundle from local to hdfs") from subprocess import Popen, PIPE
# ### Build the feature Vector Assembler

# In[9]:

assembler = VectorAssembler(outputCol="features", inputCols=list(featureCols))

# ### Convert indexed labels back to original labels

# In[10]:

predConverter = IndexToString(inputCol="prediction",
                              outputCol="predictedLabel",
                              labels=labelIndexer.labels)

# ## Do the Data Preparation

# In[11]:

labeledData = labelIndexer.transform(df)
# TODO add the other additional indexer
indexedLabedData = collegeIndexer.transform(labeledData)
labeledPointData = assembler.transform(indexedLabedData)

# ### Splitting the dataset into train and test set
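# A minimal sketch of the split announced above; the 70/30 ratio and seed are
# assumptions, since the original notebook cell is not shown.

# In[12]:

(trainingData, testData) = labeledPointData.randomSplit([0.7, 0.3], seed=42)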
    `petal_length` DOUBLE,
    `petal_width` DOUBLE,
    `class` STRING
"""
df = spark.read.csv(dbfs_file, schema=schema)

categoricalCols = ["class"]

# The following lines create estimators: their fit() methods return the
# transformer models we will later apply to the dataset.
# Convert the class column to a numeric value using StringIndexer.
labelToIndex = StringIndexer(inputCol="class", outputCol="indexed_class")
labelIndexer = labelToIndex.fit(df)
labelReverser = IndexToString(inputCol="prediction",
                              outputCol="class",
                              labels=labelIndexer.labels)

# This assembles the numeric columns of our dataset into one feature vector.
numericCols = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
vecAssembler = VectorAssembler(inputCols=numericCols, outputCol="features")

lr = LogisticRegression(featuresCol="features", labelCol="indexed_class", regParam=1e5)

# Define the pipeline based on the stages created in previous steps.
pipeline = Pipeline(stages=[labelToIndex, vecAssembler, lr, labelReverser])

# Define the pipeline model.
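# A minimal sketch of the fit/transform step announced above. Note that
# labelReverser writes to "class", which already exists in df and would
# collide at transform time, so this sketch assumes a copy of the reverser
# with a distinct output column:
labelReverserOut = labelReverser.copy().setOutputCol("predicted_class")
pipelineModel = Pipeline(stages=[labelToIndex, vecAssembler, lr, labelReverserOut]).fit(df)
predictions = pipelineModel.transform(df)
predictions.select("class", "prediction", "predicted_class").show(5)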
indexer_acc_fitted = indexer_acc.fit(df)
df = indexer_acc_fitted.transform(df)

indexer_mer = StringIndexer(inputCol="itemId", outputCol="itemIndex")
indexer_mer_fitted = indexer_mer.fit(df)
df = indexer_mer_fitted.transform(df)

print('############################## - LOADING MODEL - ##############################')
model = ALSModel.load('models/moviesrec/')

print('############################## - CLASSIFYING DATA')
userRecommends = model.recommendForAllUsers(10)
userRecommends.show(truncate=False)

print('############################## - EXPLODING PREDICTIONS')
flatUserRecomends = userRecommends.withColumn(
    'userAndRatings', explode(userRecommends.recommendations)
).select('userIndex', 'userAndRatings.*')
flatUserRecomends.show(truncate=False)

print('############################## - CONVERTING INDEXES TO STRING')
userConverter = IndexToString(inputCol='userIndex', outputCol='userId',
                              labels=indexer_acc_fitted.labels)
itemConverter = IndexToString(inputCol='itemIndex', outputCol='itemId',
                              labels=indexer_mer_fitted.labels)
convertedMoviesRecs = Pipeline(stages=[userConverter, itemConverter]).fit(df).transform(flatUserRecomends)

print('############################## - SAVING DATA')
convertedMoviesRecs.write.json('results/usersrec/')
# userRecomends.write.format('json').save('/ML/movies/usersrec/')

# spark-submit als-model-predictions.py --master yarn --deploy-mode client --num-executors 2 \
#   --driver-java-options "-XX:+UseG1GC -XX:ResizePLAB -Xms1g -Xmx1g -XX:InitiatingHeapOccupancyPercent=35" \
#   --conf "spark.sql.tungsten.enabled=true" \
#   --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
#   --conf "spark.memory.fraction=0.3" \
#   --conf "spark.driver.memoryOverhead=2g" \
#   --conf "spark.executor.memoryOverhead=1g" \
#   --conf "spark.executor.extraJavaOptions=-XX:+UseG1GC -XX:ResizePLAB -Xms3g -Xmx3g -XX:InitiatingHeapOccupancyPercent=35 -XX:ConcGCThreads=20"
# -*- encoding:utf-8 -*-
"""
@author: zhouning
@file: IndexToString.py
@time: 2018/8/7 21:09
@desc:
IndexToString is the counterpart of StringIndexer: it maps a column of label
indices back to the original string labels.
It is typically used together with StringIndexer: first convert the string
labels into label indices with StringIndexer, train a model, and then map the
predicted label indices back to the original string labels.
You can also supply your own set of labels instead (see the sketch below).
"""
from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("logistic_regression").getOrCreate()
df = spark.createDataFrame(
    [(0, "ab"), (1, "bb"), (2, "cb"), (3, "aa"), (4, "aa"), (5, "ca")],
    ["id", "category"])
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
model = indexer.fit(df)
indexed = model.transform(df)
indexed.show()

converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
converted = converter.transform(indexed)
converted.select("id", "categoryIndex", "originalCategory").show()
spark.stop()
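# A sketch of the "supply your own labels" variant mentioned in the docstring
# above (run it before spark.stop()): explicit labels override the metadata
# written by StringIndexer. The five label strings are illustrative
# assumptions, one per index produced for this dataset.
custom_converter = IndexToString(inputCol="categoryIndex",
                                 outputCol="customCategory",
                                 labels=["first", "second", "third", "fourth", "fifth"])
custom_converter.transform(indexed).select("id", "categoryIndex", "customCategory").show()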
from pyspark.ml.linalg import Vectors

# #### Data Preparation
# Before using any model, the data needs to be organized into a set of 'features' and 'labels'.
# In this case, our features are sensor names and their readings, and the label is whether a
# particular asset needs maintenance or not. We'll use Spark's feature extraction libraries for this.
modelData = rawMeasurements.filter('isMaintenance')
si1 = StringIndexer(inputCol='sensor_name', outputCol='sensor_id').fit(modelData).transform(modelData)
va = VectorAssembler(inputCols=['sensor_id', 'value'], outputCol="features").transform(si1)
li = StringIndexer(inputCol='asset_name', outputCol='label').fit(va)

# #### Model Training
# We split the data into 2 subsets - one to train the model, and one to test/evaluate it
(trainingData, testData) = va.randomSplit([0.7, 0.3])
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)
li2s = IndexToString(inputCol="prediction", outputCol="predictedLabel", labels=li.labels)
pipeline = Pipeline(stages=[li, rf, li2s])
model = pipeline.fit(trainingData)

# #### Model Evaluation
# The training data was used to fit the model (i.e. train it); now we can test the model
# using the test subset and calculate the accuracy (i.e. the fraction of correct predictions).
predictions = model.transform(testData)
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

# Our model is very accurate; let's visualize the results. A heatmap can show how many
# correct and incorrect predictions we made.
predictionResults = predictions.groupBy(predictions.predictedLabel.alias('Prediction'),
                                        predictions.asset_name.alias('Actual'))\
    .count().toPandas()
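# A minimal sketch of the heatmap described above, assuming seaborn is
# available (the original visualization code is not shown):
import seaborn as sns

confusion = predictionResults.pivot(index='Actual', columns='Prediction',
                                    values='count').fillna(0)
sns.heatmap(confusion, annot=True, fmt='g')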
from pyspark.ml.feature import StringIndexer

lblIndxr = StringIndexer().setInputCol("lab").setOutputCol("labelInd")
idxRes = lblIndxr.fit(simpleDF).transform(simpleDF)
idxRes.show()

# COMMAND ----------

valIndexer = StringIndexer().setInputCol("value1").setOutputCol("valueInd")
valIndexer.fit(simpleDF).transform(simpleDF).show(5)

# COMMAND ----------

from pyspark.ml.feature import IndexToString

# With no labels set, IndexToString reads them from the labelInd column metadata.
labelReverse = IndexToString().setInputCol("labelInd")
labelReverse.transform(idxRes).show(5)

# COMMAND ----------

from pyspark.ml.feature import VectorIndexer
from pyspark.ml.linalg import Vectors

idxIn = spark.createDataFrame([
    (Vectors.dense(1, 2, 3), 1),
    (Vectors.dense(2, 5, 6), 2),
    (Vectors.dense(1, 8, 9), 3)
]).toDF("features", "label")
indxr = VectorIndexer()\
    .setInputCol("features")\
    .setOutputCol("idxed")\
    .setMaxCategories(2)
    map(lambda x: x.split(",")).\
    map(lambda x: Row(**f(x))).\
    toDF()
data.show()

labelIndexer = StringIndexer().\
    setInputCol("label").\
    setOutputCol("indexedLabel").\
    fit(data)
featureIndexer = VectorIndexer().\
    setInputCol("features").\
    setOutputCol("indexedFeatures").\
    setMaxCategories(4).\
    fit(data)
labelConverter = IndexToString().\
    setInputCol("prediction").\
    setOutputCol("predictedLabel").\
    setLabels(labelIndexer.labels)
dc = DecisionTreeClassifier().\
    setLabelCol("indexedLabel").\
    setFeaturesCol("indexedFeatures")

dcPipeline = Pipeline().setStages([labelIndexer, featureIndexer, dc, labelConverter])

trainingData, testData = data.randomSplit([0.7, 0.3])
dcPipelineModel = dcPipeline.fit(trainingData)
dcPredictions = dcPipelineModel.transform(testData)
preRel = dcPredictions.select("predictedLabel", "label", "features", "probability").collect()
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                               maxCategories=4).fit(data)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10)

# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

# Chain indexers and forest in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter])

# Train model. This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("predictedLabel", "label", "features").show(5)

# Select (prediction, true label) and compute test error
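# A sketch of the evaluation step the last comment introduces, following the
# standard Spark example this snippet appears to be cut from (the evaluator
# import from pyspark.ml.evaluation is assumed):
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))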
df = label_indexer.transform(df)

# Only select the features and label columns
df = df.select(['features', 'label'])
print("Ready for machine learning")
df.show(10)

train, test = df.randomSplit([0.70, 0.30])
test.show()

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
model = lr.fit(train)
predictions = model.transform(test)

# "label" still carries StringIndexer metadata, so no explicit labels are needed here.
converter = IndexToString(inputCol="label", outputCol="originallabel")
converted = converter.transform(predictions)
# "prediction" has no such metadata, so the original labels must be passed explicitly.
converter = IndexToString(inputCol="prediction", outputCol="prediction_label", labels=user_labels)
converted = converter.transform(converted)
converted.show(5)

customSchema = StructType([
    StructField("sepal_length", DoubleType(), True),
    StructField("sepal_width", DoubleType(), True),
    StructField("petal_length", DoubleType(), True),
    StructField("petal_width", DoubleType(), True)
])
myrdd = spark.sparkContext.parallelize([[5.1, 3.5, 1.4, 0.2]])
if __name__ == "__main__": spark = SparkSession\ .builder\ .appName("IndexToStringExample")\ .getOrCreate() # $example on$ df = spark.createDataFrame( [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")], ["id", "category"]) indexer = StringIndexer(inputCol="category", outputCol="categoryIndex") model = indexer.fit(df) indexed = model.transform(df) print("Transformed string column '%s' to indexed column '%s'" % (indexer.getInputCol(), indexer.getOutputCol())) indexed.show() print("StringIndexer will store labels in output column metadata\n") converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory") converted = converter.transform(indexed) print("Transformed indexed column '%s' back to original string column '%s' using " "labels in metadata" % (converter.getInputCol(), converter.getOutputCol())) converted.select("id", "categoryIndex", "originalCategory").show() # $example off$ spark.stop()