def run(start1, end1, start2, end2, df, sc, sql_context, is_pred):
    lp_data = get_labeled_points(start1, end2, df, sc, sql_context)
    print lp_data.count()
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(lp_data)
    td = labelIndexer.transform(lp_data)
    label2index = {}
    for each in sorted(set([(i[0], i[1]) for i in td.select(td.label, td.indexedLabel).distinct().collect()]),
                       key=lambda x: x[0]):
        label2index[int(each[0])] = int(each[1])
    print label2index
    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(lp_data)
    rf = get_model()
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])
    lp_train = lp_data.filter(lp_data.date3 < end1).filter(lp_data.is_labeled == 1)
    model = pipeline.fit(lp_train)
    lp_check = lp_data.filter(lp_data.date2 > start2)
    predictions = model.transform(lp_check)
    predictions = val(predictions, label2index, sql_context)
    if is_pred:
        predictions = predictions.filter(predictions.is_labeled == 0).filter(
            predictions.date2 == get_cur()).sort(predictions.prob.desc())
        dfToTableWithPar(sql_context, predictions, "predictions", get_cur())
        for each in predictions.take(10):
            print each
def mapClickCategoricalFeatures():
    indexed = ""
    df = getDataFrame(CLICKS_HDPFILEPATH)
    df.persist(StorageLevel.DISK_ONLY)
    print df.columns
    # select columns to be mapped
    click_cols = ["C2", "C3", "C4", "C5", "C7", "C8"]
    for col in click_cols:
        if(indexed == ""):
            indexed = df
            print indexed
        outcol = col + "Index"
        indexer = StringIndexer(inputCol=col, outputCol=outcol)
        indexed = indexer.fit(indexed).transform(indexed)
        indexed.show()
    indexed.persist(StorageLevel.DISK_ONLY)
    #indexed.select('C0', 'C1', 'C2Index', 'C3Index', 'C4Index', 'C5Index', 'C6', 'C7Index', 'C8Index').write.format('com.databricks.spark.csv').save(PATH+"extraction/clicks1.csv")
    indexed.select('C0', 'C1', 'C2Index', 'C3Index', 'C4Index', 'C5Index', 'C6', 'C7Index', 'C8Index').write.format('com.databricks.spark.csv').save(HADOOPDIR + "data/click_fraud/extraction/clicks_23feb12.csv")
def testClassification(data):
    # Train a RandomForest model.
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexLabel")
    si_model = stringIndexer.fit(data)
    td = si_model.transform(data)
    rf = RandomForestClassifier(numTrees=5, maxDepth=4, labelCol="indexLabel", seed=13)
    trainData, testData = td.randomSplit([0.8, 0.2], 13)
    predictionDF = rf.fit(trainData).transform(testData)
    selected = predictionDF \
        .select('label', 'indexLabel', 'prediction', 'rawPrediction', 'probability')
    for row in selected.collect():
        print row
    scoresAndLabels = predictionDF \
        .map(lambda x: (float(x.probability.toArray()[1]), x.indexLabel))
    for sl in scoresAndLabels.collect():
        print sl
    evaluator = BinaryClassificationEvaluator(labelCol='indexLabel', metricName='areaUnderROC')
    metric = evaluator.evaluate(selected)
    print metric
def build_decisionTree(path):
    df = load_data(path)
    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)
    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')
    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    df = si_model.transform(df)
    df.show(truncate=False)
    dt = DecisionTreeClassifier(labelCol='indexed')
    grid = ParamGridBuilder().addGrid(dt.maxDepth, [1, 2, 3, 5, 6, 8, 10]).build()
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=dt, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)
    prediction = cvModel.transform(df)
    prediction.show(truncate=False)
    print "classification evaluation :", evaluator.evaluate(prediction)
    return cvModel, avg_age
def build_randomForest(path):
    df = load_data(path)
    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)
    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')
    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    df = si_model.transform(df)
    df.show()
    rdf = RandomForestClassifier(labelCol='indexed')
    grid = ParamGridBuilder().addGrid(rdf.maxDepth, [1, 2, 3, 5, 6, 8, 10]) \
        .addGrid(rdf.numTrees, [1, 5, 10, 30, 50, 100, 200]).build()
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=rdf, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)
    prediction = cvModel.transform(df)
    prediction.show()
    print "classification evaluation :", evaluator.evaluate(prediction)
    return cvModel, avg_age
def train_random_forest(df):
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    td = si_model.transform(df)
    rf = RandomForestClassifier(numTrees=3, maxDepth=2, labelCol="indexed",
                                seed=int(random.random()))
    return rf, rf.fit(td)
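# A minimal usage sketch for train_random_forest() above; the SparkSession `spark`
# and the two-row toy DataFrame are assumptions, not part of the original snippet.
from pyspark.ml.linalg import Vectors

toy_df = spark.createDataFrame(
    [(1.0, Vectors.dense(0.0, 1.0)), (0.0, Vectors.dense(1.0, 0.0))],
    ["label", "features"])
rf, rf_model = train_random_forest(toy_df)
# transform() only needs the "features" column; it adds prediction and probability columns
rf_model.transform(toy_df).select("prediction", "probability").show()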
def mapPublisherCategoricalFeatures():
    indexed = ""
    df = getDataFrame(PUBLISHERS_HDPFILEPATH)
    df.persist(StorageLevel.DISK_ONLY)
    print df.columns
    publisher_cols = ["C0", "C1", "C2", "C3"]
    for col in publisher_cols:
        if(indexed == ""):
            indexed = df
            print indexed
        outcol = col + "Index"
        # StringIndexer maps each value in the input column to a double-valued index and adds a new column to the dataframe
        indexer = StringIndexer(inputCol=col, outputCol=outcol)
        # fit and transform the columns using the indexer
        indexed = indexer.fit(indexed).transform(indexed)
        indexed.show()
    indexed.persist(StorageLevel.DISK_ONLY)
    indexed.select('C0Index', 'C1Index', 'C2Index', "C3Index").write.format('com.databricks.spark.csv').save(HADOOPDIR + "data/click_fraud/extraction/publishers_23feb12.csv")
def main(sc, spark):
    # Load and vectorize the corpus
    corpus = load_corpus(sc, spark)
    vector = make_vectorizer().fit(corpus)

    # Index the labels of the classification
    labelIndex = StringIndexer(inputCol="label", outputCol="indexedLabel")
    labelIndex = labelIndex.fit(corpus)

    # Split the data into training and test sets
    training, test = corpus.randomSplit([0.8, 0.2])

    # Create the classifier
    clf = LogisticRegression(
        maxIter=10, regParam=0.3, elasticNetParam=0.8,
        family="multinomial", labelCol="indexedLabel", featuresCol="tfidf")

    # Create the model
    model = Pipeline(stages=[
        vector, labelIndex, clf
    ]).fit(training)

    # Make predictions
    predictions = model.transform(test)
    predictions.select("prediction", "indexedLabel", "tfidf").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction",
        metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    lrModel = model.stages[2]  # the fitted LogisticRegression stage
    print(lrModel)  # summary only
def label(df, column):
    """
    Create a labeled column.
    """
    indexer = StringIndexer(inputCol=column, outputCol=column + '_label')
    df = indexer.fit(df).transform(df)
    return df
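# Hypothetical usage of label() above; the SparkSession `spark` and the toy frame
# are assumptions. The new column is named "<column>_label" and holds double indices,
# with the most frequent value mapped to 0.0.
toy = spark.createDataFrame([("a",), ("b",), ("a",)], ["category"])
label(toy, "category").show()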
def indexStringColumns(df, cols):
    # variable newdata will be updated several times
    newdata = df
    for c in cols:
        si = StringIndexer(inputCol=c, outputCol=c + "-x")
        sm = si.fit(newdata)
        newdata = sm.transform(newdata).drop(c)
        newdata = newdata.withColumnRenamed(c + "-x", c)
    return newdata
def events(df, column_name):
    i = column_name + "I"
    v = column_name + "V"
    stringIndexer = StringIndexer(inputCol=column_name, outputCol=i)
    model = stringIndexer.fit(df)
    indexed = model.transform(df)
    encoder = OneHotEncoder(inputCol=i, outputCol=v)
    encoded = encoder.transform(indexed)
    return encoded
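# A small illustration of events() above; `spark` and the sample rows are assumptions.
# For a column "type", the function adds "typeI" (string index) and "typeV" (one-hot vector).
sample = spark.createDataFrame([("click",), ("view",), ("click",)], ["type"])
events(sample, "type").show(truncate=False)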
def indexStringColumns(df, cols):
    from pyspark.ml.feature import StringIndexer
    # variable newdf will be updated several times
    newdf = df
    for c in cols:
        si = StringIndexer(inputCol=c, outputCol=c + "-num")
        sm = si.fit(newdf)
        newdf = sm.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c + "-num", c)
    return newdf
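# Hypothetical call to indexStringColumns() above; `spark` and the column names are
# assumptions. Each listed string column is replaced, under its original name, by a
# double-valued index column.
raw = spark.createDataFrame([("US", "red", 1), ("FR", "blue", 2)], ["country", "colour", "id"])
indexStringColumns(raw, ["country", "colour"]).printSchema()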
def oneHotEncoding(self, df, input_col):
    stringInd = StringIndexer(inputCol=input_col, outputCol="indexed")
    model = stringInd.fit(df)
    td = model.transform(df)
    encoder = OneHotEncoder(inputCol="indexed", outputCol="features", dropLast=False)
    final_encoding = encoder.transform(td).select(df.id, 'features').cache()
    conv_udf = udf(lambda line: Vectors.dense(line).tolist())
    final_encoding = final_encoding.select(df.id, conv_udf(final_encoding.features).alias("num_" + input_col)).cache()
    return final_encoding
def test_string_indexer_handle_invalid(self):
    df = self.spark.createDataFrame([
        (0, "a"),
        (1, "d"),
        (2, None)], ["id", "label"])

    si1 = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="keep",
                        stringOrderType="alphabetAsc")
    model1 = si1.fit(df)
    td1 = model1.transform(df)
    actual1 = td1.select("id", "indexed").collect()
    expected1 = [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0), Row(id=2, indexed=2.0)]
    self.assertEqual(actual1, expected1)

    si2 = si1.setHandleInvalid("skip")
    model2 = si2.fit(df)
    td2 = model2.transform(df)
    actual2 = td2.select("id", "indexed").collect()
    expected2 = [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0)]
    self.assertEqual(actual2, expected2)
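# Not part of the test above: a sketch of how handleInvalid="keep" treats labels that
# were never seen during fit -- they are mapped to the extra bucket numLabels.
# The SparkSession `spark` is an assumption here.
train = spark.createDataFrame([(0, "a"), (1, "b")], ["id", "label"])
unseen = spark.createDataFrame([(2, "c")], ["id", "label"])
keep_model = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="keep").fit(train)
keep_model.transform(unseen).show()  # "c" receives index 2.0, i.e. numLabels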
def base_features_gen_pipeline(input_descript_col="descript", input_category_col="category",
                               output_feature_col="features", output_label_col="label"):
    indexer = StringIndexer(inputCol=input_category_col, outputCol=output_label_col)
    wordtokenizer = Tokenizer(inputCol=input_descript_col, outputCol="words")
    counter = CountVectorizer(inputCol="words", outputCol=output_feature_col)
    pipeline = Pipeline(stages=[indexer, wordtokenizer, counter])
    return pipeline
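# A minimal fit/transform sketch for base_features_gen_pipeline(); `spark` and the
# two-row corpus are assumptions, not part of the original snippet.
corpus = spark.createDataFrame(
    [("buy some milk", "errand"), ("fix the login bug", "work")],
    ["descript", "category"])
feats = base_features_gen_pipeline().fit(corpus).transform(corpus)
feats.select("label", "features").show(truncate=False)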
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("DecisionTreeClassificationExample")\
        .getOrCreate()

    # $example on$
    # Load the data stored in LIBSVM format as a DataFrame.
    data = spark.read.format("libsvm").load(
        "data/mllib/sample_libsvm_data.txt")

    # Index labels, adding metadata to the label column.
    # Fit on whole dataset to include all labels in index.
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
    # Automatically identify categorical features, and index them.
    # We specify maxCategories so features with > 4 distinct values are treated as continuous.
    featureIndexer =\
        VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                      maxCategories=4).fit(data)
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a DecisionTree model.
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

    # Chain indexers and tree in a Pipeline
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
def make_regr_model(data, sc, model_path, model_name, target, ml_model='default', save=True): t0 = time() # Stages for pipline stages = [] # Split the data into training and test sets (30% held out for testing) (trainingData, testData) = data.randomSplit([0.7, 0.3]) # Identify categorical and numerical variables catCols = [x for (x, dataType) in trainingData.dtypes if ((dataType == "string") | (dataType == "boolean"))] numCols = [x for (x, dataType) in trainingData.dtypes if (((dataType == "int") | (dataType == "bigint") | (dataType == "float") | (dataType == "double")) & (x != "target"))] # OneHotEncode categorical variables indexers = [StringIndexer(inputCol=column, outputCol=column + "-index", handleInvalid="keep") for column in catCols] encoder = OneHotEncoder( inputCols=[indexer.getOutputCol() for indexer in indexers], outputCols=["{0}-encoded".format(indexer.getOutputCol()) for indexer in indexers] ) assembler_cat = VectorAssembler( inputCols=encoder.getOutputCols(), outputCol="categorical-features", handleInvalid="skip" ) stages += indexers stages += [encoder, assembler_cat] assembler_num = VectorAssembler( inputCols=numCols, outputCol="numerical-features", handleInvalid="skip" ) # Standardize numerical variables scaler = StandardScaler(inputCol="numerical-features", outputCol="numerical-features_scaled") # Combine all features in one vector assembler_all = VectorAssembler( inputCols=['categorical-features', 'numerical-features_scaled'], outputCol='features', handleInvalid="skip" ) stages += [assembler_num, scaler, assembler_all] # Train a RandomForest model. if ml_model == 'default': rf = RandomForestRegressor(labelCol="target", featuresCol="features") else: rf = ml_model stages += [rf] # Chain indexers and forest in a Pipeline pipeline = Pipeline(stages=stages) # Train model. This also runs the indexers. model = pipeline.fit(trainingData) # Make predictions. predictions = model.transform(testData) # Select example rows to display. #predictions.select("prediction", "target", "features").show(5) # Select (prediction, true label) and compute test error evaluator = RegressionEvaluator( labelCol="target", predictionCol="prediction", metricName="rmse") rmse = evaluator.evaluate(predictions) print("RMSE = %g" % (0.0 + rmse)) if save: # Final model saving and statistics writing tt = time() - t0 timestamp = int(time()) model.write().overwrite().save(model_path) cluster = Cluster(['127.0.0.1'], "9042") session = cluster.connect("models") query = ("INSERT INTO %s (model_name, timestamp, target, learning_time, model_path, stat)") % ("models_statistics") query = query + " VALUES (%s, %s, %s, %s, %s, %s)" session.execute(query, (model_name, timestamp, target, tt, model_path, rmse)) session.shutdown() cluster.shutdown() # Stop spark session sc.stop() if not save: return model, sc
spark = SparkSession.builder.master('spark://node01:7077').appName(
    'learn_ml').getOrCreate()

# Load the data
df = spark.read.csv('hdfs://node01:9000/mushrooms.csv',
                    header=True, inferSchema=True, encoding='utf-8')

# First use StringIndexer to turn the string columns into numeric ones,
# then assemble the features into a single vector
from pyspark.ml.feature import StringIndexer, VectorAssembler

old_columns_names = df.columns
print(old_columns_names)
new_columns_names = [name + '-new' for name in old_columns_names]
for i in range(len(old_columns_names)):
    indexer = StringIndexer(inputCol=old_columns_names[i], outputCol=new_columns_names[i])
    df = indexer.fit(df).transform(df)
vecAss = VectorAssembler(inputCols=new_columns_names[1:], outputCol='features')
df = vecAss.transform(df)
# Rename the label column
df = df.withColumnRenamed(new_columns_names[0], 'label')

# Create a new DataFrame that keeps only label and features
data = df.select(['label', 'features'])

# A quick look at the data
print(data.show(5, truncate=0))

# Split the dataset into training and test sets
train_data, test_data = data.randomSplit([4.0, 1.0], 100)
def get_all_cat_cols_indexed(t_data, index_col):
    indexer = StringIndexer(inputCol=t_data.columns[0], outputCol=index_col).setHandleInvalid("keep")
    t_data = indexer.fit(t_data).transform(t_data)
    t_data = t_data.select(index_col)
    return t_data
# There are some null values in this dataset, so remove them
df = df.dropna()

'''
Let's consider a flight to be "delayed" if it arrives more than 15 minutes
later than scheduled. Also, the mile column is in miles, so convert it to km.
'''
df_withKM = df.withColumn('KM', round(df.mile * 1.60934, 0)).drop('mile')

# Create an extra column called label, set to 0 or 1 depending on whether the flight was delayed
flight_data = df_withKM.withColumn('label', (df_withKM.delay >= 15).cast('integer'))
flight_data = flight_data.drop('delay')

# Convert the categorical text columns used in ML into categorical numeric columns
indexers = StringIndexer(inputCol='carrier', outputCol='carrier_idx')

# One-hot encode the categorical numeric columns
onehot = OneHotEncoderEstimator(
    inputCols=['dow', 'carrier_idx'],
    outputCols=['dow_d', 'carrier_d']
)

# Assemble the feature columns
assembler = VectorAssembler(inputCols=['mon', 'dom', 'dow_d', 'carrier_d', 'KM', 'duration'],
                            outputCol='features')

# Split data into train and test using an 80/20 split, with a fixed random seed
x_train, x_test = flight_data.randomSplit([0.80, 0.20], seed=42)
        .map(lambda words: Row(label=words[0], words=words[1:]))
    return spark.createDataFrame(rdd)

# load dataframes
train_data = load_dataframe("Spark/data/20ng-train-all-terms.txt")
test_data = load_dataframe("Spark/data/20ng-test-all-terms.txt")

# Count word frequency
vectorizer = CountVectorizer(inputCol="words", outputCol="bag_of_words")
vectorizer_transformer = vectorizer.fit(train_data)
train_bag_of_words = vectorizer_transformer.transform(train_data)
test_bag_of_words = vectorizer_transformer.transform(test_data)

# Create numeric labels
label_indexer = StringIndexer(inputCol="label", outputCol="label_index")
label_indexer_transformer = label_indexer.fit(train_bag_of_words)
train_bag_of_words = label_indexer_transformer.transform(train_bag_of_words)
test_bag_of_words = label_indexer_transformer.transform(test_bag_of_words)

# Create classifier
classifier = NaiveBayes(labelCol="label_index", featuresCol="bag_of_words",
                        predictionCol="label_index_predicted")

# Train model
classifier_transformer = classifier.fit(train_bag_of_words)
test_predicted = classifier_transformer.transform(test_bag_of_words)
test_predicted.select("label_index", "label_index_predicted").limit(10).show()
# Drop a row if any of its fields is null:
q_l_df.na.drop()
q_l_df.na.drop(thresh=1)  # drop as soon as one field is null

# Or fill in the missing values
q_l_df_fix = q_l_df.na.fill({'county': "未收集"})

# Check the effect
q_l_df.na.fill({'county': "未收集"}). \
    where(f.col("county").isNull()).count()

# Some of the lengths are actually null as well
# q_l_df.where(f.col("Residential").isNull()).select(f.length("Residential")).show()

# Turn a string column into a number
string_index = StringIndexer(inputCol="county", outputCol="county_number")
q_l2_df = string_index.fit(q_l_df_fix).transform(q_l_df_fix)
# .select(f.col("section_number"), f.col("section"))

# Look at the current schema
q_l2_df.printSchema()

# Check whether the two columns are correlated
q_l2_df.corr("q_l", "county_number")
# Compute the covariance of the two columns
q_l2_df.cov("q_l", "county_number")

# Look at the distribution of a column
q_l2_df.groupBy("county").agg(f.count("county").alias("c")).orderBy(
    f.desc("c"))
# median distribution

    "inferSchema", True).option("delimiter", ",").load("adult.csv")

data = data.withColumnRenamed("age", "label").select(
    "label",
    col("education-num").alias("education-num"),
    col(" hours-per-week").alias("hours-per-week"),
    col(" education").alias("education"),
    col(" fnlwgt").alias("fnlwgt"),
    col(" sex").alias("sex"),
    col(" relationship").alias("relationship"))
data = data.select(data.label.cast("double"), "education-num", "hours-per-week",
                   "education", "sex", "fnlwgt", "relationship")
new_data = data.toDF("label", "education-num", "hours-per-week", "education",
                     "sex", "fnlwgt", "relationship")

indexer = StringIndexer(inputCol="education", outputCol="new_education")
indexed = indexer.fit(new_data).transform(new_data)
indexer1 = StringIndexer(inputCol="sex", outputCol="new_sex")
indexed1 = indexer1.fit(indexed).transform(indexed)
indexer2 = StringIndexer(inputCol="relationship", outputCol="new_rel")
indexed2 = indexer2.fit(indexed1).transform(indexed1)
indexed2 = indexed2.drop("sex", "education", "relationship")
indexed2.show()

# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=indexed2.columns[1:], outputCol="features")
data = assembler.transform(indexed2)
# The following code does three things with pipeline:
#
# * **`StringIndexer`** all categorical columns
# * **`OneHotEncoder`** all categorical index columns
# * **`VectorAssembler`** all feature columns into one vector column

# ### Categorical columns

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
import pyspark.sql.functions as F

# categorical columns
categorical_columns = cuse.columns[0:3]

stage_string = [StringIndexer(inputCol=c, outputCol=c + "_string_encoded") for c in categorical_columns]
stage_one_hot = [OneHotEncoder(inputCol=c + "_string_encoded", outputCol=c + "_one_hot") for c in categorical_columns]

ppl = Pipeline(stages=stage_string + stage_one_hot)
df = ppl.fit(cuse).transform(cuse)
df.toPandas().to_csv('cuse_afterTransform.csv')

df.select("age", 'age_string_encoded').distinct().sort(F.asc("age_string_encoded")).show()
df.select("education").distinct().show()
df.select("wantsMore").distinct().show()

# In[2]:

# ### Build VectorAssembler stage
df.columns
assembler = VectorAssembler(
    inputCols=['age_one_hot',
df_node2 = df_node1.dropna(subset=[
    "PassengerId", "Survived", "Pclass", "Name", "Sex", "Age", "SibSp",
    "Parch", "Ticket", "Fare", "Cabin", "Embarked"
], how="any", thresh=12)

df_node3 = df_node2.randomSplit(seed=1234, weights=[0.7, 0.3])

mmi_value_0_node4 = ["Sex", "Embarked", "Survived"]
mmi_value_1_node4 = ["indexedSex", "indexedEmbarked", "indexedSurvived"]
stages_node4 = []
for i in range(len(mmi_value_0_node4)):
    stages_node4.append(
        StringIndexer(inputCol=mmi_value_0_node4[i],
                      outputCol=mmi_value_1_node4[i],
                      handleInvalid="error",
                      stringOrderType="frequencyDesc"))

mmi_value_0_node5 = ["indexedSex", "indexedEmbarked"]
mmi_value_1_node5 = ['sexVec', 'embarkedVec']
stages_node5 = []
for i in range(len(mmi_value_0_node5)):
    stages_node5.append(
        OneHotEncoder(inputCol=mmi_value_0_node5[i],
                      outputCol=mmi_value_1_node5[i]))

pipeline_stage_node6 = VectorAssembler(
    outputCol="features",
    inputCols=["Pclass", "sexVec", "Age", "SibSp", "Fare", "embarkedVec"])

pipeline_stage_node7 = RandomForestClassifier(featureSubsetStrategy="auto",
                                              numTrees=20,
        CASE
            WHEN (pickup_hour <= 6 OR pickup_hour >= 20) THEN "Night"
            WHEN (pickup_hour >= 7 AND pickup_hour <= 10) THEN "AMRush"
            WHEN (pickup_hour >= 11 AND pickup_hour <= 15) THEN "Afternoon"
            WHEN (pickup_hour >= 16 AND pickup_hour <= 19) THEN "PMRush"
        END as TrafficTimeBins
    FROM taxi_test
"""
taxi_df_test_with_newFeatures = sqlContext.sql(sqlStatement)

## CACHE DATA-FRAME IN MEMORY & MATERIALIZE DF IN MEMORY
taxi_df_test_with_newFeatures.cache()
taxi_df_test_with_newFeatures.count()

## INDEX AND ONE-HOT ENCODING
stringIndexer = StringIndexer(inputCol="vendor_id", outputCol="vendorIndex")
model = stringIndexer.fit(taxi_df_test_with_newFeatures)  # Input data-frame is the cleaned one from above
indexed = model.transform(taxi_df_test_with_newFeatures)
encoder = OneHotEncoder(dropLast=False, inputCol="vendorIndex", outputCol="vendorVec")
encoded1 = encoder.transform(indexed)

stringIndexer = StringIndexer(inputCol="rate_code", outputCol="rateIndex")
model = stringIndexer.fit(encoded1)
indexed = model.transform(encoded1)
encoder = OneHotEncoder(dropLast=False, inputCol="rateIndex", outputCol="rateVec")
encoded2 = encoder.transform(indexed)

stringIndexer = StringIndexer(inputCol="payment_type", outputCol="paymentIndex")
model = stringIndexer.fit(encoded2)
indexed = model.transform(encoded2)
encoder = OneHotEncoder(dropLast=False, inputCol="paymentIndex", outputCol="paymentVec")
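# The three indexer/encoder blocks above could equivalently be chained in a single
# Pipeline. A sketch under that assumption (the output column names here are
# illustrative, not the ones used above):
from pyspark.ml import Pipeline

cat_cols = ["vendor_id", "rate_code", "payment_type"]
indexers = [StringIndexer(inputCol=c, outputCol=c + "_index") for c in cat_cols]
encoders = [OneHotEncoder(dropLast=False, inputCol=c + "_index", outputCol=c + "_vec")
            for c in cat_cols]
encoded_all = Pipeline(stages=indexers + encoders).fit(taxi_df_test_with_newFeatures) \
    .transform(taxi_df_test_with_newFeatures)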
# In[326]:

print "Creating sparse vectors for all data based on this new dictionary"
t0 = time()
dfTrainSelect = dfTrain.map(partial(vectorizeBi, dico=dictSel_broad.value)).toDF(schema)
dfTestSelect = dfTest.map(partial(vectorizeBi, dico=dictSel_broad.value)).toDF(schema)
dfTrainSelect.take(1)
dfTestSelect.take(1)
tt = time() - t0
print "Done in {} second".format(round(tt, 3))

# In[328]:

from pyspark.ml.feature import StringIndexer

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainSelect)
dfTrainIndexed = string_indexer_model.transform(dfTrainSelect)

# In[329]:

from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(featuresCol='bigramVectors', labelCol='target_indexed', maxDepth=10)

# In[330]:

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                              labelCol='target_indexed', metricName='precision')
def main(): # Unzip file on a temporary folder unzip_files() if files_source == "hdfs": training_data = spark.read.load( "/tmp/training.1600000.processed.noemoticon.csv", format="csv") elif files_source == "local": training_data = spark.read.load( "tmp/training.1600000.processed.noemoticon.csv", format="csv") training_data = training_data.withColumnRenamed("_c0", "label") \ .withColumnRenamed("_c1", "tweet_id") \ .withColumnRenamed("_c2", "date") \ .withColumnRenamed("_c3", "query") \ .withColumnRenamed("_c4", "user") \ .withColumnRenamed("_c5", "tweet") # Load the amount of lines for training defined on arg sample_size. If equals zero, use the whole dataset if sample_size > 0: training_data = training_data.sample(sample_size / training_data.count()) ## Preprocess dataset training_data = training_data.select(functions.col("label"), functions.col("tweet")) # Run the cleansing UDF for tweet column udf_cleansing = functions.udf(cleansing) training_data = training_data.withColumn( "tweet_cleansed", udf_cleansing(functions.col("tweet"))) # Tokenizing from pyspark.ml.feature import Tokenizer tokenizer = Tokenizer(inputCol="tweet_cleansed", outputCol="words") training_data = tokenizer.transform(training_data) # Generating features from pyspark.ml.feature import HashingTF hashingTF = HashingTF(inputCol="words", outputCol="features") training_data = hashingTF.transform(training_data) # Generate label indexes from pyspark.ml.feature import StringIndexer stringIndexer = StringIndexer(inputCol="label", outputCol="labelIndex") model = stringIndexer.fit(training_data) training_data = model.transform(training_data) # Split dataset into training and test according to test_size_frac arg training, test = training_data.randomSplit( [1 - test_size_fraction, test_size_fraction]) # Training the model from pyspark.ml.classification import NaiveBayes #Naive bayes nb = NaiveBayes(featuresCol="features", labelCol="labelIndex", predictionCol="NB_pred", probabilityCol="NB_prob", rawPredictionCol="NB_rawPred") nbModel = nb.fit(training) cv = nbModel.transform(test) total = cv.count() correct = cv.where(cv['labelIndex'] == cv['NB_pred']).count() accuracy = correct / total # Saving trained model for usage in a Pipeline (so you don't need to re-train everytime you need to use it) model_folder = os.path.join(base_folder, 'saved_models') print(model_folder) if not os.path.exists(model_folder): os.makedirs(model_folder) model_full_path = os.path.join(model_folder, "twitter_sentiment_spark") if files_source == "hdfs": model_full_path = "file://" + model_full_path nbModel.write().overwrite().save(model_full_path) # Save Labels reference table labels = cv.select("labelIndex", "label").distinct() \ .withColumnRenamed("label", "label_predicted") \ .withColumnRenamed("labelIndex", "label_id") labels.toPandas().to_csv(os.path.join(model_folder, "labels.csv"), index=False) # Save evaluations sys.stdout = open(os.path.join(model_folder, "evaluation.txt"), "w") print("\nTotal:", total, "\nCorrect:", correct, "\nAccuracy:", accuracy) sys.stdout.close() # Delete temporary folder if os.path.exists(temporary_folder): shutil.rmtree(temporary_folder)
.master("local") \ .appName("KaggleLab") \ .getOrCreate() trainingData = spark.read.csv( "/Users/thai-anthantrong/Documents/MS_BIG_DATA/Cours/SD701/KaggleLab/train-data.csv", header=True, inferSchema=True) testData = spark.read.csv( "/Users/thai-anthantrong/Documents/MS_BIG_DATA/Cours/SD701/KaggleLab/test-data.csv", header=True, inferSchema=True) # stages in our Pipeline stages = [] # Index labels, adding metadata to the label column. # Fit on whole trainingData to include all labels in index. labelIndexer = StringIndexer(inputCol="Cover_Type", outputCol="label").fit(trainingData) stages += [labelIndexer] # Convert indexed labels back to original labels. labelConverter = IndexToString(inputCol="prediction", outputCol="Cover_Type_pred", labels=labelIndexer.labels) # All columns all_cols = ["Elevation", "Aspect", "Slope", "Horizontal_Distance_To_Hydrology", "Vertical_Distance_To_Hydrology", "Horizontal_Distance_To_Roadways", "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm", "Horizontal_Distance_To_Fire_Points", "Wilderness_Area1", "Wilderness_Area2", "Wilderness_Area3", "Wilderness_Area4", "Soil_Type1", "Soil_Type2", "Soil_Type3", "Soil_Type4", "Soil_Type5", "Soil_Type6", "Soil_Type7", "Soil_Type8", "Soil_Type9", "Soil_Type10", "Soil_Type11", "Soil_Type12", "Soil_Type13", "Soil_Type14", "Soil_Type15", "Soil_Type16", "Soil_Type17", "Soil_Type18", "Soil_Type19", "Soil_Type20", "Soil_Type21", "Soil_Type22", "Soil_Type23", "Soil_Type24", "Soil_Type25", "Soil_Type26", "Soil_Type27", "Soil_Type28", "Soil_Type29", "Soil_Type30", "Soil_Type31", "Soil_Type32", "Soil_Type33", "Soil_Type34", "Soil_Type35", "Soil_Type36", "Soil_Type37", "Soil_Type38", "Soil_Type39", "Soil_Type40"]
# In[16]:

print "Creating feature vectors"
t0 = time()
dfTrainVec = dfTrain.map(partial(vectorize, dicoUni=dict_broad.value, dicoTri=dictTri_broad.value)).toDF(schema)
dfTestVec = dfTest.map(partial(vectorize, dicoUni=dict_broad.value, dicoTri=dictTri_broad.value)).toDF(schema)
tt = time() - t0
print "Dataframe created in {} second".format(round(tt, 3))

# In[19]:

print "Indexing labels"
t0 = time()
from pyspark.ml.feature import StringIndexer

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainVec)
dfTrainIdx = string_indexer_model.transform(dfTrainVec)
dfTrainIdx.take(1)
tt = time() - t0
print "Done in {} second".format(round(tt, 3))

# In[20]:

from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(featuresCol='featureVectors', labelCol='target_indexed', maxDepth=10)

# In[21]:
# MAGIC # MAGIC For simplicity's sake, we will use One-Hot Encoding to convert all categorical variables into binary vectors. We will use a combination of StringIndexer and OneHotEncoderEstimator to convert the categorical variables. The `OneHotEncoderEstimator` will return a `SparseVector`. # MAGIC # MAGIC Since we will have more than 1 stage of feature transformations, we use a Pipeline to tie the stages together. This simplifies our code. # COMMAND ---------- # MAGIC %md The ML package needs the label and feature vector to be added as columns to the input dataframe. We set up a pipeline to pass the data through transformers in order to extract the features and label. We index each categorical column using the `StringIndexer` to a column of number indices, then convert the indexed categories into one-hot encoded variables with at most a single one-value. These binary vectors are appended to the end of each row. Encoding categorical features allows decision trees to treat categorical features appropriately, improving performance. We then use the `StringIndexer` to encode our labels to label indices. # COMMAND ---------- categoricalColumns = ["OriginAirportCode", "Carrier", "DestAirportCode"] stages = [] # stages in our Pipeline for categoricalCol in categoricalColumns: # Category Indexing with StringIndexer stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index") # Use OneHotEncoderEstimator to convert categorical variables into binary SparseVectors # encoder = OneHotEncoderEstimator(dropLast=False, inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"]) # Using the slightly older OneHotEncoder (instead of OneHotEncoderEstimator) for compatibility reasons when operationalizing within the DSVM encoder = OneHotEncoder(inputCol=stringIndexer.getOutputCol(), outputCol=categoricalCol + "classVec") # Add stages. These are not run here, but will run all at once later on. stages += [stringIndexer, encoder] # Convert label into label indices using the StringIndexer label_stringIdx = StringIndexer(inputCol="DepDel15", outputCol="label") stages += [label_stringIdx] # COMMAND ---------- # MAGIC %md Now we need to use the `VectorAssembler` to combine all the feature columns into a single vector column. This includes our numeric columns as well as the one-hot encoded binary vector columns.
from pyspark.mllib.linalg import VectorUDT
from pyspark.sql.types import StructType, StructField, DoubleType

schema = StructType([StructField('label', DoubleType(), True),
                     StructField('bigramVectors', VectorUDT(), True)])
features = dfBigram.map(partial(vectorizeBi, dico=dict_broad.value)).toDF(schema)
print "Features from bigrams created"

from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import DecisionTreeClassifier

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(features)
featIndexed = string_indexer_model.transform(features)
print "labels indexed"

dt = DecisionTreeClassifier(featuresCol='bigramVectors',
                            labelCol=string_indexer.getOutputCol(), maxDepth=10)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                              labelCol='target_indexed', metricName='precision')

from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
# we can remove the header by setting a value equal to the first row of the rdd
# and then filter it out. then we define schema from the rdd and use that to make
# a dataframe
data = sc.textFile(
    "hdfs://ch1-hadoop-ns/user/bcraft/boosted_trees_test_data.csv").map(
        lambda line: line.split(','))
header = data.first()
data = data.filter(lambda row: row != header)
schema = data.map(lambda x: Row(id=x[0], make=x[1], vdps=x[2], label=x[3]))
df = sqlContext.createDataFrame(schema)

# string indexer for our categorical features
# this indexes each categorical feature and we will
# save them in a data frame that maps the make name to the string
# for persistence purposes
indexer = StringIndexer(inputCol="make", outputCol="makeIDX")
df = indexer.fit(df).transform(df)
make_idx_mappings = df.select('make', 'makeIDX').distinct().show()

# one hot encoder
# this will convert the indexed strings to sparse one hot vectors
# think of this as dummy feature creation
encoder = OneHotEncoder(inputCol="makeIDX", outputCol="make_sparse_vect")
df = encoder.transform(df)

# spark models expect to see a feature vector and a prediction column
# so we need to put all our features into a vector, in this case
# the sparse vector and vdp count, we also have to do some
# data type transformations from string to double
df = df.withColumn("vdp_int", df["vdps"].cast("double"))
df = df.withColumn("label_int", df["label"].cast("double"))
     r.stars,
     r.attributes['Price Range'],
     False if r.attributes['Good For'] is None else r.attributes['Good For']['dinner'],
     False if r.attributes['Good For'] is None else r.attributes['Good For']['lunch'],
     False if r.attributes['Good For'] is None else r.attributes['Good For']['breakfast'],
     False if r.attributes['Ambience'] is None else r.attributes['Ambience']['romantic'],
     False if r.attributes['Ambience'] is None else r.attributes['Ambience']['upscale'],
     False if r.attributes['Ambience'] is None else r.attributes['Ambience']['casual'],
     False if (r.attributes['Alcohol'] is None or r.attributes['Alcohol'] == 'none') else True,
     False if r.attributes['Take-out'] is None else r.attributes['Take-out']]
    ).toDF(clustering_columns)

# drop row with null values
lv_clustering_data = lv_clustering_data.dropna()

# Neighborhood feature engineering
stringIndexer = StringIndexer(inputCol="neighborhood", outputCol="neigh_index")
lv_model = stringIndexer.fit(lv_clustering_data)
lv_indexed = lv_model.transform(lv_clustering_data)
encoder = OneHotEncoder(dropLast=False, inputCol="neigh_index", outputCol="neigh_vec")
lv_encoded = encoder.transform(lv_indexed)

# initial feature set
# assembler = VectorAssembler(
#     inputCols=["stars", "price_range", "neigh_vec"],
#     outputCol="features_vec")

# expanded feature set
feature_columns = clustering_columns[2:]
feature_columns.append("neigh_vec")
assembler = VectorAssembler(
    inputCols=feature_columns,
    ("null", "unknow", "", "None") or x == None else x)
#splitCalUDF = F.udf(lambda x : float(x.split("*")[0])*float(x.split("*")[1]), returnType=StringType())

# Handle missing values
data = data.withColumn("gender", cleanStringUDF("gender"))
# .withColumn("religiousness",splitCalUDF("religiousness"))

# Handle column types
feature1_list = [
    'age', 'label', 'religiousness', 'education', 'occupation', 'rating'
]
feature2_list = ['gender', 'children']
for c in feature1_list:
    data = data.withColumn(c, data[c].cast(DoubleType()))

indexers = [
    StringIndexer(inputCol=c,
                  outputCol='{0}_indexed'.format(c),
                  handleInvalid='error') for c in feature2_list
]
encoders = [
    OneHotEncoder(dropLast=True,
                  inputCol=indexer.getOutputCol(),
                  outputCol="{0}_encoded".format(indexer.getOutputCol()))
    for indexer in indexers
]
assembler = VectorAssembler(inputCols=feature1_list +
                            [encoder.getOutputCol() for encoder in encoders],
                            outputCol="features")
feature_pipeline = Pipeline(stages=indexers + encoders + [assembler])
feature_model = feature_pipeline.fit(data)

# Index y
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

categoricalColumns = ["UNIQUE_CARRIER", "ORIGIN", "DEST"]
numericalColumns = ["DISTANCE"]

# Convert string categorical columns to indexed integers
indexers = [
    StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
    for c in categoricalColumns
]

# OneHot Encoding
encoders = [
    OneHotEncoder(
        inputCol=indexer.getOutputCol(),
        outputCol="{0}_encoded".format(indexer.getOutputCol())
    ) for indexer in indexers
]

# Assembler for categorical columns
assemblerCategorical = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders],
                                       outputCol="cat")
stages = indexers + encoders + [assemblerCategorical]
pipelineCategorical = Pipeline(stages=stages)
df = pipelineCategorical.fit(df).transform(df)

# Assembler for Numerical columns
assemblerNumerical = VectorAssembler(inputCols=numericalColumns, outputCol="num")
pipelineNumerical = Pipeline(stages=[assemblerNumerical])
df = pipelineNumerical.fit(df).transform(df)
    data.append(tmp)
print(len(data))

df = sqlContext.createDataFrame(data, schema=["category", "text"])

# regular expression tokenizer
regex_tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")
# stop words
stop_words = list(set(stopwords.words('english')))
stop_words_remover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(stop_words)
# bag of words count
count_vectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)
label_string_index = StringIndexer(inputCol="category", outputCol="label")
label_string_index.setHandleInvalid("keep")

pipeline = Pipeline(stages=[regex_tokenizer, stop_words_remover, count_vectors, label_string_index])

(training_data, test_data) = df.randomSplit([0.8, 0.2], seed=100)
pipeline_fit = pipeline.fit(training_data)
pipeline_fit.save("rf_pipeline")
training_data_set = pipeline_fit.transform(training_data)
training_data_set.show(5)

# stages = pipeline_fit.stages
# vec = [s for s in stages if isinstance(s, CountVectorizerModel)]
# v1 = vec[0].vocabulary
# print(len(v1))
    data = sc.textFile('/home/minglu/dist_spark/data/' + csv_file)  # This is an RDD, which will later be transformed to a data frame
    data = data.filter(lambda x: x.split(',')[0] != 'label').map(lambda line: line.split(','))
    if train:
        data = data.map(
            lambda line: (Vectors.dense(np.asarray(line[1:]).astype(np.float32)),
                          'class_' + str(line[0]), int(line[0])))
    else:
        # Test data gets dummy labels. We need the same structure as in Train data
        data = data.map(
            lambda line: (Vectors.dense(np.asarray(line[1:]).astype(np.float32)),
                          'class_' + str(line[0]), int(line[0])))
    return sqlcontext.createDataFrame(data, ['features', 'category', 'label'])

train_df = load_data_frame("train.csv")
test_df = load_data_frame("test.csv", shuffle=False, train=False)

from pyspark.ml.feature import StringIndexer

string_indexer = StringIndexer(inputCol="category", outputCol="index_category")
fitted_indexer = string_indexer.fit(train_df)
indexed_df = fitted_indexer.transform(train_df)

from distkeras.transformers import *
from pyspark.ml.feature import OneHotEncoder

#### OneHot
nb_classes = 9
encoder = OneHotTransformer(nb_classes, input_col='label', output_col="label_encoded")
dataset_train = encoder.transform(indexed_df)
dataset_test = encoder.transform(test_df)

### encoder
from pyspark.ml.feature import MinMaxScaler

transformer = MinMaxTransformer(n_min=0.0, n_max=1.0, \
                                o_min=0.0, o_max=250.0, \
print("The data contains %d records." % cars.count(), '\n')

cars = cars.withColumnRenamed("ncyl", "cyl")
cars = cars.withColumn('length_meters', round(cars.length * 0.0254, 3))
cars = cars.withColumn('weight_kg', round(cars.weight / 2.205, 0))
cars = cars.withColumn('avg_mpg', round((cars.city_mpg + cars.hwy_mpg) / 2, 1)) \
    .drop("city_mpg", "hwy_mpg")
cars = cars.withColumn(
    'consumption', round((100 * 3.785411784) / (cars.avg_mpg * 1.609344), 2))
print("Cars with null cyl", cars.filter('cyl IS NULL').count(), '\n')

indexer = StringIndexer(inputCol='origin', outputCol='origin_idx')
# Assign index values to strings
indexer = indexer.fit(cars)
# Create column with index values
cars = indexer.transform(cars)

pd.set_option('display.max_columns', None)  # all cols
pd.set_option('display.width', 161)
print(cars.toPandas().sample(12))
print(indexer)

# View the first five records
cars.sample(False, .25).show()

# Check column data types
from pyspark.sql.types import StringType

df = df.withColumn("Pclass", df["Pclass"].cast(StringType()))
df = df.withColumn("SibSp", df["SibSp"].cast(StringType()))
df = df.withColumn("Parch", df["Parch"].cast(StringType()))
df = df.withColumn("Relatives", df["Relatives"].cast(StringType()))

from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler

# Categorical features
categoricalColumns = [
    'Sex', 'Embarked', 'Pclass', 'SibSp', 'Parch', 'Relatives'
]
stages = []
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + 'Index')
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()],
                                     outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]

# Numeric features
numericCols = ['Age', 'Fare']
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

# Pipeline
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(df)
def main(base_path): APP_NAME = "train_spark_mllib_model.py" # If there is no SparkSession, create the environment try: sc and spark except NameError as e: import findspark findspark.init() import pyspark import pyspark.sql sc = pyspark.SparkContext() spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate() # # { # "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00", # "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0, # "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS" # } # from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType from pyspark.sql.types import StructType, StructField from pyspark.sql.functions import udf schema = StructType([ StructField("ArrDelay", DoubleType(), True), StructField("CRSArrTime", TimestampType(), True), StructField("CRSDepTime", TimestampType(), True), StructField("Carrier", StringType(), True), StructField("DayOfMonth", IntegerType(), True), StructField("DayOfWeek", IntegerType(), True), StructField("DayOfYear", IntegerType(), True), StructField("DepDelay", DoubleType(), True), StructField("Dest", StringType(), True), StructField("Distance", DoubleType(), True), StructField("FlightDate", DateType(), True), StructField("FlightNum", StringType(), True), StructField("Origin", StringType(), True), StructField("Route", StringType(), True), StructField("TailNum", StringType(), True), StructField("EngineManufacturer", StringType(), True), StructField("EngineModel", StringType(), True), StructField("Manufacturer", StringType(), True), StructField("ManufacturerYear", StringType(), True), StructField("OwnerState", StringType(), True), ]) input_path = "{}/data/simple_flight_delay_features_airplanes.json".format( base_path ) features = spark.read.json(input_path, schema=schema) features.first() # # Add the hour of day of scheduled arrival/departure # from pyspark.sql.functions import hour features_with_hour = features.withColumn( "CRSDepHourOfDay", hour(features.CRSDepTime) ) features_with_hour = features_with_hour.withColumn( "CRSArrHourOfDay", hour(features.CRSArrTime) ) features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime", "CRSArrHourOfDay").show() # # Check for nulls in features before using Spark ML # null_counts = [(column, features_with_hour.where(features_with_hour[column].isNull()).count()) for column in features_with_hour.columns] cols_with_nulls = filter(lambda x: x[1] > 0, null_counts) print("\nNull Value Report") print("-----------------") print(tabulate(cols_with_nulls, headers=["Column", "Nulls"])) # # Use pysmark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2) # from pyspark.ml.feature import Bucketizer # Setup the Bucketizer splits = [-float("inf"), -15.0, 0, 30.0, float("inf")] arrival_bucketizer = Bucketizer( splits=splits, inputCol="ArrDelay", outputCol="ArrDelayBucket" ) # Save the model arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path) arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path) # Apply the model ml_bucketized_features = arrival_bucketizer.transform(features_with_hour) ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show() # # Extract features tools in with pyspark.ml.feature # from pyspark.ml.feature import StringIndexer, VectorAssembler # Turn category fields into indexes string_columns = ["Carrier", "Origin", 
"Dest", "Route", "TailNum"] for column in string_columns: string_indexer = StringIndexer( inputCol=column, outputCol=column + "_index" ) string_indexer_model = string_indexer.fit(ml_bucketized_features) ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features) # Save the pipeline model string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format( base_path, column ) string_indexer_model.write().overwrite().save(string_indexer_output_path) # Combine continuous, numeric fields with indexes of nominal ones # ...into one feature vector numeric_columns = [ "DepDelay", "Distance", "DayOfYear", "CRSDepHourOfDay", "CRSArrHourOfDay"] index_columns = [column + "_index" for column in string_columns] vector_assembler = VectorAssembler( inputCols=numeric_columns + index_columns, outputCol="Features_vec" ) final_vectorized_features = vector_assembler.transform(ml_bucketized_features) # Save the numeric vector assembler vector_assembler_path = "{}/models/numeric_vector_assembler_5.0.bin".format(base_path) vector_assembler.write().overwrite().save(vector_assembler_path) # Drop the index columns for column in index_columns: final_vectorized_features = final_vectorized_features.drop(column) # Inspect the finalized features final_vectorized_features.show() # # Cross validate, train and evaluate classifier: loop 5 times for 4 metrics # from collections import defaultdict scores = defaultdict(list) feature_importances = defaultdict(list) metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"] split_count = 3 for i in range(1, split_count + 1): print("\nRun {} out of {} of test/train splits in cross validation...".format( i, split_count, ) ) # Test/train split training_data, test_data = final_vectorized_features.randomSplit([0.8, 0.2]) # Instantiate and fit random forest classifier on all the data from pyspark.ml.classification import RandomForestClassifier rfc = RandomForestClassifier( featuresCol="Features_vec", labelCol="ArrDelayBucket", predictionCol="Prediction", maxBins=4896, ) model = rfc.fit(training_data) # Save the new model over the old one model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format( base_path ) model.write().overwrite().save(model_output_path) # Evaluate model using test data predictions = model.transform(test_data) # Evaluate this split's results for each metric from pyspark.ml.evaluation import MulticlassClassificationEvaluator for metric_name in metric_names: evaluator = MulticlassClassificationEvaluator( labelCol="ArrDelayBucket", predictionCol="Prediction", metricName=metric_name ) score = evaluator.evaluate(predictions) scores[metric_name].append(score) print("{} = {}".format(metric_name, score)) # # Collect feature importances # feature_names = vector_assembler.getInputCols() feature_importance_list = model.featureImportances for feature_name, feature_importance in zip(feature_names, feature_importance_list): feature_importances[feature_name].append(feature_importance) # # Evaluate average and STD of each metric and print a table # import numpy as np score_averages = defaultdict(float) # Compute the table data average_stds = [] # ha for metric_name in metric_names: metric_scores = scores[metric_name] average_accuracy = sum(metric_scores) / len(metric_scores) score_averages[metric_name] = average_accuracy std_accuracy = np.std(metric_scores) average_stds.append((metric_name, average_accuracy, std_accuracy)) # Print the table print("\nExperiment Log") print("--------------") 
print(tabulate(average_stds, headers=["Metric", "Average", "STD"])) # # Persist the score to a sccore log that exists between runs # import pickle # Load the score log or initialize an empty one try: score_log_filename = "{}/models/score_log.pickle".format(base_path) score_log = pickle.load(open(score_log_filename, "rb")) if not isinstance(score_log, list): score_log = [] except IOError: score_log = [] # Compute the existing score log entry score_log_entry = { metric_name: score_averages[metric_name] for metric_name in metric_names } # Compute and display the change in score for each metric try: last_log = score_log[-1] except (IndexError, TypeError, AttributeError): last_log = score_log_entry experiment_report = [] for metric_name in metric_names: run_delta = score_log_entry[metric_name] - last_log[metric_name] experiment_report.append((metric_name, run_delta)) print("\nExperiment Report") print("-----------------") print(tabulate(experiment_report, headers=["Metric", "Score"])) # Append the existing average scores to the log score_log.append(score_log_entry) # Persist the log for next run pickle.dump(score_log, open(score_log_filename, "wb")) # # Analyze and report feature importance changes # # Compute averages for each feature feature_importance_entry = defaultdict(float) for feature_name, value_list in feature_importances.items(): average_importance = sum(value_list) / len(value_list) feature_importance_entry[feature_name] = average_importance # Sort the feature importances in descending order and print import operator sorted_feature_importances = sorted( feature_importance_entry.items(), key=operator.itemgetter(1), reverse=True ) print("\nFeature Importances") print("-------------------") print(tabulate(sorted_feature_importances, headers=['Name', 'Importance'])) # # Compare this run's feature importances with the previous run's # # Load the feature importance log or initialize an empty one try: feature_log_filename = "{}/models/feature_log.pickle".format(base_path) feature_log = pickle.load(open(feature_log_filename, "rb")) if not isinstance(feature_log, list): feature_log = [] except IOError: feature_log = [] # Compute and display the change in score for each feature try: last_feature_log = feature_log[-1] except (IndexError, TypeError, AttributeError): last_feature_log = defaultdict(float) for feature_name, importance in feature_importance_entry.items(): last_feature_log[feature_name] = importance # Compute the deltas feature_deltas = {} for feature_name in feature_importances.keys(): run_delta = feature_importance_entry[feature_name] - last_feature_log[feature_name] feature_deltas[feature_name] = run_delta # Sort feature deltas, biggest change first import operator sorted_feature_deltas = sorted( feature_deltas.items(), key=operator.itemgetter(1), reverse=True ) # Display sorted feature deltas print("\nFeature Importance Delta Report") print("-------------------------------") print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"])) # Append the existing average deltas to the log feature_log.append(feature_importance_entry) # Persist the log for next run pickle.dump(feature_log, open(feature_log_filename, "wb"))
23 from pyspark.sql.functions import date_format #data_3 = data_2.withColumn("new_dates",date_format("dates","YYYY-MM-dd HH")) data_3 = data_2.withColumn("new_dates",date_format("dates","YYYY-MM-dd")) data_3.createOrReplaceTempView("data_3") data_3.take(1) [Row(Agency='NYPD', Complaint Type='Noise - Commercial', Descriptor='Loud Music/Party', Location Type='Club/Bar/Restaurant', Incident Zip='11238', City='BROOKLYN', Borough='BROOKLYN', Latitude='40.677476821236894', Longitude='-73.96893730309779', dates=datetime.datetime(2010, 3, 6, 23, 38, 30), new_dates='2010-03-06 23')] >>> from pyspark.ml.feature import StringIndexer >>> indexer = StringIndexer(inputCol = "Agency",outputCol = "Agency_onehot") >>> indexed = indexer.fit(data_3).transform(data_3) >>> indexed.take(5) # finding the unique values in a column data_3.select("Agency").distinct().show() agency_uniq = data_3.select("Agency").distinct().rdd.map(lambda r: r[0]).collect() len(agency_uniq) 29 data_3.select("Complaint Type").distinct().show() complaint_uniq = data_3.select("Complaint Type").distinct().rdd.map(lambda r: r[0]).collect() len(complaint_uniq) 279 complaint_uniq_list = ['Traffic Signal Condition', 'Cranes and Derricks', 'SAFETY', 'ELECTRIC', 'Tanning', 'DOOR/WINDOW', 'Comments', 'Noise - Helicopter', 'STRUCTURAL', 'Broken Parking Meter', 'Window Guard', 'Broken Muni Meter', 'Highway Condition', 'Street Condition', 'FLOORING/STAIRS', 'Hazardous Materials', 'DOF Literature Request', 'Vending', 'Ferry Permit', 'PAINT - PLASTER', 'Taxi Report', 'OUTSIDE BUILDING', 'Advocate-Prop Refunds/Credits', 'Drinking Water', 'UNSANITARY CONDITION', 'Public Toilet', 'Bus Stop Shelter Complaint', 'GENERAL CONSTRUCTION', 'Municipal Parking Facility', 'DOF Property - RPIE Issue', 'Mosquitoes', 'DOF Property - Reduction Issue', 'Taxi Compliment', 'Animal in a Park', 'Animal Abuse', 'Advocate-Business Tax', 'Smoking', 'Illegal Animal Kept as Pet', 'Parking Card', 'Injured Wildlife', 'Noise - House of Worship', 'AGENCY', 'DHS Advantage -Landlord/Broker', 'Asbestos/Garbage Nuisance', 'Advocate - Levy', 'MOLD', 'Sanitation Condition', 'Special Natural Area District (SNAD)', 'Home Delivered Meal Complaint', 'Illegal Parking', 'APPLIANCE', 'Building Condition', 'Noise - Residential', 'Portable Toilet', 'Illegal Animal - Sold/Kept', 'Sewer', 'Drug Activity', 'Registration and Transfers', 'Killing/Trapping Pigeon', 'DOF Parking - DMV Clearance', 'Noise Survey', 'Noise - Commercial', 'Highway Sign - Dangling', 'Water System', 'Adopt-A-Basket', 'Squeegee', 'Air Quality', 'Advocate-Co-opCondo Abatement', 'Lead', 'Street Sign - Missing', 'Home Delivered Meal - Missed Delivery', 'Lost Property', 'Health', 'SG-99', 'DRIE', 'DCA / DOH New License Application Request', 'CONSTRUCTION', 'Derelict Vehicle', 'ELEVATOR', 'OEM Literature Request', 'NONCONST', 'DOF Property - Property Value', 'LinkNYC', 'Senior Center Complaint', 'Sweeping/Missed-Inadequate', 'Utility Program', 'DOF Property - City Rebate', 'X-Ray Machine/Equipment', 'Water Maintenance', 'Advocate-Commercial Exemptions', 'Blocked Driveway', 'Beach/Pool/Sauna Complaint', 'Homeless Encampment', 'Housing - Low Income Senior', 'Bike/Roller/Skate Chronic', 'Taxi Complaint', 'Sidewalk Condition', 'HEAT/HOT WATER', 'Unspecified', 'Meals Home Delivery Required', 'HEAP Assistance', 'Litter Basket / Request', 'Lifeguard', 'Stalled Sites', 'DOF Parking - Address Update', 'Maintenance or Facility', 'Consumer Complaint', 'For Hire Vehicle Complaint', 'Water Conservation', 'Research Questions', 'HPD Literature 
Request', 'Illegal Tree Damage', 'CST', 'Advocate - Lien', 'DOF Parking - Tax Exemption', 'Request Xmas Tree Collection', 'Benefit Card Replacement', 'Indoor Sewage', 'Weatherization', 'Asbestos', 'Unsanitary Animal Pvt Property', 'Discipline and Suspension', 'Water Quality', 'Derelict Bicycle', 'Sweeping/Missed', 'Eviction', 'GENERAL', 'Standing Water', 'Noise - Park', 'Construction', 'Cooling Tower', 'Bus Stop Shelter Placement', 'DOR Literature Request', 'Poison Ivy', 'Missed Collection (All Materials)', 'Disorderly Youth', 'Highway Sign - Damaged', 'PAINT/PLASTER', 'Bike Rack Condition', 'Non-Residential Heat', 'Illegal Animal Sold', 'Forensic Engineering', 'Home Care Provider Complaint', 'Other Enforcement', 'Found Property', 'Homeless Person Assistance', 'Posting Advertisement', 'Legal Services Provider Complaint', 'Scaffold Safety', 'Miscellaneous Categories', 'Recycling Enforcement', 'LEAD', 'Noise', 'Home Repair', 'Elder Abuse', 'Advocate - Other', 'New Tree Request', 'Boilers', 'DOF Property - Update Account', 'Industrial Waste', 'Sweeping/Inadequate', 'DOF Property - Owner Issue', 'Tattooing', "Alzheimer's Care", 'Dead/Dying Tree', 'Forms', 'Mold', 'Collection Truck Noise', 'SNW', 'Street Light Condition', 'Plumbing', 'Calorie Labeling', 'Ferry Complaint', 'DOF Parking - Payment Issue', 'Elevator', 'Day Care', 'Building/Use', 'DOF Property - Request Copy', 'Homebound Evacuation 4', 'Trans Fat', 'Advocate-UBT', 'Bridge Condition', 'Drinking', 'Housing Options', 'Request Large Bulky Item Collection', 'Public Payphone Complaint', 'Transportation Provider Complaint', 'Summer Camp', 'PLUMBING', 'BEST/Site Safety', 'NORC Complaint', 'Case Management Agency Complaint', 'Taxpayer Advocate Inquiry', 'No Child Left Behind', 'Emergency Response Team (ERT)', 'Question', 'Animal Facility - No Permit', 'Advocate - RPIE', 'Trapping Pigeon', 'FHE', 'Standpipe - Mechanical', 'Root/Sewer/Sidewalk Condition', 'City Vehicle Placard Complaint', 'Parent Leadership', 'DHS Advantage - Third Party', 'Street Sign - Damaged', 'Investigations and Discipline (IAD)', 'Safety', 'Food Poisoning', 'Non-Emergency Police Matter', 'Unlicensed Dog', 'General Construction/Plumbing', 'Panhandling', 'Teaching/Learning/Instruction', 'HEATING', 'Street Sign - Dangling', 'DOF Parking - Request Status', 'Dead Tree', 'Damaged Tree', 'Advocate-SCRIE/DRIE', 'Select Message Type...', 'SCRIE', 'Noise - Vehicle', 'Special Projects Inspection Team (SPIT)', 'Interior Demo', 'Traffic/Illegal Parking', 'Overflowing Recycling Baskets', 'Snow', 'Rodent', 'Radioactive Material', 'Foam Ban Enforcement', 'Highway Sign - Missing', 'Unsanitary Animal Facility', 'Overflowing Litter Baskets', 'Harboring Bees/Wasps', 'Bottled Water', 'Hazardous Material', 'Illegal Fireworks', 'Unleashed Dog', 'Traffic', 'Food Establishment', 'Derelict Vehicles', 'WATER LEAK', 'Advocate-Personal Exemptions', 'Graffiti', 'VACANT APARTMENT', 'DPR Internal', 'OEM Disabled Vehicle', 'Noise - Street/Sidewalk', 'Dirty Conditions', 'Plant', 'FCST', 'Electronics Waste', 'Curb Condition', 'Violation of Park Rules', 'Tunnel Condition', 'Indoor Air Quality', 'SRDE', 'DOF Property - State Rebate', 'Bereavement Support Group', 'For Hire Vehicle Report', 'DOF Parking - Request Copy', 'Urinating in Public', 'Ferry Inquiry', 'Unsanitary Pigeon Condition', 'Vacant Lot', 'DHS Income Savings Requirement', 'General Question', 'Overgrown Tree/Branches', 'DOF Property - Payment Issue', 'Advocate-Prop Class Incorrect', 'FATF', 'Damaged or Dead Tree', 'School Maintenance', 'DHS 
Advantage - Tenant', 'ATF', 'Advocate-Property Value', 'Electrical', 'Pet Shop']
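A minimal sketch of the same indexing step applied to the higher-cardinality column explored above (assuming the data_3 DataFrame and the StringIndexer import already shown in this snippet); the fitted model's labels attribute exposes the frequency-ordered mapping:
complaint_indexer = StringIndexer(inputCol="Complaint Type", outputCol="ComplaintType_index")
complaint_model = complaint_indexer.fit(data_3)
# labels are ordered by descending frequency, so index 0.0 is the most frequent complaint type
print(complaint_model.labels[:10])
complaint_model.transform(data_3).select("Complaint Type", "ComplaintType_index").show(5)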
os.makedirs(output_dir) features = sqc.read.parquet(input_features) features = features.filter(features['cls']!='None')\ .select(['cls', 'features'])\ .cache() print features features = sqc.createDataFrame(features.map(normalizer)) print features training, valid = features.randomSplit([0.75, 0.25]) labelIndexer = StringIndexer(inputCol="cls", outputCol="label") model = labelIndexer.fit(training) training = model.transform(training).rdd.map(lambda row: LabeledPoint(row.label, row.features)) valid = model.transform(valid).rdd.map(lambda row: LabeledPoint(row.label, row.features)) print training.first() #lr = LogisticRegression() #pipeline = Pipeline(stages=[labelIndexer,lr]) # fit model = LogisticRegressionWithLBFGS.train(training, numClasses=10) #model = pipeline.fit(training)
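For contrast, a minimal sketch of the DataFrame-based route that the commented-out Pipeline lines hint at, assuming training and valid are kept as DataFrames rather than mapped to LabeledPoints (labelIndexer and the 'features' column are reused from the snippet above):
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=100)
pipeline_model = Pipeline(stages=[labelIndexer, lr]).fit(training)  # training left as a DataFrame here
predictions = pipeline_model.transform(valid)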
spark = SparkSession.builder.appName('Create tf-idf').getOrCreate() data = spark.read.load(sys.argv[1]) df = data.filter((col('date') >= '1895') & (col('seq') =='1')) \ .select(year('date').alias('year'), 'id', 'text') # https://danvatterott.com/blog/2018/07/08/aggregating-sparse-and-dense-vectors-in-pyspark/ def dense_to_array(v): new_array = list([float(x) for x in v]) return new_array dense_to_array_udf = udf(dense_to_array, ArrayType(FloatType())) indexer = StringIndexer(inputCol="id", outputCol="label") tokenizer = Tokenizer(inputCol="text", outputCol="tokens") vectorizer = CountVectorizer(inputCol="tokens", outputCol="rawFeatures") idf = IDF(inputCol="rawFeatures", outputCol="vector", minDocFreq=1) pipeline = Pipeline(stages=[indexer, tokenizer, vectorizer, idf]) model = pipeline.fit(df) results = model.transform(df) \ .select(year('date').alias('year'), 'label', 'vector') \ .withColumn('vector', dense_to_array_udf('vector')) results = model.transform(df).select('year', 'label', 'vector') results.write \ .partitionBy('year') \
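As a side note, on Spark 3.0+ the hand-rolled dense_to_array UDF can be replaced by the built-in vector_to_array helper; a minimal sketch assuming the same fitted pipeline model and 'vector' column as above:
from pyspark.ml.functions import vector_to_array

results = model.transform(df) \
    .select(year('date').alias('year'), 'label', vector_to_array('vector').alias('vector'))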
return (row.label,SparseVector(len(dico),vector_dict)) from pyspark.mllib.linalg import VectorUDT from pyspark.sql.types import StructType, StructField,DoubleType schema = StructType([StructField('label',DoubleType(),True),StructField('Vectors',VectorUDT(),True)]) features=dfTrainTok.map(partial(vectorize,dico=dict_broad.value)).toDF(schema) print "Features created" from pyspark.ml.feature import StringIndexer string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed') string_indexer_model = string_indexer.fit(features) featIndexed = string_indexer_model.transform(features) print "labels indexed" lr = LogisticRegression(featuresCol='Vectors', labelCol=string_indexer.getOutputCol()) from pyspark.ml.evaluation import MulticlassClassificationEvaluator evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision') lr_model = lr.fit(featIndexed) dfTestTok = tokenizer.transform(dfTest) featuresTest=dfTestTok.map(partial(vectorize,dico=dict_broad.value)).toDF(schema) testIndexed = string_indexer_model.transform(featuresTest)
train_df = spark.sql(query) train_df = train_df.withColumn('id', F.col('id') - 1) query = """ select category, text, row_number() over (order by id1) as id from test_df """ test_df = spark.sql(query) test_df = test_df.withColumn('id', F.col('id') - 1) test_df.show(5) ######################################################################################################## # Build pipeline and run indexer = StringIndexer(inputCol="category", outputCol="label") tokenizer = RegexTokenizer(pattern=u'\W+', inputCol="text", outputCol="words", toLowercase=False) hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures") idf = IDF(inputCol="rawFeatures", outputCol="features") lr = LogisticRegression(maxIter=20, regParam=0.001) # Builing model pipeline pipeline = Pipeline(stages=[indexer, tokenizer, hashingTF, idf, lr]) # Train model on training set model = pipeline.fit( train_df ) #if you give new names to your indexed datasets, make sure to make adjustments here
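A minimal sketch of the scoring step that typically follows, assuming the fitted pipeline model and the test_df prepared above:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

predictions = model.transform(test_df)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
print("test accuracy: %s" % evaluator.evaluate(predictions))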
rdd = labeledRdd.map(lambda doc: (cleanLower(doc[0]), doc[1])) print "Text is cleaned" sqlContext = SQLContext(sc) df = sqlContext.createDataFrame(rdd, ["review", "label"]) dfTrain, dfTest = df.randomSplit([0.8, 0.2]) print "Random split is done" tokenizerNoSw = tr.NLTKWordPunctTokenizer( inputCol="review", outputCol="wordsNoSw", stopwords=set(nltk.corpus.stopwords.words("english")) ) hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol="reviews_tf") idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf") string_indexer = StringIndexer(inputCol="label", outputCol="target_indexed") dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10) pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt]) # **************************************************************** # *********************CROSS VALIDATION: 80%/20%****************** # *******************Model: DecisionTreeClassifier***************** # ***************************************************************** evaluator = MulticlassClassificationEvaluator( predictionCol="prediction", labelCol="target_indexed", metricName="precision" ) grid = ParamGridBuilder().baseOn([evaluator.metricName, "precision"]).addGrid(dt.maxDepth, [10, 20]).build()
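A minimal sketch of wiring the grid into a CrossValidator, assuming the pipeline, evaluator, dfTrain and dfTest defined above:
from pyspark.ml.tuning import CrossValidator

cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid, evaluator=evaluator, numFolds=5)
cv_model = cv.fit(dfTrain)
print("cross-validated %s: %s" % (evaluator.getMetricName(), evaluator.evaluate(cv_model.transform(dfTest))))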
strat_train_df = train_df.stat.sampleBy('delayed', fractions, seed=rnd_seed) strat_train_df.groupBy("delayed").count().show() # count of delayed=0.0 count_not_delayed = strat_train_df.groupBy("delayed").count().where("delayed = 0.0").select(["count"]).first()[0] # count of delayed=1.0 count_delayed = strat_train_df.groupBy("delayed").count().where("delayed = 1.0").select(["count"]).first()[0] total = count_not_delayed + count_delayed print("Not Delayed: {0}%, Delayed: {1}%".format(np.round(100 * float(count_not_delayed) / total, 2), np.round(100 * float(count_delayed) / total, 2))) colName ="carrier" carrierIndexer = StringIndexer(inputCol=colName, outputCol="{0}_indexed".format(colName)).fit(strat_train_df) indexed_df = carrierIndexer.transform(strat_train_df) # create a new "carrier_indexed" column (indexed_df.select(["origin", "dest", "carrier", "carrier_indexed"]).sample(fraction=0.001, withReplacement=False, seed=rnd_seed).show()) # check the encoded carrier values carrierIndexer.labels # check the carrier code and index mapping
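A brief sketch of the carrier-to-index mapping referenced in the comment above: StringIndexer orders labels by descending frequency, so position i in carrierIndexer.labels corresponds to the encoded value float(i).
carrier_mapping = {label: float(idx) for idx, label in enumerate(carrierIndexer.labels)}
print(carrier_mapping)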
pandas_df['dayofweek'] = pandas_df['Dates'].dt.dayofweek pandas_df['week'] = pandas_df['Dates'].dt.weekofyear pandas_df['x_sim'] = pandas_df['X'].str[1:8] pandas_df['X'] = pandas_df['X'].str[1:8] pandas_df['y_sim'] = pandas_df['Y'].str[0:6] pandas_df['X'] = pd.to_numeric(pandas_df['X']) pandas_df['Y'] = pd.to_numeric(pandas_df['Y']) pandas_df['x_sim'] = pd.to_numeric(pandas_df['x_sim']) pandas_df['y_sim'] = pd.to_numeric(pandas_df['y_sim']) #send back to the RDD data_df = sqlContext.createDataFrame(pandas_df) #encode the police dept as a feature stringIndexer = StringIndexer(inputCol="PdDistrict", outputCol="PdDistrict_Index") model = stringIndexer.fit(data_df) indexed = model.transform(data_df) encoder = OneHotEncoder(dropLast=False, inputCol="PdDistrict_Index", outputCol="pd") encoded = encoder.transform(indexed) #remove data_df from memory data_df.unpersist() #encode the dependent variable - category_predict classifyIndexer = StringIndexer(inputCol="Category", outputCol="Category_Index") classifymodel = classifyIndexer.fit(encoded) encoded2 = classifymodel.transform(encoded)
dataset = dataset.withColumn( "classWeights", when(dataset.default_payment_next_month == 1, BalancingRatio).otherwise(1 - BalancingRatio)) dataset.select("classWeights").show(5) display(dataset.show(3)) cols = dataset.columns categoricalColumns = ["SEX", "EDUCATION", "MARRIAGE"] stages = [] # stages in our Pipeline for categoricalCol in categoricalColumns: # Category Indexing with StringIndexer stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index") # Use OneHotEncoder to convert categorical variables into binary SparseVectors # encoder = OneHotEncoderEstimator(inputCol=categoricalCol + "Index", outputCol=categoricalCol + "classVec") encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"]) # Add stages. These are not run here, but will run all at once later on. stages += [stringIndexer, encoder] # Convert label into label indices using the StringIndexer label_stringIdx = StringIndexer(inputCol="default_payment_next_month", outputCol="label") stages += [label_stringIdx] # Transform all features into a vector using VectorAssembler numericCols = [ "LIMIT_BAL", "AGE", "PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5",
StructField("device_conn_type", StringType(), True), StructField("C14", DoubleType(), True), StructField("C15", DoubleType(), True), StructField("C16", DoubleType(), True), StructField("C17", DoubleType(), True), StructField("C18", DoubleType(), True), StructField("C19", DoubleType(), True), StructField("C20", DoubleType(), True), StructField("C21", DoubleType(), True) ]) from pyspark.ml.feature import StringIndexer ## Index labels, adding metadata to the label column. ## Fit on whole dataset to include all labels in index. data = StringIndexer(inputCol="click", outputCol="label").fit(data).transform(data) data.show() ## 可產生另一個檔案.transform(data)不一定要在(data)檔案裡 #labelIndexer ===> data # RFormula from pyspark.ml.feature import RFormula ## RFormula: string input colums will be one-hot encoded, and numeric columns will be cast to doubles. ##特徵值要被修正formula" " formula = RFormula( formula="label ~ banner_pos + app_id + site_category + site_id + site_domain + device_type + device_conn_type", #formula="label ~ banner_pos + app_id + site_category + site_id + site_domain + C14 + C17 + C18 + C19 + C21", #0.707636 #formula="label ~ banner_pos + site_id + site_domain + C14 + C17 + C21", #0.7 featuresCol="features", labelCol="label")
def analyze(spark): # load the pre-processed input files (all csv files) # path = "../data/pre-processed/*.csv" # allcsv = glob.glob(path) # input_file = allcsv # path = "preprocessed_data.csv" # allcsv = glob.glob(path) # input_file = allcsv # create spark session # spark = SparkSession.builder.appName("TestRBM").getOrCreate() # spark.sparkContext.setCheckpointDir("checkpoint/") # spark.sparkContext.setLogLevel("WARN") # # # read input files # df = spark.read \ # .option("header", "true") \ # .option("treatEmptyValuesAsNulls", "true") \ # .option("inferSchema", "true") \ # .option("charset", "UTF-8") \ # .csv(input_file) # df = df.select("MASV1", "F_MAMH", "F_MAKH", "TKET") # df = df.filter(df["F_MAKH"] == "MT") # # print(df.count()) # df = df.withColumn("MASV1", df["MASV1"].cast(DoubleType())) # df = df.withColumn("MASV1", df["MASV1"].cast(LongType())) # df = df.withColumn("TKET", df["TKET"].cast(DoubleType())) # df.show() # print(df.rdd.getNumPartitions()) # need to split train and test by MASV (student ID) spark.sparkContext.setCheckpointDir("hdfs://node3:54311/") # spark.sparkContext.setLogLevel("INFO") print("#####################Split train test######################") # train_df, test_df = get_train_test(df, spark) # test_input_output_df = test_df.randomSplit([0.8, 0.2]) # [0] is the input; [1] is the expected output, later joined with the RBM output to check whether the results look right # train_df.coalesce(1).write.csv('train_df.csv') # test_input_output_df[0].coalesce(1).write.csv("test_input_df.csv") # test_input_output_df[1].coalesce(1).write.csv("test_output_df.csv") # train_df.toPandas().to_csv('train_df1.csv') # test_input_output_df[0].toPandas().to_csv("test_input_df1.csv") # test_input_output_df[1].toPandas().to_csv("test_output_df1.csv") train_df = load_csv_file("data/train_df1.csv", spark) test_input_output_df = [ load_csv_file("data/test_input_df1.csv", spark), load_csv_file("data/test_output_df1.csv", spark) ] # train_df.show() # preprocess input # TKET to int (double score) print( "#####################Double Score To Index SoftMax######################" ) train_input_rbm_df = train_df.withColumn("TKET", round(col("TKET") * 2).cast(LongType()))\ .drop("F_MAKH") test_input_rbm_df = test_input_output_df[0].withColumn("TKET", round(col("TKET") * 2).cast(LongType()))\ .drop("F_MAKH") # print(train_input_rbm_df.count()) # print(train_input_rbm_df.select("MASV1", "F_MAMH", "TKET").distinct().count()) # print(train_input_rbm_df.select("MASV1", "F_MAMH").distinct().count()) train_input_rbm_df = train_input_rbm_df.groupBy("MASV1", "F_MAMH").agg(collect_list("TKET").alias("list_TKET"))\ .withColumn("TKET", col("list_TKET")[0]) # print(train_input_rbm_df.count()) # print(train_input_rbm_df.select("MASV1", "F_MAMH", "TKET").distinct().count()) # print(train_input_rbm_df.select("MASV1", "F_MAMH").distinct().count()) # print(test_input_rbm_df.count()) # print(test_input_rbm_df.select("MASV1", "F_MAMH", "TKET").distinct().count()) # print(test_input_rbm_df.select("MASV1", "F_MAMH").distinct().count()) test_input_rbm_df = test_input_rbm_df.groupBy("MASV1", "F_MAMH").agg(collect_list("TKET").alias("list_TKET"))\ .withColumn("TKET", col("list_TKET")[0]) # print(test_input_rbm_df.count()) # print(test_input_rbm_df.select("MASV1", "F_MAMH", "TKET").distinct().count()) # print(test_input_rbm_df.select("MASV1", "F_MAMH").distinct().count()) # train_input_rbm_df = train_input_rbm_df.withColumn("SoftmaxIndex", col("TKET").cast(LongType()))\ # .withColumn("Active", lit(1)) # train_input_rbm_df.show() # train_input_rbm_df.cache() # #to softmax
print("#####################To Binary SoftMax######################") value_to_binary_softmax_model = ValueToBinarySoftMaxModel(spark)\ .setItemCol("F_MAMH")\ .setOutputCol("Active")\ .setSoftMaxUnit(21)\ .setValueCol("TKET") train_input_rbm_df = value_to_binary_softmax_model.transform( train_input_rbm_df) train_input_rbm_df.printSchema() train_input_rbm_df.show() test_input_rbm_df = value_to_binary_softmax_model.transform( test_input_rbm_df) test_input_rbm_df.show() item_df = train_input_rbm_df.select("F_MAMH").distinct() number_of_item = item_df.count() print("Number of item:" + str(number_of_item)) item_indexer = StringIndexer().setInputCol("F_MAMH").setOutputCol( "F_MAMH_index") item_index_model = item_indexer.fit(item_df) item_index_mapping = item_index_model.transform(item_df).withColumn( "F_MAMH_index", col("F_MAMH_index").cast(LongType())).cache() train_input_rbm_df = item_index_model.transform( train_input_rbm_df).withColumn("F_MAMH_index", col("F_MAMH_index").cast(LongType())) train_input_rbm_df.printSchema() test_input_rbm_df = test_input_rbm_df.join(item_index_mapping, ["F_MAMH"]) group_user_model = GroupUserModel(spark, number_of_item)\ .setItemCol("F_MAMH_index")\ .setUserCol("MASV1")\ .setValueCol("Active") train_input_rbm_df = group_user_model.transform(train_input_rbm_df) train_input_rbm_df.printSchema() train_input_rbm_df.show() test_input_rbm_df = group_user_model.transform(test_input_rbm_df) train_input_rbm_df.printSchema() test_input_rbm_df.show() print("#####################Training phase######################") # print(train_input_rbm_df.count()) #Create RBM Model rbm_model = RBM(spark, number_of_item)\ .setUserCol("MASV1")\ .setSoftMaxUnit(21)\ .setValueCol("Active")\ .setLearningRate(0.1)\ .setNumberOfHiddenNode(30)\ .setIterNum(1)\ .setOutputCol("prediction_prob")\ .fit(train_input_rbm_df) print("#####################Predict phase######################") #transform output to expectation (Active la probability) prob_to_expect_model = ProbabilitySoftMaxToExpectationModel(spark).setUserCol("MASV1")\ .setItemCol("F_MAMH_index")\ .setValueCol("prediction_prob")\ .setOutputCol("prediction") #predict output_rbm_df = rbm_model.transform(test_input_rbm_df) output_rbm_df.show() predict_expectation_df = prob_to_expect_model.transform(output_rbm_df)\ .withColumn("prediction", col("prediction") / 2)\ .join(item_index_mapping, ["F_MAMH_index"]) predict_expectation_df.show() predict_test_df = test_input_output_df[1].join(predict_expectation_df, ["MASV1", "F_MAMH"]) predict_test_df.show() #calculate error evaluator = RegressionEvaluator(metricName="rmse", labelCol="TKET", predictionCol="prediction") rmse = evaluator.evaluate(predict_test_df) print("Root-mean-square error = " + str(rmse))
from pyspark.ml.feature import IndexToString, StringIndexer # $example off$ from pyspark.sql import SparkSession if __name__ == "__main__": spark = SparkSession\ .builder\ .appName("IndexToStringExample")\ .getOrCreate() # $example on$ df = spark.createDataFrame( [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")], ["id", "category"]) indexer = StringIndexer(inputCol="category", outputCol="categoryIndex") model = indexer.fit(df) indexed = model.transform(df) print("Transformed string column '%s' to indexed column '%s'" % (indexer.getInputCol(), indexer.getOutputCol())) indexed.show() print("StringIndexer will store labels in output column metadata\n") converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory") converted = converter.transform(indexed) print("Transformed indexed column '%s' back to original string column '%s' using " "labels in metadata" % (converter.getInputCol(), converter.getOutputCol())) converted.select("id", "categoryIndex", "originalCategory").show()
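When the label metadata is not available (for example after writing the indexed data to disk and reading it back), IndexToString can be given the fitted model's labels explicitly; a small sketch reusing the model from above:
converter_explicit = IndexToString(inputCol="categoryIndex", outputCol="originalCategory",
                                   labels=model.labels)
converter_explicit.transform(indexed).select("id", "categoryIndex", "originalCategory").show()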
def oneHot(df, base_col_name, col_name): from pyspark.sql import SparkSession from pyspark import SparkContext, SparkConf import os import time #os.environ['SPARK_HOME'] = '/root/spark-2.1.1-bin' sparkConf = SparkConf() \ .setAppName('pyspark rentmodel') \ .setMaster('local[*]') sc = SparkContext.getOrCreate(sparkConf) sc.setLogLevel('WARN') spark = SparkSession(sparkContext=sc) df = df.select(base_col_name, col_name) df = df.filter(df[base_col_name].isNotNull()) # In this PySpark version, StringIndexer's handleInvalid does not support 'keep', so null values cannot be passed through directly null_col_name = col_name + '_null' df = df.na.fill(null_col_name, col_name) df_NULL = df.filter(df[col_name] == 'NULL') df = df.filter(df[col_name].isNotNull()) df = df.filter(df[col_name] != '') print('one-hot=======', col_name, df.count()) temp_path = '/data/20180621/ALL_58_beijing_save_models/' if df_NULL.count() > 0: def udf_NULL(s): return null_col_name udf_transf = udf(udf_NULL) df_NULL = df_NULL.select('*', udf_transf(col_name).alias('tmp_col_name')) df_NULL = df_NULL.na.fill(null_col_name, 'tmp_col_name') df_NULL = df_NULL.drop(col_name) df_NULL = df_NULL.withColumnRenamed('tmp_col_name', col_name) df_no_NULL = df.filter(df[col_name] != 'NULL') df_no_NULL = df_no_NULL.withColumn('tmp_col_name', df[col_name]) df_no_NULL = df_no_NULL.drop(col_name) df_no_NULL = df_no_NULL.withColumnRenamed('tmp_col_name', col_name) df = df_no_NULL.union(df_NULL) del df_no_NULL index_name = col_name + 'Index' vector_name = col_name + 'Vec' """ StringIndexer can be configured with handleInvalid='skip', but not with handleInvalid='keep'. Using 'skip' silently drops the row that should be skipped, which makes for a poor user experience: a user submits one record and it simply disappears with nothing returned. So we leave it unset for now; when new data contains an unseen string value, one of the already-known strings can be picked at random to stand in for it. """ stringIndexer = StringIndexer(inputCol=col_name, outputCol=index_name) model = stringIndexer.fit(df) indexed = model.transform(df) encoder = OneHotEncoder(dropLast=False, inputCol=index_name, outputCol=vector_name) encoded = encoder.transform(indexed) #save stringIndexer.save(temp_path + 'stringIndexer' + col_name) model.save(temp_path + 'stringIndexer_model' + col_name) # StringIndexer(inputCol=col_name, outputCol=index_name) # onehotEncoderPath = temp_path + col_name # loadedEncoder = OneHotEncoder.load(onehotEncoderPath) # loadedEncoder.setParams(inputCol=index_name, outputCol=vector_name) # encoded = loadedEncoder.transform(df) # encoded.show() onehotEncoderPath = temp_path + col_name + '_new' encoder.save(onehotEncoderPath) sub_encoded = encoded.select(base_col_name, vector_name) return sub_encoded
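A minimal sketch of reloading the persisted stages in a later job, assuming the same temp_path and col_name values and an input DataFrame df containing that column:
from pyspark.ml.feature import OneHotEncoder, StringIndexerModel

reloaded_index_model = StringIndexerModel.load(temp_path + 'stringIndexer_model' + col_name)
reloaded_encoder = OneHotEncoder.load(temp_path + col_name + '_new')
re_encoded = reloaded_encoder.transform(reloaded_index_model.transform(df))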
rf = GBTRegressor(maxIter=30, maxDepth=4, labelCol="indexedLabel") model = rf.fit(train) predictionAndLabels = model.transform(test).select("prediction", "indexedLabel") \ .map(lambda x: (x.prediction, x.indexedLabel)) metrics = RegressionMetrics(predictionAndLabels) print("rmse %.3f" % metrics.rootMeanSquaredError) print("r2 %.3f" % metrics.r2) print("mae %.3f" % metrics.meanAbsoluteError) if __name__ == "__main__": if len(sys.argv) > 1: print("Usage: gradient_boosted_trees", file=sys.stderr) exit(1) sc = SparkContext(appName="Jay") sqlContext = SQLContext(sc) # Load and parse the data file into a dataframe. df = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() # Map labels into an indexed column of labels in [0, numLabels) stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel") si_model = stringIndexer.fit(df) td = si_model.transform(df) [train, test] = td.randomSplit([0.7, 0.3]) testClassification(train, test) testRegression(train, test) sc.stop()
cars = cars.withColumnRenamed("ncyl", "cyl") cars = cars.withColumn('length_meters', round(cars.length * 0.0254, 3)) cars = cars.withColumn('weight_kg', round(cars.weight / 2.205, 0)) cars = cars.withColumn('avg_mpg', round((cars.city_mpg + cars.hwy_mpg) / 2, 1)) \ .drop("city_mpg", "hwy_mpg") cars = cars.withColumn( 'consumption', round((100 * 3.785411784) / (cars.avg_mpg * 1.609344), 2)) pd.set_option('display.max_columns', None) # all cols pd.set_option('display.width', 161) #print(cars.toPandas().sample(8), '\n') indexer = StringIndexer(inputCol='type', outputCol='type_idx') # Assign index values to strings indexer = indexer.fit(cars) # Create column with index values cars = indexer.transform(cars) # Check column data types print('\n', cars.dtypes, '\n') kars = cars.select('name', 'weight_kg', 'cyl', 'consumption', 'type', 'type_idx') #print(kars.toPandas().sample(12)) onehot = OneHotEncoderEstimator(inputCols=['type_idx'],
# COMMAND ---------- # MAGIC %md # MAGIC In this dataset, we have ordinal variables like education (Preschool - Doctorate), and also nominal variables like relationship (Wife, Husband, Own-child, etc). For simplicity's sake, we will use One-Hot Encoding to convert all categorical variables into binary vectors. It might be possible here to improve prediction accuracy by converting each categorical column with an appropriate method. # MAGIC # MAGIC Here, we will use a combination of [StringIndexer](http://spark.apache.org/docs/latest/ml-features.html#stringindexer) and [OneHotEncoder](http://spark.apache.org/docs/latest/ml-features.html#onehotencoder) to convert the categorical variables. The OneHotEncoder will return a [SparseVector](https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.linalg.SparseVector). # COMMAND ---------- ###One-Hot Encoding from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler categoricalColumns = ["workclass", "education", "marital_status", "occupation", "relationship", "race", "sex", "native_country"] for categoricalCol in categoricalColumns: # Category Indexing with StringIndexer stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol+"Index") model = stringIndexer.fit(dataset) indexed = model.transform(dataset) # Use OneHotEncoder to convert categorical variables into binary SparseVectors encoder = OneHotEncoder(inputCol=categoricalCol+"Index", outputCol=categoricalCol+"classVec") encoded = encoder.transform(indexed) dataset = encoded print dataset.take(1) # COMMAND ---------- # MAGIC %md # MAGIC The above code basically indexes each categorical column using the StringIndexer, and then converts the indexed categories into one-hot encoded variables. The resulting output has the binary vectors appended to the end of each row. # COMMAND ----------
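An alternative sketch of the same idea using deferred Pipeline stages instead of transforming the dataset inside the loop (assuming the categoricalColumns list above); the stages only execute when the pipeline is fit:
from pyspark.ml import Pipeline
stages = []
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    encoder = OneHotEncoder(inputCol=categoricalCol + "Index", outputCol=categoricalCol + "classVec")
    stages += [stringIndexer, encoder]
dataset = Pipeline(stages=stages).fit(dataset).transform(dataset)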
import pickle import cdsw spark = SparkSession.builder \ .appName("Telco Customer Churn") \ .getOrCreate() schemaData = StructType([StructField("state", StringType(), True),StructField("account_length", DoubleType(), True),StructField("area_code", StringType(), True),StructField("phone_number", StringType(), True),StructField("intl_plan", StringType(), True),StructField("voice_mail_plan", StringType(), True),StructField("number_vmail_messages", DoubleType(), True), StructField("total_day_minutes", DoubleType(), True), StructField("total_day_calls", DoubleType(), True), StructField("total_day_charge", DoubleType(), True), StructField("total_eve_minutes", DoubleType(), True), StructField("total_eve_calls", DoubleType(), True), StructField("total_eve_charge", DoubleType(), True), StructField("total_night_minutes", DoubleType(), True), StructField("total_night_calls", DoubleType(), True), StructField("total_night_charge", DoubleType(), True), StructField("total_intl_minutes", DoubleType(), True), StructField("total_intl_calls", DoubleType(), True), StructField("total_intl_charge", DoubleType(), True), StructField("number_customer_service_calls", DoubleType(), True), StructField("churned", StringType(), True)]) churn_data = spark.read.schema(schemaData).csv('/tmp/churn.all') reduced_churn_data= churn_data.select("account_length", "number_vmail_messages", "total_day_calls", "total_day_charge", "total_eve_calls", "total_eve_charge", "total_night_calls", "total_night_charge", "total_intl_calls", "total_intl_charge","number_customer_service_calls") label_indexer = StringIndexer(inputCol = 'churned', outputCol = 'label') plan_indexer = StringIndexer(inputCol = 'intl_plan', outputCol = 'intl_plan_indexed') pipeline = Pipeline(stages=[plan_indexer, label_indexer]) indexed_data = pipeline.fit(churn_data).transform(churn_data) (train_data, test_data) = indexed_data.randomSplit([0.7, 0.3]) pdTrain = train_data.toPandas() pdTest = test_data.toPandas() features = ["intl_plan_indexed","account_length", "number_vmail_messages", "total_day_calls", "total_day_charge", "total_eve_calls", "total_eve_charge", "total_night_calls", "total_night_charge", "total_intl_calls", "total_intl_charge","number_customer_service_calls"] param_numTrees = int(sys.argv[1]) param_maxDepth = int(sys.argv[2])
# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from __future__ import print_function from pyspark import SparkContext from pyspark.sql import SQLContext # $example on$ from pyspark.ml.feature import StringIndexer # $example off$ if __name__ == "__main__": sc = SparkContext(appName="StringIndexerExample") sqlContext = SQLContext(sc) # $example on$ df = sqlContext.createDataFrame( [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")], ["id", "category"]) indexer = StringIndexer(inputCol="category", outputCol="categoryIndex") indexed = indexer.fit(df).transform(df) indexed.show() # $example off$ sc.stop()
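One extension worth noting: on Spark 2.2+ StringIndexer accepts handleInvalid="keep", which assigns categories unseen at fit time to an extra index instead of raising an error; a small sketch reusing the df from above (the extra row is made up for illustration):
indexer_keep = StringIndexer(inputCol="category", outputCol="categoryIndex", handleInvalid="keep")
model_keep = indexer_keep.fit(df)
new_df = sqlContext.createDataFrame([(6, "d")], ["id", "category"])  # "d" was not seen during fit
model_keep.transform(new_df).show()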
data = spark.read.csv(base_path + file_name, header=True, inferSchema=True) data.show() data.printSchema() #print(data.columns) cols = [ 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked' ] data_cols = data.select(cols) data_cols.show() final_data = data_cols.na.drop() # Transform the categorical columns into numbers gender_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex') # A B C # 0 1 2 # One hot encode ----> this is mapping everyting into [1, 0, 0] [0, 1, 0] etc. gender_encoder = OneHotEncoder( inputCol='SexIndex', outputCol='SexVec' ) # ---> each entry will be converted to a vector A = [1, 0] B = [0, 1] embark_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkedIndex') embark_encoder = OneHotEncoder( inputCol='EmbarkedIndex', outputCol='EmbarkedVec' ) # ---> each entry will be converted to a vector A = [1, 0] B = [0, 1] new_cols = ['Pclass', 'SexVec', 'Age', 'SibSp', 'Parch', 'Fare', 'EmbarkedVec'] assembler = VectorAssembler(inputCols=new_cols, outputCol='features')
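A sketch of the usual continuation, assumed rather than taken from the source: chain the indexers, encoders and assembler into a Pipeline with a classifier and fit it on the cleaned data.
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

log_reg = LogisticRegression(featuresCol='features', labelCol='Survived')
pipeline = Pipeline(stages=[gender_indexer, gender_encoder, embark_indexer, embark_encoder, assembler, log_reg])
train_data, test_data = final_data.randomSplit([0.7, 0.3])
fit_model = pipeline.fit(train_data)
results = fit_model.transform(test_data)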