def main(config):
    # Cookie-cutter sequence of processes involved in running the
    # necessary steps. Using the general pipeline outlined in Spark's
    # MLlib docs here: https://spark.apache.org/docs/latest/ml-pipeline.html
    spark = spark_initiate()

    # some data / transformer
    raw_data = config['base']['train_df']
    structure_schema = model_structure()
    data = load_data(spark, raw_data, 'df', structure_schema)
    # data.show()
    df, cat_dict = transformer(data)

    datatype_dict = dict(df.dtypes)
    features = config['base']['featuresCol'].split(',')
    list_str = []  # list of string columns
    for feature in features:
        if datatype_dict[feature] == 'string':
            list_str.append(feature)
            df = StringIndexer(inputCol=feature, outputCol=feature + '_index') \
                .fit(df) \
                .transform(df)
    df = df.drop(*list_str)
    df.show()

    # labelCol is a single column name; wrap it in a list so set() removes
    # the whole name rather than its individual characters
    features = list(set(df.columns) - set([config['base']['labelCol']]))
    assembler = VectorAssembler(inputCols=features, outputCol='features')
    df = assembler.transform(df)
    (trainingData, testData) = df.randomSplit([0.7, 0.3])

    # estimator
    model = estimators(config)
    fitted_model = model.fit(trainingData)
    testData = fitted_model.transform(testData)
    # use the probability of the positive class (index 1) as the score
    predictionAndLabels = testData.select('probability', 'Survived') \
        .rdd.map(lambda x: (float(x[0][1]), float(x[1])))
    metrics = BinaryClassificationMetrics(predictionAndLabels)

    # Area under precision-recall curve
    print("Area under PR = %s" % metrics.areaUnderPR)
    # Area under ROC curve
    print("Area under ROC = %s" % metrics.areaUnderROC)
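# A minimal alternative sketch (not from the original): the per-column
# StringIndexer loop above can also be expressed as a single Pipeline, so all
# fitted indexers live in one reusable model. Assumes the same `df` and
# `list_str` as in main() above.
from pyspark.ml import Pipeline

indexers = [StringIndexer(inputCol=c, outputCol=c + '_index') for c in list_str]
indexed_df = Pipeline(stages=indexers).fit(df).transform(df)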
df2 = spark.read.csv('E:/kaggle/titanic/train_kaggle.csv', header=True)
df2.count()

# ---------------------------------------
df2.printSchema()
df3 = df2.select('Sex', 'Pclass', 'Survived', 'Embarked')
df3.show()
df3.printSchema()

# -------------------------------------------
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

df3 = StringIndexer(inputCol='Embarked', outputCol='Embarked1').fit(df3).transform(df3)
df3.show()
df3 = OneHotEncoder(inputCol='Embarked1', outputCol='Embarked2', dropLast=False).transform(df3)
df3.show()

# --------------------------------------------
df3 = StringIndexer(inputCol='Sex', outputCol='Gender').fit(df3).transform(df3)
df3 = OneHotEncoder(inputCol='Gender', outputCol='Gender1', dropLast=False).transform(df3)
df3.show()

# cast to double (the original select was truncated here; the trailing
# label cast below is an assumed completion)
df3 = df3.select(df3.Pclass.cast('double'), df3.Gender1, df3.Embarked2,
                 df3.Survived.cast('double'))
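# A minimal follow-up sketch (assumed, not from the original): combine the
# casted and encoded columns into a single feature vector. VectorAssembler is
# already imported above; column names follow the select() just before this.
assembler = VectorAssembler(inputCols=['Pclass', 'Gender1', 'Embarked2'],
                            outputCol='features')
df3 = assembler.transform(df3)
df3.show(5)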
# NOTE: the guard for this if/else was lost in the original; `use_local_csv`
# is a placeholder flag for whatever condition selected the data source.
if use_local_csv:
    coviddeath = spark.read.csv('UScasestemp1.csv', inferSchema=True, header=True)
else:
    coviddeath = spark.sql("SELECT * FROM uscasestemp1_csv")

# COMMAND ----------

data = coviddeath.select("Year", "Date", "Day", "Temp", "Lat", "Long",
                         "Admin2", "Province",
                         ((col("Case") > 2).cast("Double").alias("label")))
data = StringIndexer(inputCol='Admin2', outputCol='Admin2' + "_index").fit(data).transform(data)
data = StringIndexer(inputCol='Province', outputCol='Province' + "_index").fit(data).transform(data)
data.show(5)

# COMMAND ----------

splits = data.randomSplit([0.7, 0.3])
train = splits[0]
test = splits[1].withColumnRenamed("label", "trueLabel")
train_rows = train.count()
test_rows = test.count()
print("Training Rows:", train_rows, " Testing Rows:", test_rows)

# COMMAND ----------

from pyspark.ml.classification import RandomForestClassifier

assembler = VectorAssembler(
    inputCols=["Day", "Temp", "Lat", "Province_index", "Admin2_index"],
    outputCol="features")  # assumed completion; the original line was truncated here
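# A minimal training sketch (assumed, not from the original): assemble the
# features and fit the imported RandomForestClassifier on the training split.
train_assembled = assembler.transform(train)
rf = RandomForestClassifier(labelCol="label", featuresCol="features")
rf_model = rf.fit(train_assembled)
predictions = rf_model.transform(assembler.transform(test))
predictions.select("prediction", "trueLabel").show(5)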
# Create an indexer for carrier categorical feature
indexer = StringIndexer(inputCol="carrier", outputCol='carrier_idx')

# Indexer identifies categories in the data
indexer_model = indexer.fit(flights)

# Indexer creates a new column with numeric index values
flights = indexer_model.transform(flights)

# Repeat the process for the org categorical feature
flights = StringIndexer(inputCol="org", outputCol='org_idx').fit(flights).transform(flights)

# Check first five records
flights.show(5)

# Create an assembler object
assembler = VectorAssembler(inputCols=[
    "mon", "dom", "dow", "carrier_idx", "org_idx", "km", "depart", "duration"
], outputCol='features')

# Consolidate predictor columns
flights_assembled = assembler.transform(flights)

# Check the resulting column
flights = flights_assembled.select('features', 'xdelay')

# Split into training and testing sets in a 80:20 ratio
flights_train, flights_test = flights.randomSplit([0.8, 0.2], seed=17)
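# A minimal next-step sketch (assumed, not from the original): train a
# DecisionTreeClassifier on the split, treating 'xdelay' as a binary label
# column (an assumption; the original never shows how xdelay is defined).
from pyspark.ml.classification import DecisionTreeClassifier

tree = DecisionTreeClassifier(labelCol='xdelay', featuresCol='features')
tree_model = tree.fit(flights_train)
tree_model.transform(flights_test).select('xdelay', 'prediction').show(5)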
def analyze(sc, train_path, test_path):
    train_rdd = sc.textFile(train_path)
    test_rdd = sc.textFile(test_path)
    train_df = parseTrain(train_rdd)
    test_df = parseTest(test_rdd)
    train_df = train_df.withColumn('Mark', lit('train'))
    test_df = (test_df.withColumn('Survived', lit(0)).withColumn('Mark', lit('test')))
    test_df = test_df[train_df.columns]

    ## Append Test data to Train data
    df = train_df.unionAll(test_df)
    df = (df.withColumn('Age', df['Age'].cast('double'))
            .withColumn('SibSp', df['SibSp'].cast('double'))
            .withColumn('Parch', df['Parch'].cast('double'))
            .withColumn('Fare', df['Fare'].cast('double'))
            .withColumn('Survived', df['Survived'].cast('double')))
    df.printSchema()

    numVars = ['Survived', 'Age', 'SibSp', 'Parch', 'Fare']
    missing = {var: countNull(df, var) for var in numVars}
    age_mean = df.groupBy().mean('Age').first()[0]
    fare_mean = df.groupBy().mean('Fare').first()[0]
    df = df.na.fill({'Age': age_mean, 'Fare': fare_mean})

    ## user-defined function to extract the title from the Name column
    getTitle = udf(lambda name: name.split('.')[0].strip(), StringType())
    df = df.withColumn('Title', getTitle(df['Name']))
    df.select('Name', 'Title').show(3)

    catVars = ['Pclass', 'Sex', 'Embarked', 'Title']
    si = StringIndexer(inputCol='Sex', outputCol='Sex_indexed')
    df_indexed = si.fit(df).transform(df).drop('Sex').withColumnRenamed('Sex_indexed', 'Sex')

    def indexer(df, col):
        si = StringIndexer(inputCol=col, outputCol=col + '_indexed').fit(df)
        return si

    indexers = [indexer(df, col) for col in catVars]
    pipeline = Pipeline(stages=indexers)
    df_indexed = pipeline.fit(df).transform(df)
    df_indexed.select('Embarked', 'Embarked_indexed').show(10)

    catVarsIndexed = [i + '_indexed' for i in catVars]
    featuresCol = numVars + catVarsIndexed
    featuresCol.remove('Survived')
    labelCol = ['Mark', 'Survived']
    row = Row('mark', 'label', 'features')
    df_indexed = df_indexed[labelCol + featuresCol]

    # 0-mark, 1-label, 2-features
    # map features to DenseVector
    lf = df_indexed.rdd.map(lambda r: (row(r[0], r[1], DenseVector(r[2:])))).toDF()

    # index label: convert the numeric label to categorical, which is
    # required by DecisionTree and RandomForest
    lf = StringIndexer(inputCol='label', outputCol='index').fit(lf).transform(lf)
    lf.show(3)

    train = lf.where(lf.mark == 'train')
    test = lf.where(lf.mark == 'test')

    # random split further to get train/validate
    train, validate = train.randomSplit([0.7, 0.3], seed=121)
    print('Train Data Number of Rows: ' + str(train.count()))
    print('Validate Data Number of Rows: ' + str(validate.count()))
    print('Test Data Number of Rows: ' + str(test.count()))

    lr = LogisticRegression(maxIter=100, regParam=0.05, labelCol='index').fit(train)

    # Evaluate model based on AUC ROC (default for binary classification)
    def testModel(model, validate=validate):
        pred = model.transform(validate)
        evaluator = BinaryClassificationEvaluator(labelCol='index')
        return evaluator.evaluate(pred)

    print('AUC ROC of Logistic Regression model is: ' + str(testModel(lr)))

    dt = DecisionTreeClassifier(maxDepth=3, labelCol='index').fit(train)
    rf = RandomForestClassifier(numTrees=100, labelCol='index').fit(train)
    models = {
        'LogisticRegression': lr,
        'DecisionTree': dt,
        'RandomForest': rf
    }
    # dict.iteritems() is Python 2 only; use items() under Python 3
    modelPerf = {k: testModel(v) for k, v in models.items()}
    print(modelPerf)
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.ml.feature import StringIndexer

df_train = spark.read.parquet("./cf_train_subsampled.parquet")
df_val = spark.read.parquet("./cf_validation_subsampled.parquet")
df_test = spark.read.parquet("./cf_test_subsampled.parquet")
df_train.createOrReplaceTempView('df_train')
df_val.createOrReplaceTempView('df_val')
df_test.createOrReplaceTempView('df_test')

# Fit each indexer once and reuse the fitted models, so the test set is
# mapped with the same indices as the training set; rows with ids unseen
# in training are skipped
user_indexer = StringIndexer(inputCol="user_id", outputCol="user_id_numeric",
                             handleInvalid="skip").fit(df_train)
track_indexer = StringIndexer(inputCol="track_id", outputCol="track_id_numeric",
                              handleInvalid="skip").fit(df_train)
df_train = track_indexer.transform(user_indexer.transform(df_train))
df_test = track_indexer.transform(user_indexer.transform(df_test))
df_train.show()

# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(rank=10, maxIter=10, regParam=0.1, alpha=1.0,
          userCol="user_id_numeric", itemCol="track_id_numeric",
          ratingCol="count", coldStartStrategy="drop")
model = als.fit(df_train)

# Evaluate the model by computing the RMSE on the test data
# (the rating column here is "count", matching ratingCol above)
predictions = model.transform(df_test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="count",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

# Generate top 10 movie recommendations for each user
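# A minimal sketch for the trailing comment above (assumed, not from the
# original): ALSModel.recommendForAllUsers returns the top-k items per user.
userRecs = model.recommendForAllUsers(10)
userRecs.show(5, truncate=False)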
def main():
    spark, sc = init_spark()
    df = (spark.read.format("csv")
          .option('header', 'true')
          .load("C:\\sparkTmp\\titanic_train.csv"))
    df.show(5)

    # How many rows we have
    print(df.count())
    # The names of our columns
    print(df.columns)
    # Types of our columns
    print(df.dtypes)
    # Summary statistics (describe() returns a DataFrame, so show it)
    df.describe().show()

    dataset = df.select(
        col("Survived").cast("float"),
        col("Pclass").cast("float"),
        col("Sex"),
        col("Age").cast("float"),
        col("Fare").cast("float"),
        col("Embarked"),
    )
    dataset.show()

    dataset.select(
        [count(when(isnull(c), c)).alias(c) for c in dataset.columns]).show()
    dataset = dataset.dropna(how="any")
    dataset.select(
        [count(when(isnull(c), c)).alias(c) for c in dataset.columns]).show()

    # We need to transform Sex and Embarked to numerical values
    dataset = StringIndexer(
        inputCol="Sex", outputCol="Gender",
        handleInvalid="keep").fit(dataset).transform(dataset)
    dataset = StringIndexer(
        inputCol="Embarked", outputCol="Boarded",
        handleInvalid="keep").fit(dataset).transform(dataset)

    # StringIndexer produces not just a plain double but also preserves the category metadata
    print(dataset.schema.fields[7].metadata)

    dataset = dataset.drop("Sex")
    dataset = dataset.drop("Embarked")
    dataset.show()

    required_features = ["Pclass", "Age", "Fare", "Gender", "Boarded"]
    assembler = VectorAssembler(inputCols=required_features, outputCol='features')
    transformed_data = assembler.transform(dataset)
    transformed_data.show()

    (training_data, test_data) = transformed_data.randomSplit([0.8, 0.2])
    rf = RandomForestClassifier(labelCol="Survived",
                                featuresCol="features",
                                maxDepth=5)
    model = rf.fit(training_data)
    predictions = model.transform(test_data)
    evaluator = MulticlassClassificationEvaluator(labelCol="Survived",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Accuracy = ", accuracy)
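    # An optional inspection sketch (assumed, not from the original): the
    # fitted model exposes per-feature importances aligned with required_features.
    for name, score in zip(required_features, model.featureImportances.toArray()):
        print(name, score)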
featuresCol = [c for c in df.columns if c not in {'OUTCOME'}]
row = Row('unscaledFeatures', 'unlabel')
# df = df[featuresCol + labelCol]  # 0-features, 1-label

# map features to DenseVector (DataFrames have no .map in Spark 2+; go through .rdd)
lf = df.rdd.map(lambda r: (row(DenseVector(r[:-1]), r[-1]))).toDF()

# index label: convert the numeric label to categorical, which is required
# by DecisionTree and RandomForest
lf = StringIndexer(inputCol='unlabel', outputCol='label').fit(lf).transform(lf)
lf.show(3)

# Double check that the data is correct after conversion.

# In[10]:

lf.take(2)

# <a id="context24"></a>
# ### 2.4. Features Standardization
#
# The features are not all in the same unit; to help later algorithms
# converge faster, we should standardize them. Here we standardize each
# feature to have zero mean and unit variance.

# In[11]:

from pyspark.ml.feature import StandardScaler
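# A minimal usage sketch (assumed, not from the original cell): fit the
# scaler on the unscaled feature vectors and add a standardized column.
scaler = StandardScaler(inputCol='unscaledFeatures', outputCol='features',
                        withMean=True, withStd=True)
lf = scaler.fit(lf).transform(lf)
lf.show(3)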
#Read "Subscriptions" data subscriptions = spark.read.format("csv").option("header","true").option("inferSchema","true").load(path + "BDT2_1920_Subscriptions.csv") # COMMAND ---------- #Subsetting Data for Train and Validation last_date = '2018-08-31' #Complaints complaints = complaints.withColumn("ComplaintDate", to_date(col("ComplaintDate"), "yyyy-MM-dd")).filter(col("ComplaintDate")<=lit(last_date)) complaints = complaints.orderBy(desc('ComplaintDate')) complaints.show(5) #Delivery delivery = delivery.withColumn("DeliveryDate", to_date(col("DeliveryDate"), "yyyy-MM-dd")).filter(col("DeliveryDate")<=lit(last_date)) delivery = delivery.orderBy(desc('DeliveryDate')) delivery.show(5) #Subscriptions subscriptions = subscriptions.withColumn("EndDate", to_date(col("EndDate"), "yyyy-MM-dd")).filter(col("EndDate")<=lit(last_date)) subscriptions = subscriptions.orderBy(desc('EndDate')) subscriptions.show(5) # COMMAND ---------- #COMPLAINTS #Check schema and first rows complaints.printSchema() #Schema is ok complaints.toPandas().head(5) #Changing column types from string to integer convert_int = ["ProductID","ComplaintTypeID","SolutionTypeID", "FeedbackTypeID"]
# read data from csv files to dataframe
df2 = spark.read.csv('E:/kaggle/titanic/train_kaggle.csv', header=True)
df2.count()
df2.describe().show()

# ---------------------------------------
df2.printSchema()
df3 = df2.select('Sex', 'Pclass', 'Survived', 'Embarked')
df3.show()
df3.printSchema()

# -------------------------------------------
from pyspark.ml.feature import StringIndexer, OneHotEncoder

df3 = StringIndexer(inputCol='Embarked', outputCol='Embarked1').fit(df3).transform(df3)
df3.show()
df3 = OneHotEncoder(inputCol='Embarked1', outputCol='Embarked2', dropLast=False).transform(df3)
df3.show()

# --------------------------------------------
df3 = StringIndexer(inputCol='Sex', outputCol='Gender').fit(df3).transform(df3)
# the original grouped by the same column twice; grouping once is enough
df3.groupBy('Embarked').agg({
    'Embarked': 'count',
    'Embarked1': 'sum'
}).show()
df3.show(5)
dataset.show(5)
dataset.select([count(when(isnull(c), c)).alias(c) for c in dataset.columns]).show()
dataset = dataset.replace('?', None).dropna(how='any')

dataset = StringIndexer(inputCol='Sex', outputCol='Gender',
                        handleInvalid='keep').fit(dataset).transform(dataset)
dataset = StringIndexer(inputCol='Embarked', outputCol='Boarded',
                        handleInvalid='keep').fit(dataset).transform(dataset)
dataset.show()
dataset = dataset.drop('Sex')
dataset = dataset.drop('Embarked')

# Assemble all the features with VectorAssembler
required_features = ['Pclass', 'Age', 'Fare', 'Gender', 'Boarded']
assembler = VectorAssembler(inputCols=required_features, outputCol='features')
transformed_data = assembler.transform(dataset)

(training_data, test_data) = transformed_data.randomSplit([0.8, 0.2])
rf = RandomForestClassifier(labelCol='Survived', featuresCol='features', maxDepth=5)
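# A minimal continuation sketch (assumed, not from the original): fit the
# classifier defined above and measure accuracy on the held-out split.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

model = rf.fit(training_data)
predictions = model.transform(test_data)
evaluator = MulticlassClassificationEvaluator(labelCol='Survived',
                                              predictionCol='prediction',
                                              metricName='accuracy')
print('Test Accuracy =', evaluator.evaluate(predictions))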
# In[87]:

orgidxer_df = StringIndexer(inputCol='origin', outputCol='origin_idx').fit(indexer_df).transform(indexer_df)

# In[90]:

desidxer_df = StringIndexer(inputCol='dest', outputCol='dest_idx').fit(orgidxer_df).transform(orgidxer_df)

# In[91]:

desidxer_df.show(5)

# In[94]:

df2 = desidxer_df.drop('carrier', 'origin', 'dest')

# In[95]:

df2.show(5)

# In[93]:
bucketizer.setHandleInvalid("skip").transform(df).show()

discretizers = [ft.QuantileDiscretizer(inputCol=c, outputCol="{}_buckets".format(c))
                for c in numeric_features]
# discretizers = ft.QuantileDiscretizer(numBuckets=3, inputCols=numeric_features,
#                                       outputCols=["{}_buckets".format(c) for c in numeric_features])

# StringIndexer
from pyspark.ml.feature import StringIndexer

df = spark.createDataFrame(
    [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
    ["id", "category"])
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
indexer = indexer.fit(df).transform(df)
indexer.show()

# labelConverter
from pyspark.ml import Pipeline
from pyspark.ml.feature import IndexToString, StringIndexer

df = spark.createDataFrame(
    [(0, "Yes"), (1, "Yes"), (2, "Yes"), (3, "No"), (4, "No"), (5, "No")],
    ["id", "label"])
indexer = StringIndexer(inputCol="label", outputCol="labelIndex")
converter = IndexToString(inputCol="labelIndex", outputCol="originalLabel")
pipeline = Pipeline(stages=[indexer, converter])
model = pipeline.fit(df)
result = model.transform(df)
result.show()
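# A minimal sketch (assumed, not from the original): the list of
# QuantileDiscretizers built above is never applied; a Pipeline fits and
# applies them all in one pass. `df_num` is a hypothetical stand-in for the
# numeric DataFrame the discretizers were defined against.
disc_pipeline = Pipeline(stages=discretizers)
df_num_bucketed = disc_pipeline.fit(df_num).transform(df_num)
df_num_bucketed.show(5)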
df1.count()
df1.printSchema()
df1.describe().show()
df1.groupby('type').count().show()
df1.groupby('color').count().sort('count', ascending=False).show()

# --------------------------------------------------------------------------
# cast to double
df2 = df1.select(df1.id.cast('double'), df1.bone_length.cast('double'),
                 df1.rotting_flesh.cast('double'), df1.hair_length.cast('double'),
                 df1.has_soul.cast('double'), 'color', 'type')

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

df3 = StringIndexer(inputCol='color', outputCol='color1').fit(df2).transform(df2)
df3.show()
df3.printSchema()
df3 = OneHotEncoder(inputCol='color1', outputCol='color2', dropLast=False).transform(df3)
df3.printSchema()
df4 = StringIndexer(inputCol='type', outputCol='type1').fit(df2).transform(df3)
df4.show()
df4.printSchema()

# Vector assembler
df5 = VectorAssembler(inputCols=[
    'id', 'bone_length', 'rotting_flesh', 'hair_length', 'has_soul', 'color2'
], outputCol='Features').transform(df4)
df5.show(truncate=False)
df5.printSchema()
# Show schema details of the Spark DataFrame
df2.printSchema()

# create a new Spark DataFrame based on the selected columns
df3 = df2.select('Pclass', 'Survived', 'Sex', 'Embarked')
df3.show(n=5)
df3.printSchema()

# Aggregation on a specific column
df3.groupBy('Embarked').agg({'Embarked': 'count'}).show()

# show summary of columns
df3.describe().show()

# Applying one-hot encoding to the Embarked column
# Step 1: Convert the column to a set of numbers using StringIndexer. Each
# category is assigned a number based on its frequency (most frequent first)
df3 = StringIndexer(inputCol='Embarked', outputCol='EntryPoints').fit(df3).transform(df3)
df3.show(n=15)

# Step 2: Apply OneHotEncoder on the newly created StringIndexer column
df3 = OneHotEncoder(inputCol='EntryPoints', outputCol='EntryLocations',
                    dropLast=False).transform(df3)
df3.show(15)

# combine all feature columns into one column to pass to the model
# (the original inputCols list was left empty; this completion is an
# assumption: cast Pclass to numeric, then assemble with the encoded column)
df3 = df3.withColumn('Pclass', df3.Pclass.cast('double'))
df3 = VectorAssembler(inputCols=['Pclass', 'EntryLocations'],
                      outputCol='features').transform(df3)
StructField("C14", DoubleType(), False), StructField("C15", DoubleType(), False), StructField("C16", DoubleType(), False), StructField("C17", DoubleType(), False), StructField("C18", DoubleType(), False), StructField("C19", DoubleType(), False), StructField("C20", DoubleType(), False), StructField("C21", DoubleType(), False)]) # Get file df = sqlContext.read.format("com.databricks.spark.csv").options(header= 'true').schema(customSchema).load("file:///home/bigdatas16/Downloads/train100K.csv") # Displays the content of the DataFrame to stdout df.show() from pyspark.ml.feature import StringIndexer data = StringIndexer(inputCol="click", outputCol="label").fit(df).transform(df) data.show() # RFormula from pyspark.ml.feature import RFormula formula = RFormula(formula="label ~ banner_pos + app_id + site_category + site_id + site_domain + device_model + C14 + C17 + C18 + C19 + C21 ", featuresCol="features", labelCol="label") output = formula.fit(data).transform(data) data1 = output.select("label", "features") data1.show() # Split training and test data. #(training, test) = data1.randomSplit([0.7, 0.3], seed = 12345) training, test = data1.randomSplit([0.7, 0.3], seed = 12345) training.show() # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and rf (random forest). from pyspark.ml.classification import LogisticRegression
StructField("C14", DoubleType(), True), StructField("C15", DoubleType(), True), StructField("C16", DoubleType(), True), StructField("C17", DoubleType(), True), StructField("C18", DoubleType(), True), StructField("C19", DoubleType(), True), StructField("C20", DoubleType(), True), StructField("C21", DoubleType(), True) ]) from pyspark.ml.feature import StringIndexer ## Index labels, adding metadata to the label column. ## Fit on whole dataset to include all labels in index. data = StringIndexer(inputCol="click", outputCol="label").fit(data).transform(data) data.show() ## 可產生另一個檔案.transform(data)不一定要在(data)檔案裡 #labelIndexer ===> data # RFormula from pyspark.ml.feature import RFormula ## RFormula: string input colums will be one-hot encoded, and numeric columns will be cast to doubles. ##特徵值要被修正formula" " formula = RFormula( formula="label ~ banner_pos + app_id + site_category + site_id + site_domain + device_type + device_conn_type", #formula="label ~ banner_pos + app_id + site_category + site_id + site_domain + C14 + C17 + C18 + C19 + C21", #0.707636 #formula="label ~ banner_pos + site_id + site_domain + C14 + C17 + C21", #0.7 featuresCol="features", labelCol="label") formula_data = formula.fit(data).transform(data)
flights = flights.dropna()
print("\nThe data contains %d records after dropping records with na values." % flights.count())

# Create an indexer for carrier categorical feature
indexer = StringIndexer(inputCol="carrier", outputCol='carrier_idx')

# Indexer identifies categories in the data
indexer_model = indexer.fit(flights)

# Indexer creates a new column with numeric index values
flights_indexed = indexer_model.transform(flights)

# Repeat the process for the org categorical feature
flights_indexed = StringIndexer(inputCol="org", outputCol='org_idx').fit(flights_indexed).transform(flights_indexed)

# Check first five records
flights_indexed.show(5)

flites = flights_indexed.select('carrier', 'org', 'org_idx')

# Create an instance of the one hot encoder
onehot = OneHotEncoderEstimator(inputCols=["org_idx"], outputCols=["org_dummy"])

# Apply the one hot encoder to the flights data
onehot = onehot.fit(flites)
flights_onehot = onehot.transform(flites)

# Check the results
flights_onehot.select('org', 'org_idx', 'org_dummy').distinct().sort('org_idx').show()

spark.stop()
df2 = spark.read.csv('/users/jyothsnap/Kaggle/titanic/train.csv', header=True)
df2.count()

# ---------------------------------------
# SibSp and Fare are also selected here since the casts below need them
df3 = df2.select('Sex', 'Pclass', 'Survived', 'Embarked', 'SibSp', 'Fare')
df3.show()
df3.printSchema()

from pyspark.ml.feature import StringIndexer

df3 = StringIndexer(inputCol='Sex', outputCol='Gender').fit(df3).transform(df3)
df3.groupby(df3.Embarked, 'Embarked').agg({'Embarked': 'count'}).show()
df3 = StringIndexer(inputCol='Embarked', outputCol='Embarked_Transformed').fit(df3).transform(df3)
# df3.groupby(df3.Embarked, 'Embarked').agg({'Embarked': 'count'}).show()
df3.show()
df3.printSchema()

df3 = df3.select(df3.Pclass.cast('double'), df3.SibSp.cast('double'),
                 df3.Survived.cast('double'), df3.Fare.cast('double'))
df3.show()
df3.printSchema()

# Vector assembler
from pyspark.ml.feature import VectorAssembler

df3 = VectorAssembler(inputCols=['Pclass', 'SibSp', 'Fare'], outputCol='Features').transform(df3)
df3.show()

# # 1 choose approach
from pyspark.ml.classification import DecisionTreeClassifier
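# A minimal continuation sketch (assumed, not from the original): fit the
# imported DecisionTreeClassifier on the assembled features.
train, test = df3.randomSplit([0.8, 0.2], seed=42)
dt_model = DecisionTreeClassifier(labelCol='Survived', featuresCol='Features').fit(train)
dt_model.transform(test).select('Survived', 'prediction').show(5)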