def data_conversion(df):
    """Build a typed pandas frame of fire location/size plus a Duration feature.

    Duration is the inclusive day count between discovery and containment
    day-of-year. Rows with any null after casting are dropped.
    """
    df = df.withColumn('Duration', (df['CONT_DOY'] - df['DISCOVERY_DOY'] + 1))
    df = df.select('LATITUDE', 'LONGITUDE', "FIRE_SIZE", "Duration")
    # Cast every retained column from its raw string form to a numeric type.
    for column_name, spark_type in (("FIRE_SIZE", IntegerType()),
                                    ("LATITUDE", FloatType()),
                                    ("LONGITUDE", FloatType()),
                                    ("Duration", IntegerType())):
        df = df.withColumn(column_name, df[column_name].cast(spark_type))
    return df.na.drop().toPandas()
def load_data():
    """Load the basketball team-stats CSV and derive per-minute rate features.

    Returns a Spark DataFrame with id, Playoff, and nine *_Per_minute
    columns (each raw stat divided by minutes played).
    """
    # NOTE(review): path spelling ("baskteball") kept as-is — presumably it
    # matches the actual file on disk; verify before renaming.
    filename = "./data/baskteball_reference_com_teams.csv"
    spark = init_spark()
    df = spark.read.csv(filename, header=True, mode="DROPMALFORMED", encoding='utf-8')
    df = df.select("id", "year", "team", "3P", "2P", "FT", "TRB", "AST",
                   "STL", "BLK", "TOV", "PTS", "MP", "Playoff")
    # (derived column, raw stat) pairs — each stat normalized by minutes played.
    rate_columns = [
        ("Points_Per_minute", "PTS"),
        ("3Points_Per_minute", "3P"),
        ("2Points_Per_minute", "2P"),
        ("FThrow_Per_minute", "FT"),
        ("Rebound_Per_minute", "TRB"),
        ("Assists_Per_minute", "AST"),
        ("Steals_Per_minute", "STL"),
        ("Blocks_Per_minute", "BLK"),
        ("TurnOvers_Per_minute", "TOV"),
    ]
    for derived, stat in rate_columns:
        df = df.withColumn(derived, col(stat) / col("MP"))
    data_classifiers = df.select("id", "Playoff",
                                 *[derived for derived, _ in rate_columns])
    return data_classifiers
def kmeans(sp_df):
    """Cluster fire locations into 6 lat/lon groups and show a scatter plot.

    Casts coordinates to float, drops nulls, runs k-means (k=6, k-means++
    init) on (LATITUDE, LONGITUDE), and plots the points colored by cluster
    with cluster centers overlaid in black.
    """
    df = sp_df.select('OBJECTID', 'LATITUDE', 'LONGITUDE')
    df = df.withColumn("LATITUDE", df["LATITUDE"].cast(FloatType()))
    df = df.withColumn("LONGITUDE", df["LONGITUDE"].cast(FloatType()))
    df = df.na.drop()
    X = df.toPandas()
    coords = X[X.columns[1:3]]  # LATITUDE, LONGITUDE
    model = KMeans(n_clusters=6, init='k-means++')
    # FIX: the original fitted the model three times (fit, fit_predict,
    # predict). k-means init is randomized, so refitting wastes work and can
    # produce labels inconsistent with the first fit. Fit once and reuse.
    X['cluster_label'] = model.fit_predict(coords)
    centers = model.cluster_centers_
    labels = model.labels_
    # Visualize the results.
    X.plot.scatter(x='LATITUDE', y='LONGITUDE', c=labels, s=50, cmap='viridis')
    plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)
    plt.show()
def findBestK(sp_df):
    """Plot an elbow curve of k-means scores for k in 3..9 over lat/lon points.

    Preprocesses the frame (float casts, null drop) and, for each candidate
    k, fits k-means on the 2-D coordinate data and records its score
    (negative inertia), then plots score vs. k.
    """
    # Preprocessing the data.
    df = sp_df.select('OBJECTID', 'LATITUDE', 'LONGITUDE')
    df = df.withColumn("LATITUDE", df["LATITUDE"].cast(FloatType()))
    df = df.withColumn("LONGITUDE", df["LONGITUDE"].cast(FloatType()))
    df = df.na.drop()
    # BUG FIX: the original fitted each model on LATITUDE alone and scored it
    # on LONGITUDE — fitting one feature and scoring a different one yields a
    # meaningless elbow. Fit and score on the same 2-D coordinate matrix.
    K_clusters = range(3, 10)
    points = df.select('LATITUDE', 'LONGITUDE').toPandas()
    score = [KMeans(n_clusters=k).fit(points).score(points) for k in K_clusters]
    # Visualize k value.
    plt.plot(K_clusters, score)
    plt.xlabel('Number of Clusters')
    plt.ylabel('Score')
    plt.title('Elbow Curve')
    plt.show()
def preprocessing(sp_df):
    """Prepare the fire dataset for FIRE_SIZE classification.

    Undersamples, selects correlated columns, derives Duration, casts raw
    string columns to numeric types, one-hot-encodes STATE, saves a
    correlation image, and hands everything to features_conversion.
    """
    df = under_sample(sp_df)
    # Taking the columns that are highly correlated between them.
    df = df.select(df.columns[19:29] + df.columns[30:32] + df.columns[34:36])
    # Drop the null and na values from the dataset.
    df = df.na.drop()
    # Duration (days) = containment day-of-year - discovery day-of-year + 1.
    df = df.withColumn('Duration', (df['CONT_DOY'] - df['DISCOVERY_DOY'] + 1))
    df = df.drop("FIRE_YEAR", "DISCOVERY_DATE", "DISCOVERY_DOY",
                 "DISCOVERY_TIME", "CONT_DATE", 'CONT_TIME',
                 'STAT_CAUSE_DESCR', 'CONT_DAY', 'CONT_DOY')
    # Converting all columns to their respective types from StringType.
    for column_name, spark_type in (("STAT_CAUSE_CODE", IntegerType()),
                                    ("FIRE_SIZE", IntegerType()),
                                    ("LATITUDE", FloatType()),
                                    ("LONGITUDE", FloatType()),
                                    ("COUNTY", IntegerType()),
                                    ("Duration", IntegerType())):
        df = df.withColumn(column_name, df[column_name].cast(spark_type))
    categorical_Columns = ['STATE']
    total_indexFeatures = []
    # Convert each categorical variable to one-hot encoding (index + encoder).
    for categ_col in categorical_Columns:
        catIndex = StringIndexer(inputCol=categ_col,
                                 outputCol=categ_col + 'Index')
        catEn = OneHotEncoderEstimator(inputCols=[catIndex.getOutputCol()],
                                       outputCols=[categ_col + "classVec"])
        total_indexFeatures += [catIndex, catEn]
    # Create and save the correlation image between the numeric features.
    numeric_data = df.select('STAT_CAUSE_CODE', 'LATITUDE', 'LONGITUDE',
                             'COUNTY', 'Duration').toPandas()
    corelationVariables(numeric_data)
    # Target variable that needs to be predicted.
    label = StringIndexer(inputCol='FIRE_SIZE',
                          outputCol='label').setHandleInvalid("keep")
    total_indexFeatures += [label]
    # Continuous features present in the dataset.
    continuous_features = ['STAT_CAUSE_CODE', 'LATITUDE', 'LONGITUDE',
                           'COUNTY', 'Duration']
    return features_conversion(categorical_Columns, continuous_features,
                               total_indexFeatures, df)
def preprocessing_Fire_cause(sp_df):
    """Prepare the fire dataset for fire-cause (STAT_CAUSE_CODE) classification.

    Undersamples, saves a bar chart of cause frequencies, selects correlated
    columns, derives Duration, casts raw string columns to numeric types,
    one-hot-encodes FIRE_SIZE_CLASS, saves a correlation image, and hands
    everything to features_conversion.
    """
    df = under_sample(sp_df)
    # Bar chart of fire causes, most frequent first, saved to disk.
    causeSeries = df.groupby(df.STAT_CAUSE_DESCR).count().orderBy(
        'count', ascending=False)
    stat = causeSeries.collect()
    x = [i for i in range(len(stat))]
    description = [row[0] for row in stat]
    plt.bar(x, [row[1] for row in stat], alpha=0.5)
    plt.xticks(x, description)
    plt.savefig('CauseOfFire.png')
    # Taking the columns that are highly correlated between them.
    df = df.select(df.columns[19:20] + df.columns[21:22] + df.columns[26:27] +
                   df.columns[23:24] + df.columns[29:32])
    # Drop the null and na values from the dataset.
    df = df.na.drop()
    # Duration (days) = containment day-of-year - discovery day-of-year + 1.
    df = df.withColumn('Duration', (df['CONT_DOY'] - df['DISCOVERY_DOY'] + 1))
    # Converting all columns to their respective types from StringType.
    for column_name, spark_type in (("STAT_CAUSE_CODE", IntegerType()),
                                    ("FIRE_YEAR", IntegerType()),
                                    ("LATITUDE", FloatType()),
                                    ("LONGITUDE", FloatType()),
                                    ("Duration", IntegerType())):
        df = df.withColumn(column_name, df[column_name].cast(spark_type))
    categorical_Columns = ['FIRE_SIZE_CLASS']
    total_indexFeatures = []
    # Convert each categorical variable to one-hot encoding (index + encoder).
    for categ_col in categorical_Columns:
        catIndex = StringIndexer(inputCol=categ_col,
                                 outputCol=categ_col + 'Index')
        catEn = OneHotEncoderEstimator(inputCols=[catIndex.getOutputCol()],
                                       outputCols=[categ_col + "classVec"])
        total_indexFeatures += [catIndex, catEn]
    # Create and save the correlation image between the numeric features.
    numeric_data = df.select('LATITUDE', 'LONGITUDE', 'STAT_CAUSE_CODE',
                             'FIRE_YEAR', 'Duration').toPandas()
    corelationVariables(numeric_data)
    # Target variable that needs to be predicted.
    label = StringIndexer(inputCol='STAT_CAUSE_CODE',
                          outputCol='label').setHandleInvalid("keep")
    total_indexFeatures += [label]
    # Continuous features present in the dataset.
    continuous_features = ['LATITUDE', 'LONGITUDE']
    # FIX: in the original, 'return' was severed from the call below, so the
    # function returned None and the features_conversion call was dead code.
    return features_conversion(categorical_Columns, continuous_features,
                               total_indexFeatures, df)