def load_data():
    # Load the basketball-reference team statistics and derive per-minute features.
    filename = "./data/baskteball_reference_com_teams.csv"
    spark = init_spark()
    df = spark.read.csv(filename, header=True, mode="DROPMALFORMED", encoding='utf-8')
    df = df.select("id", "year", "team", "3P", "2P", "FT", "TRB", "AST",
                   "STL", "BLK", "TOV", "PTS", "MP", "Playoff")
    # Normalize each raw statistic by minutes played.
    df = df.withColumn("Points_Per_minute", col("PTS") / col("MP"))
    df = df.withColumn("3Points_Per_minute", col("3P") / col("MP"))
    df = df.withColumn("2Points_Per_minute", col("2P") / col("MP"))
    df = df.withColumn("FThrow_Per_minute", col("FT") / col("MP"))
    df = df.withColumn("Rebound_Per_minute", col("TRB") / col("MP"))
    df = df.withColumn("Assists_Per_minute", col("AST") / col("MP"))
    df = df.withColumn("Steals_Per_minute", col("STL") / col("MP"))
    df = df.withColumn("Blocks_Per_minute", col("BLK") / col("MP"))
    df = df.withColumn("TurnOvers_Per_minute", col("TOV") / col("MP"))
    data_classifiers = df.select("id", "Playoff", "Points_Per_minute", "3Points_Per_minute",
                                 "2Points_Per_minute", "FThrow_Per_minute", "Rebound_Per_minute",
                                 "Assists_Per_minute", "Steals_Per_minute", "Blocks_Per_minute",
                                 "TurnOvers_Per_minute")
    return data_classifiers
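# The functions in this file call init_spark() and rely on module-level imports that
# are not shown in this section. The block below is a sketch of what that header
# likely looks like, inferred from the names used in the code; the app name and
# Spark configuration are assumptions, not the project's confirmed setup.

import matplotlib.pyplot as plt
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, rank
from pyspark.sql.types import IntegerType, FloatType
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler
from sklearn.cluster import KMeans


def init_spark():
    # Create (or reuse) a local SparkSession; the application name is a placeholder.
    spark = SparkSession.builder \
        .appName("assumed-app-name") \
        .getOrCreate()
    return spark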
def preprocessing_Fire_cause(sp_df):
    df = sp_df
    df = under_sample(df)
    # Plot the distribution of fire causes and save it to disk.
    causeSeries = df.groupby(df.STAT_CAUSE_DESCR).count().orderBy('count', ascending=False)
    stat = causeSeries.collect()
    x = [i for i in range(len(stat))]
    description = [i[0] for i in stat]
    plt.bar(x, [i[1] for i in stat], alpha=0.5)
    plt.xticks(x, description)
    plt.savefig('CauseOfFire.png')
    # Keep only the columns that are highly correlated with one another.
    df = df.select(df.columns[19:20] + df.columns[21:22] + df.columns[26:27] +
                   df.columns[23:24] + df.columns[29:32])
    # Drop the null and NA values from the dataset.
    df = df.na.drop()
    # Derive the fire duration and cast the string columns to their proper types.
    df = df.withColumn('Duration', (df['CONT_DOY'] - df['DISCOVERY_DOY'] + 1))
    df = df.withColumn("STAT_CAUSE_CODE", df.STAT_CAUSE_CODE.cast(IntegerType()))
    df = df.withColumn("FIRE_YEAR", df.FIRE_YEAR.cast(IntegerType()))
    df = df.withColumn("LATITUDE", df.LATITUDE.cast(FloatType()))
    df = df.withColumn("LONGITUDE", df.LONGITUDE.cast(FloatType()))
    df = df.withColumn("Duration", df.Duration.cast(IntegerType()))
    categorical_Columns = ['FIRE_SIZE_CLASS']
    total_indexFeatures = []
    # Convert each categorical variable to an index and then one-hot encode it.
    for categ_col in categorical_Columns:
        catIndex = StringIndexer(inputCol=categ_col, outputCol=categ_col + 'Index')
        catEn = OneHotEncoderEstimator(inputCols=[catIndex.getOutputCol()],
                                       outputCols=[categ_col + "classVec"])
        total_indexFeatures += [catIndex, catEn]
    # Create the correlation image between the numeric features and save it.
    numeric_data = df.select('LATITUDE', 'LONGITUDE', 'STAT_CAUSE_CODE',
                             'FIRE_YEAR', 'Duration').toPandas()
    corelationVariables(numeric_data)
    # Target variable that needs to be predicted.
    label = StringIndexer(inputCol='STAT_CAUSE_CODE', outputCol='label').setHandleInvalid("keep")
    total_indexFeatures += [label]
    # Continuous features in the dataset.
    continuous_features = ['LATITUDE', 'LONGITUDE']
    # Combine the continuous and categorical variables into a feature vector.
    return features_conversion(categorical_Columns, continuous_features,
                               total_indexFeatures, df)
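# corelationVariables() is called above (and in preprocessing() below) but is not
# defined in this section. The sketch below is an assumption: it saves a correlation
# heatmap of the numeric pandas frame to disk, and the output filename is a placeholder.

def corelationVariables(numeric_data):
    # Compute pairwise correlations and render them as a colour-coded matrix.
    corr = numeric_data.corr()
    fig, ax = plt.subplots()
    cax = ax.matshow(corr, cmap='coolwarm')
    fig.colorbar(cax)
    ax.set_xticks(range(len(corr.columns)))
    ax.set_yticks(range(len(corr.columns)))
    ax.set_xticklabels(corr.columns, rotation=90)
    ax.set_yticklabels(corr.columns)
    plt.savefig('CorrelationImage.png')  # placeholder output name
    plt.close(fig)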
def preprocessing(sp_df):
    df = sp_df
    df = under_sample(df)
    # Keep only the columns that are highly correlated with one another.
    df = df.select(df.columns[19:29] + df.columns[30:32] + df.columns[34:36])
    # Drop the null and NA values from the dataset.
    df = df.na.drop()
    # Use the containment and discovery day-of-year to derive the Duration feature.
    df = df.withColumn('Duration', (df['CONT_DOY'] - df['DISCOVERY_DOY'] + 1))
    df = df.drop("FIRE_YEAR", "DISCOVERY_DATE", "DISCOVERY_DOY", "DISCOVERY_TIME",
                 "CONT_DATE", 'CONT_TIME', 'STAT_CAUSE_DESCR', 'CONT_DAY', 'CONT_DOY')
    # Cast the string columns to their proper types.
    df = df.withColumn("STAT_CAUSE_CODE", df.STAT_CAUSE_CODE.cast(IntegerType()))
    df = df.withColumn("FIRE_SIZE", df.FIRE_SIZE.cast(IntegerType()))
    df = df.withColumn("LATITUDE", df.LATITUDE.cast(FloatType()))
    df = df.withColumn("LONGITUDE", df.LONGITUDE.cast(FloatType()))
    df = df.withColumn("COUNTY", df.COUNTY.cast(IntegerType()))
    df = df.withColumn("Duration", df.Duration.cast(IntegerType()))
    categorical_Columns = ['STATE']
    total_indexFeatures = []
    # Convert each categorical variable to an index and then one-hot encode it.
    for categ_col in categorical_Columns:
        catIndex = StringIndexer(inputCol=categ_col, outputCol=categ_col + 'Index')
        catEn = OneHotEncoderEstimator(inputCols=[catIndex.getOutputCol()],
                                       outputCols=[categ_col + "classVec"])
        total_indexFeatures += [catIndex, catEn]
    # Create the correlation image between the numeric features and save it.
    numeric_data = df.select('STAT_CAUSE_CODE', 'LATITUDE', 'LONGITUDE',
                             'COUNTY', 'Duration').toPandas()
    corelationVariables(numeric_data)
    # Target variable that needs to be predicted.
    label = StringIndexer(inputCol='FIRE_SIZE', outputCol='label').setHandleInvalid("keep")
    total_indexFeatures += [label]
    # Continuous features in the dataset.
    continuous_features = ['STAT_CAUSE_CODE', 'LATITUDE', 'LONGITUDE', 'COUNTY', 'Duration']
    # Combine the continuous and categorical variables into a feature vector.
    return features_conversion(categorical_Columns, continuous_features,
                               total_indexFeatures, df)
def under_sample(df):
    # Rank the rows within each OBJECTID partition by descending cause code and
    # keep at most 150,000 rows per partition to reduce the dataset size.
    window = Window.partitionBy(df['OBJECTID']).orderBy(df['STAT_CAUSE_CODE'].desc())
    df = df.select('*', rank().over(window).alias('rank')).filter(col('rank') <= 150000)
    return df
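# A small illustration of the windowed rank filter used by under_sample(): within
# each partition, rows are ranked and only the top-N ranks survive the filter. The
# toy data and the cap of 2 rows per group are made up for the example.

def _rank_filter_example(spark):
    toy = spark.createDataFrame(
        [("a", 3), ("a", 2), ("a", 1), ("b", 5), ("b", 4)],
        ["group", "value"])
    w = Window.partitionBy("group").orderBy(col("value").desc())
    # Keeps the two largest values per group.
    return toy.select("*", rank().over(w).alias("rank")).filter(col("rank") <= 2)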
def frequent_parks_count_df(filename):
    '''
    Write a Python script using DataFrames that prints the list of the 10 parks
    with the highest number of treated trees. Parks must be ordered by decreasing
    number of treated trees and alphabetically when they have the same number.
    Every list element must be printed on a new line.

    Test file: tests/test_frequent_parks_count_df.py

    Note: The return value should be a CSV string.
    Have a look at the file *tests/frequent.txt* to get the exact return format.
    '''
    spark = init_spark()
    df = spark.read.csv(filename, header=True)
    df = df.select('Nom_parc').filter(df.Nom_parc != '')
    # Count the trees per park, then order by decreasing count and break ties alphabetically.
    df = df.groupBy('Nom_parc').count()
    df = df.orderBy(['count', 'Nom_parc'], ascending=[False, True])
    df = df.limit(10)
    return toCSVLine(df)
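# toCSVLine() is used by the DataFrame exercises above and below but is not shown in
# this section. The sketch below is an assumption: it renders each row as a
# comma-separated line, which matches how the results are returned as CSV strings.

def toCSVLine(df):
    # Collect the DataFrame and join each row's fields with commas, one row per line.
    return '\n'.join(','.join(str(field) for field in row) for row in df.collect())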
def data_conversion(df):
    # Derive the fire duration, keep the spatial and size columns, cast the types,
    # and return the cleaned result as a pandas DataFrame.
    df = df.withColumn('Duration', (df['CONT_DOY'] - df['DISCOVERY_DOY'] + 1))
    df = df.select('LATITUDE', 'LONGITUDE', "FIRE_SIZE", "Duration")
    df = df.withColumn("FIRE_SIZE", df.FIRE_SIZE.cast(IntegerType()))
    df = df.withColumn("LATITUDE", df.LATITUDE.cast(FloatType()))
    df = df.withColumn("LONGITUDE", df.LONGITUDE.cast(FloatType()))
    df = df.withColumn("Duration", df.Duration.cast(IntegerType()))
    df = df.na.drop()
    data = df.toPandas()
    return data
def findBestK(sp_df):
    df = sp_df
    # Preprocessing the data
    df = df.select('OBJECTID', 'LATITUDE', 'LONGITUDE')
    df = df.withColumn("LATITUDE", df["LATITUDE"].cast(FloatType()))
    df = df.withColumn("LONGITUDE", df["LONGITUDE"].cast(FloatType()))
    df = df.na.drop()
    # Find the number of clusters with the elbow method: fit k-means on the
    # coordinates for each candidate k and record the score (negative inertia).
    K_clusters = range(3, 10)
    X = df.select('LATITUDE', 'LONGITUDE').toPandas()
    score = [KMeans(n_clusters=k).fit(X).score(X) for k in K_clusters]
    # Visualize k value
    plt.plot(K_clusters, score)
    plt.xlabel('Number of Clusters')
    plt.ylabel('Score')
    plt.title('Elbow Curve')
    plt.show()
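# Equivalently, the elbow curve can be drawn from each fitted model's inertia_
# attribute (within-cluster sum of squares), which avoids the extra score() call.
# This is an alternative sketch, not the project's own code; X is assumed to be a
# pandas DataFrame holding the LATITUDE and LONGITUDE columns.

def findBestK_inertia(X, k_range=range(3, 10)):
    # One k-means fit per candidate k; lower inertia means tighter clusters.
    inertias = [KMeans(n_clusters=k).fit(X).inertia_ for k in k_range]
    plt.plot(list(k_range), inertias)
    plt.xlabel('Number of Clusters')
    plt.ylabel('Inertia')
    plt.title('Elbow Curve')
    plt.show()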
def parks_df(filename):
    '''
    Write a Python script using DataFrames that prints the number of trees that
    are *located in a park*. To get the park location information, have a look
    at the *Nom_parc* column (name of park).

    Test file: tests/test_parks_df.py

    Note: The return value should be an integer.
    '''
    spark = init_spark()
    df = spark.read.csv(filename, header=True)
    # A tree is located in a park when its Nom_parc value is non-empty.
    df = df.select('Nom_parc').filter(df.Nom_parc != '')
    return df.count()
def kmeans(sp_df):
    df = sp_df
    df = df.select('OBJECTID', 'LATITUDE', 'LONGITUDE')
    df = df.withColumn("LATITUDE", df["LATITUDE"].cast(FloatType()))
    df = df.withColumn("LONGITUDE", df["LONGITUDE"].cast(FloatType()))
    df = df.na.drop()
    X = df.toPandas()
    # Cluster the coordinates into 6 groups with k-means++ initialisation.
    kmeans = KMeans(n_clusters=6, init='k-means++')
    X['cluster_label'] = kmeans.fit_predict(X[X.columns[1:3]])
    # Visualize the results: one colour per cluster, centroids in black.
    centers = kmeans.cluster_centers_
    labels = X['cluster_label']
    X.plot.scatter(x='LATITUDE', y='LONGITUDE', c=labels, s=50, cmap='viridis')
    plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)
    plt.show()
def features_conversion(categorical_Columns, continuous_features, total_indexFeatures, df):
    cols = df.columns
    # Combine the one-hot encoded categorical variables and the continuous features
    # into a single feature vector.
    totalFeatures = [c + "classVec" for c in categorical_Columns] + continuous_features
    tFeatures = VectorAssembler(inputCols=totalFeatures, outputCol="features")
    total_indexFeatures += [tFeatures]
    # Build the pipeline for the transform and apply it to the cleaned dataset.
    pipeline = Pipeline(stages=total_indexFeatures)
    df = df.na.drop()
    pipelineModel = pipeline.fit(df)
    df = pipelineModel.transform(df)
    # Keep only the label, the assembled feature vector, and the original columns.
    selectedCols = ['label', 'features'] + cols
    df = df.select(selectedCols)
    return df
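# Hypothetical usage of the preprocessing pipeline: the frame returned by
# features_conversion() exposes 'label' and 'features' columns that Spark ML
# estimators consume directly. The classifier choice and the train/test split
# below are illustrative assumptions, not the project's confirmed model.

def _train_example(sp_df):
    from pyspark.ml.classification import RandomForestClassifier
    prepared = preprocessing(sp_df)
    train, test = prepared.randomSplit([0.8, 0.2], seed=42)
    model = RandomForestClassifier(labelCol='label', featuresCol='features').fit(train)
    return model.transform(test)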
def uniq_parks_df(filename):
    '''
    Write a Python script using DataFrames that prints the list of unique parks
    where trees were treated. The list must be ordered alphabetically. Every
    element in the list must be printed on a new line.

    Test file: tests/test_uniq_parks_df.py

    Note: The return value should be a CSV string.
    '''
    spark = init_spark()
    df = spark.read.csv(filename, header=True)
    # Keep the distinct non-empty park names and sort them alphabetically.
    df = df.select('Nom_parc').filter(df.Nom_parc != '').distinct()
    df = df.sort('Nom_parc', ascending=True)
    return toCSVLine(df)
def uniq_parks_counts_df(filename):
    '''
    Write a Python script using DataFrames that counts the number of trees
    treated in each park and prints a list of "park,count" pairs in a CSV
    manner, ordered alphabetically by park name. Every element in the list
    must be printed on a new line.

    Test file: tests/test_uniq_parks_counts_df.py

    Note: The return value should be a CSV string.
    Have a look at the file *tests/list_parks_count.txt* to get the exact return format.
    '''
    spark = init_spark()
    df = spark.read.csv(filename, header=True)
    df = df.select('Nom_parc').filter(df.Nom_parc != '')
    # Count the trees per park and order the pairs alphabetically by park name.
    df = df.groupBy('Nom_parc').count()
    df = df.sort('Nom_parc', ascending=True)
    return toCSVLine(df)
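# Hypothetical driver for the tree-dataset exercises above; the CSV path is a
# placeholder and should point at the actual trees dataset used by the tests.

if __name__ == "__main__":
    trees_csv = "./data/trees.csv"  # placeholder path, not the project's actual file
    print(parks_df(trees_csv))
    print(uniq_parks_df(trees_csv))
    print(uniq_parks_counts_df(trees_csv))
    print(frequent_parks_count_df(trees_csv))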