# Library imports used below (assumed; they may already be declared earlier in this module).
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType, FloatType
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt


def data_conversion(df):
    #Create the Duration feature (in days) from the discovery and containment day-of-year
    df = df.withColumn('Duration', (df['CONT_DOY'] - df['DISCOVERY_DOY'] + 1))
    df = df.select('LATITUDE', 'LONGITUDE', "FIRE_SIZE", "Duration")
    df = df.withColumn("FIRE_SIZE", df.FIRE_SIZE.cast(IntegerType()))
    df = df.withColumn("LATITUDE", df.LATITUDE.cast(FloatType()))
    df = df.withColumn("LONGITUDE", df.LONGITUDE.cast(FloatType()))
    df = df.withColumn("Duration", df.Duration.cast(IntegerType()))
    #Drop rows with missing values and hand the result back as a pandas DataFrame
    df = df.na.drop()
    data = df.toPandas()
    return data
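
# Illustrative usage sketch (an assumption, not part of the original pipeline): data_conversion
# returns a pandas DataFrame with clean LATITUDE / LONGITUDE / FIRE_SIZE / Duration columns,
# ready for plotting or scikit-learn.
def example_data_conversion(sp_df):
    fires = data_conversion(sp_df)        # sp_df: raw wildfire Spark DataFrame
    print(fires.describe())               # quick sanity check of the numeric ranges
    fires.plot.scatter(x='LONGITUDE', y='LATITUDE', c='Duration', cmap='viridis', s=5)
    plt.show()
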
def load_data():

    filename = "./data/baskteball_reference_com_teams.csv"
    spark = init_spark()

    df = spark.read.csv(filename,
                        header=True,
                        mode="DROPMALFORMED",
                        encoding='utf-8')
    df = df.select("id", "year", "team", "3P", "2P", "FT", "TRB", "AST", "STL",
                   "BLK", "TOV", "PTS", "MP", "Playoff")
    df = df.withColumn("Points_Per_minute", col("PTS") / col("MP"))
    df = df.withColumn("3Points_Per_minute", col("3P") / col("MP"))
    df = df.withColumn("2Points_Per_minute", col("2P") / col("MP"))
    df = df.withColumn("FThrow_Per_minute", col("FT") / col("MP"))
    df = df.withColumn("Rebound_Per_minute", col("TRB") / col("MP"))
    df = df.withColumn("Assists_Per_minute", col("AST") / col("MP"))
    df = df.withColumn("Steals_Per_minute", col("STL") / col("MP"))
    df = df.withColumn("Blocks_Per_minute", col("BLK") / col("MP"))
    df = df.withColumn("TurnOvers_Per_minute", col("TOV") / col("MP"))

    data_classifiers = df.select("id", "Playoff", "Points_Per_minute",
                                 "3Points_Per_minute", "2Points_Per_minute",
                                 "FThrow_Per_minute", "Rebound_Per_minute",
                                 "Assists_Per_minute", "Steals_Per_minute",
                                 "Blocks_Per_minute", "TurnOvers_Per_minute")

    return data_classifiers  #.collect()


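# Illustrative usage sketch for load_data (an assumption): the returned Spark DataFrame holds one
# row per team season with the per-minute features and the Playoff label used downstream.
def example_load_data():
    team_stats = load_data()
    team_stats.printSchema()
    print(team_stats.limit(5).toPandas())
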
def kmeans(sp_df):
    df = sp_df

    # Preprocessing the data
    df = df.select('OBJECTID', 'LATITUDE', 'LONGITUDE')
    df = df.withColumn("LATITUDE", df["LATITUDE"].cast(FloatType()))
    df = df.withColumn("LONGITUDE", df["LONGITUDE"].cast(FloatType()))
    df = df.na.drop()
    X = df.toPandas()
    # Cluster the fire locations into 6 groups with k-means (k-means++ initialization)
    km = KMeans(n_clusters=6, init='k-means++')
    X['cluster_label'] = km.fit_predict(X[X.columns[1:3]])

    # Visualize the results: points colored by cluster label, centroids in black
    centers = km.cluster_centers_
    X.plot.scatter(x='LATITUDE', y='LONGITUDE', c=X['cluster_label'], s=50, cmap='viridis')
    plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)
    plt.show()
def findBestK(sp_df):
    df = sp_df

    # Preprocessing the data
    df = df.select('OBJECTID', 'LATITUDE', 'LONGITUDE')
    df = df.withColumn("LATITUDE", df["LATITUDE"].cast(FloatType()))
    df = df.withColumn("LONGITUDE", df["LONGITUDE"].cast(FloatType()))
    df = df.na.drop()

    # Find a suitable number of clusters with the elbow method:
    # fit k-means on both coordinates and record the score for each k
    K_clusters = range(3, 10)
    coords = df.select('LATITUDE', 'LONGITUDE').toPandas()
    score = [KMeans(n_clusters=k).fit(coords).score(coords) for k in K_clusters]

    # Visualize k value
    plt.plot(K_clusters, score)
    plt.xlabel('Number of Clusters')
    plt.ylabel('Score')
    plt.title('Elbow Curve')
    plt.show()
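
# Illustrative usage sketch for the clustering helpers (an assumption): findBestK plots the elbow
# curve so a value of k can be chosen, then kmeans clusters and plots the fire locations.
# sp_df is the raw wildfire Spark DataFrame read from the project's fires CSV.
def example_clustering(sp_df):
    findBestK(sp_df)   # inspect the elbow curve first
    kmeans(sp_df)      # then cluster (k is currently hard-coded to 6 above)
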
def preprocessing(sp_df):
    df = sp_df
    df = under_sample(df)
    #Keep only the columns that are highly correlated with one another
    df = df.select(df.columns[19:29] + df.columns[30:32] + df.columns[34:36])

    #Drop rows with null/NA values from the dataset
    df = df.na.drop()

    #Create the Duration feature from the discovery and containment day-of-year values
    df = df.withColumn('Duration', (df['CONT_DOY'] - df['DISCOVERY_DOY'] + 1))
    df = df.drop("FIRE_YEAR", "DISCOVERY_DATE", "DISCOVERY_DOY",
                 "DISCOVERY_TIME", "CONT_DATE", 'CONT_TIME',
                 'STAT_CAUSE_DESCR', 'CONT_DAY', 'CONT_DOY')

    #Cast the string columns to their respective numeric types
    df = df.withColumn("STAT_CAUSE_CODE",
                       df.STAT_CAUSE_CODE.cast(IntegerType()))
    df = df.withColumn("FIRE_SIZE", df.FIRE_SIZE.cast(IntegerType()))
    df = df.withColumn("LATITUDE", df.LATITUDE.cast(FloatType()))
    df = df.withColumn("LONGITUDE", df.LONGITUDE.cast(FloatType()))
    df = df.withColumn("COUNTY", df.COUNTY.cast(IntegerType()))
    df = df.withColumn("Duration", df.Duration.cast(IntegerType()))
    categorical_Columns = ['STATE']
    total_indexFeatures = []

    #Index each categorical variable and convert it to a one-hot encoding
    for categ_col in categorical_Columns:
        catIndex = StringIndexer(inputCol=categ_col,
                                 outputCol=categ_col + 'Index')
        catEn = OneHotEncoderEstimator(inputCols=[catIndex.getOutputCol()],
                                       outputCols=[categ_col + "classVec"])
        total_indexFeatures += [catIndex, catEn]

    #Create and save the correlation image between the numeric features
    numeric_data = df.select('STAT_CAUSE_CODE', 'LATITUDE', 'LONGITUDE',
                             'COUNTY', 'Duration').toPandas()
    corelationVariables(numeric_data)

    #Target variable that needs to be predicted
    label = StringIndexer(inputCol='FIRE_SIZE',
                          outputCol='label').setHandleInvalid("keep")
    total_indexFeatures += [label]

    #Extract the continuous features in the dataset
    continuous_features = [
        'STAT_CAUSE_CODE', 'LATITUDE', 'LONGITUDE', 'COUNTY', 'Duration'
    ]
    return features_conversion(categorical_Columns, continuous_features,
                               total_indexFeatures, df)
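
# Minimal sketch (an assumption about what features_conversion does downstream): the indexer and
# encoder stages accumulated above are typically run through a pyspark.ml Pipeline together with a
# VectorAssembler, roughly like this.
def example_assemble_features(df, total_indexFeatures, categorical_Columns, continuous_features):
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import VectorAssembler
    # one-hot vectors for the categorical columns plus the raw continuous columns
    assembler_inputs = [c + "classVec" for c in categorical_Columns] + continuous_features
    assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")
    pipeline = Pipeline(stages=total_indexFeatures + [assembler])
    return pipeline.fit(df).transform(df)
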
def preprocessing_Fire_cause(sp_df):
    df = sp_df
    df = under_sample(df)

    #Bar chart of fire counts per cause, saved as CauseOfFire.png
    causeSeries = df.groupby(df.STAT_CAUSE_DESCR).count().orderBy(
        'count', ascending=False)
    stat = causeSeries.collect()
    x = [i for i in range(len(stat))]
    description = [i[0] for i in stat]
    plt.bar(x, [i[1] for i in stat], alpha=0.5)
    plt.xticks(x, description)
    plt.savefig('CauseOfFire.png')

    #Keep only the columns that are highly correlated with one another
    df = df.select(df.columns[19:20] + df.columns[21:22] + df.columns[26:27] +
                   df.columns[23:24] + df.columns[29:32])

    #Drop rows with null/NA values from the dataset
    df = df.na.drop()

    #Create the Duration feature from the discovery and containment day-of-year values
    df = df.withColumn('Duration', (df['CONT_DOY'] - df['DISCOVERY_DOY'] + 1))

    #Cast the string columns to their respective numeric types
    df = df.withColumn("STAT_CAUSE_CODE",
                       df.STAT_CAUSE_CODE.cast(IntegerType()))
    df = df.withColumn("FIRE_YEAR", df.FIRE_YEAR.cast(IntegerType()))
    df = df.withColumn("LATITUDE", df.LATITUDE.cast(FloatType()))
    df = df.withColumn("LONGITUDE", df.LONGITUDE.cast(FloatType()))
    df = df.withColumn("Duration", df.Duration.cast(IntegerType()))
    categorical_Columns = ['FIRE_SIZE_CLASS']
    total_indexFeatures = []

    #Index each categorical variable and convert it to a one-hot encoding
    for categ_col in categorical_Columns:
        catIndex = StringIndexer(inputCol=categ_col,
                                 outputCol=categ_col + 'Index')
        catEn = OneHotEncoderEstimator(inputCols=[catIndex.getOutputCol()],
                                       outputCols=[categ_col + "classVec"])
        total_indexFeatures += [catIndex, catEn]

    #Create and save the correlation image between the numeric features
    numeric_data = df.select('LATITUDE', 'LONGITUDE', 'STAT_CAUSE_CODE',
                             'FIRE_YEAR', 'Duration').toPandas()
    corelationVariables(numeric_data)

    #Target variable that needs to be predicted
    label = StringIndexer(inputCol='STAT_CAUSE_CODE',
                          outputCol='label').setHandleInvalid("keep")
    total_indexFeatures += [label]

    #Extract the continuous features in the dataset
    continuous_features = ['LATITUDE', 'LONGITUDE']

    #Combine the continuous and categorical variables
    return features_conversion(categorical_Columns, continuous_features,
                               total_indexFeatures, df)
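
# Illustrative usage sketch (an assumption): end-to-end driver for the fire-cause task.
# "./data/fires.csv" is a placeholder path and init_spark() is the helper used by load_data above.
def example_fire_cause_pipeline():
    spark = init_spark()
    fires = spark.read.csv("./data/fires.csv",   # hypothetical path to the wildfire CSV
                           header=True,
                           mode="DROPMALFORMED",
                           encoding='utf-8')
    return preprocessing_Fire_cause(fires)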