Example #1
def load_data():
    filename = "./data/baskteball_reference_com_teams.csv"
    spark = init_spark()

    df = spark.read.csv(filename,
                        header=True,
                        mode="DROPMALFORMED",
                        encoding='utf-8')
    df = df.select("id", "year", "team", "3P", "2P", "FT", "TRB", "AST", "STL",
                   "BLK", "TOV", "PTS", "MP", "Playoff")
    df = df.withColumn("Points_Per_minute", col("PTS") / col("MP"))
    df = df.withColumn("3Points_Per_minute", col("3P") / col("MP"))
    df = df.withColumn("2Points_Per_minute", col("2P") / col("MP"))
    df = df.withColumn("FThrow_Per_minute", col("FT") / col("MP"))
    df = df.withColumn("Rebound_Per_minute", col("TRB") / col("MP"))
    df = df.withColumn("Assists_Per_minute", col("AST") / col("MP"))
    df = df.withColumn("Steals_Per_minute", col("STL") / col("MP"))
    df = df.withColumn("Blocks_Per_minute", col("BLK") / col("MP"))
    df = df.withColumn("TurnOvers_Per_minute", col("TOV") / col("MP"))

    data_classifiers = df.select("id", "Playoff", "Points_Per_minute",
                                 "3Points_Per_minute", "2Points_Per_minute",
                                 "FThrow_Per_minute", "Rebound_Per_minute",
                                 "Assists_Per_minute", "Steals_Per_minute",
                                 "Blocks_Per_minute", "TurnOvers_Per_minute")

    return data_classifiers


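# These examples call an init_spark() helper that is not shown on this page.
# A minimal sketch, assuming a plain local SparkSession (the app name is
# illustrative, not from the original code):
from pyspark.sql import SparkSession

def init_spark():
    # Create (or reuse) a local SparkSession.
    return SparkSession.builder.appName("examples").getOrCreate()
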
def preprocessing_Fire_cause(sp_df):
    df = sp_df
    df = under_sample(df)

    causeSeries = df.groupby(df.STAT_CAUSE_DESCR).count().orderBy(
        'count', ascending=False)
    stat = causeSeries.collect()
    x = list(range(len(stat)))
    description = [row[0] for row in stat]
    plt.bar(x, [row[1] for row in stat], alpha=0.5)
    plt.xticks(x, description, rotation=90)  # rotate the long cause labels
    plt.savefig('CauseOfFire.png')

    # Keep the columns that are highly correlated with each other
    df = df.select(df.columns[19:20] + df.columns[21:22] + df.columns[26:27] +
                   df.columns[23:24] + df.columns[29:32])

    # Drop rows with null/NA values
    df = df.na.drop()

    # Derive the Duration feature, then cast the string columns to their
    # proper types
    df = df.withColumn('Duration', (df['CONT_DOY'] - df['DISCOVERY_DOY'] + 1))
    df = df.withColumn("STAT_CAUSE_CODE",
                       df.STAT_CAUSE_CODE.cast(IntegerType()))
    df = df.withColumn("FIRE_YEAR", df.FIRE_YEAR.cast(IntegerType()))
    df = df.withColumn("LATITUDE", df.LATITUDE.cast(FloatType()))
    df = df.withColumn("LONGITUDE", df.LONGITUDE.cast(FloatType()))
    df = df.withColumn("Duration", df.Duration.cast(IntegerType()))
    categorical_Columns = ['FIRE_SIZE_CLASS']
    total_indexFeatures = []

    # One-hot encode each categorical variable. (OneHotEncoderEstimator is the
    # Spark 2.x name; in Spark 3 it was renamed to OneHotEncoder.)
    for categ_col in categorical_Columns:
        catIndex = StringIndexer(inputCol=categ_col,
                                 outputCol=categ_col + 'Index')
        catEn = OneHotEncoderEstimator(inputCols=[catIndex.getOutputCol()],
                                       outputCols=[categ_col + "classVec"])
        total_indexFeatures += [catIndex, catEn]

    # Create and save the correlation image for the numeric features
    numeric_data = df.select('LATITUDE', 'LONGITUDE', 'STAT_CAUSE_CODE',
                             'FIRE_YEAR', 'Duration').toPandas()
    corelationVariables(numeric_data)

    #Target variable that needs to be predicted
    label = StringIndexer(inputCol='STAT_CAUSE_CODE',
                          outputCol='label').setHandleInvalid("keep")
    total_indexFeatures += [label]

    # Extract the continuous features of the dataset
    continuous_features = ['LATITUDE', 'LONGITUDE']

    # Combine continuous and categorical variables
    return features_conversion(categorical_Columns, continuous_features,
                               total_indexFeatures, df)
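
# corelationVariables() is a plotting helper that is not shown on this page.
# A plausible sketch, assuming it saves a heatmap of the Pandas correlation
# matrix (the output filename is hypothetical):
import matplotlib.pyplot as plt

def corelationVariables(numeric_data):
    # Pairwise Pearson correlations of the numeric columns.
    corr = numeric_data.corr()
    fig, ax = plt.subplots()
    im = ax.matshow(corr)
    fig.colorbar(im)
    ax.set_xticks(range(len(corr.columns)))
    ax.set_xticklabels(corr.columns, rotation=90)
    ax.set_yticks(range(len(corr.columns)))
    ax.set_yticklabels(corr.columns)
    fig.savefig('correlation.png')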


def preprocessing(sp_df):
    df = sp_df
    df = under_sample(df)
    # Keep the columns that are highly correlated with each other
    df = df.select(df.columns[19:29] + df.columns[30:32] + df.columns[34:36])

    # Drop rows with null/NA values
    df = df.na.drop()

    # Create the Duration feature from the discovery and containment
    # day-of-year values
    df = df.withColumn('Duration', (df['CONT_DOY'] - df['DISCOVERY_DOY'] + 1))
    df = df.drop("FIRE_YEAR", "DISCOVERY_DATE", "DISCOVERY_DOY",
                 "DISCOVERY_TIME", "CONT_DATE", 'CONT_TIME',
                 'STAT_CAUSE_DESCR', 'CONT_DAY', 'CONT_DOY')

    # Cast the string columns to their proper types
    df = df.withColumn("STAT_CAUSE_CODE",
                       df.STAT_CAUSE_CODE.cast(IntegerType()))
    df = df.withColumn("FIRE_SIZE", df.FIRE_SIZE.cast(IntegerType()))
    df = df.withColumn("LATITUDE", df.LATITUDE.cast(FloatType()))
    df = df.withColumn("LONGITUDE", df.LONGITUDE.cast(FloatType()))
    df = df.withColumn("COUNTY", df.COUNTY.cast(IntegerType()))
    df = df.withColumn("Duration", df.Duration.cast(IntegerType()))
    categorical_Columns = ['STATE']
    total_indexFeatures = []

    # One-hot encode each categorical variable. (OneHotEncoderEstimator is the
    # Spark 2.x name; in Spark 3 it was renamed to OneHotEncoder.)
    for categ_col in categorical_Columns:
        catIndex = StringIndexer(inputCol=categ_col,
                                 outputCol=categ_col + 'Index')
        catEn = OneHotEncoderEstimator(inputCols=[catIndex.getOutputCol()],
                                       outputCols=[categ_col + "classVec"])
        total_indexFeatures += [catIndex, catEn]

    # Create and save the correlation image for the numeric features
    numeric_data = df.select('STAT_CAUSE_CODE', 'LATITUDE', 'LONGITUDE',
                             'COUNTY', 'Duration').toPandas()
    corelationVariables(numeric_data)

    #Target variable that needs to be predicted
    label = StringIndexer(inputCol='FIRE_SIZE',
                          outputCol='label').setHandleInvalid("keep")
    total_indexFeatures += [label]

    # Extract the continuous features of the dataset
    continuous_features = [
        'STAT_CAUSE_CODE', 'LATITUDE', 'LONGITUDE', 'COUNTY', 'Duration'
    ]
    return features_conversion(categorical_Columns, continuous_features,
                               total_indexFeatures, df)


def under_sample(df):
    # Rank the rows inside each OBJECTID partition by descending cause code
    # and keep at most 150000 rows per partition.
    window = Window.partitionBy(df['OBJECTID']).orderBy(
        df['STAT_CAUSE_CODE'].desc())
    df = df.select(
        '*',
        rank().over(window).alias('rank')).filter(col('rank') <= 150000)
    return df
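
# Note: OBJECTID looks like a unique row id; if so, every window partition
# holds a single row, rank() is always 1, and the filter above keeps all rows.
# If the intent was to cap each cause class at 150000 rows, a hedged sketch of
# that variant (the partition column is an assumption):
from pyspark.sql import Window
from pyspark.sql.functions import row_number, col

def under_sample_per_class(df, cap=150000):
    # Number the rows within each STAT_CAUSE_CODE class and keep at most cap.
    window = Window.partitionBy(df['STAT_CAUSE_CODE']).orderBy(df['OBJECTID'])
    return (df.select('*', row_number().over(window).alias('rn'))
              .filter(col('rn') <= cap)
              .drop('rn'))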
Example #5
def frequent_parks_count_df(filename):
    '''
    Write a Python script using DataFrames that prints the list of the 10 parks
    with the highest number of treated trees. Parks must be ordered by
    decreasing number of treated trees and by alphabetical order when they have
    similar number.  Every list element must be printed on a new line.
    Test file: tests/test_frequent_parks_count_df.py
    Note: The return value should be a CSV string.
          Have a look at the file *tests/frequent.txt* to get the exact return format.
    '''

    spark = init_spark()

    # ADD YOUR CODE HERE
    df = spark.read.csv(filename, header=True)
    df = df.select('Nom_parc').filter(df.Nom_parc != '')
    df = df.groupBy('Nom_parc').count()
    # Order by decreasing count, then alphabetically by park name to break
    # ties, as the docstring requires; sorting before the groupBy has no
    # effect, since aggregation does not preserve the input order.
    df = df.sort(['count', 'Nom_parc'], ascending=[False, True])
    df = df.limit(10)
    return toCSVLine(df)
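
# toCSVLine() is a course-provided helper that is not shown on this page. A
# minimal sketch, assuming it renders each DataFrame row as one
# comma-separated line:
def toCSVLine(df):
    # Hypothetical stand-in: join every field of every collected row.
    return '\n'.join(','.join(str(f) for f in row) for row in df.collect())
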
def data_conversion(df):
    df = df.withColumn('Duration', (df['CONT_DOY'] - df['DISCOVERY_DOY'] + 1))
    df = df.select('LATITUDE', 'LONGITUDE', "FIRE_SIZE", "Duration")
    df = df.withColumn("FIRE_SIZE", df.FIRE_SIZE.cast(IntegerType()))
    df = df.withColumn("LATITUDE", df.LATITUDE.cast(FloatType()))
    df = df.withColumn("LONGITUDE", df.LONGITUDE.cast(FloatType()))
    df = df.withColumn("Duration", df.Duration.cast(IntegerType()))
    df = df.na.drop()
    data = df.toPandas()
    return data


def findBestK(sp_df):
    df = sp_df

    # Preprocessing the data
    df = df.select('OBJECTID', 'LATITUDE', 'LONGITUDE')
    df = df.withColumn("LATITUDE", df["LATITUDE"].cast(FloatType()))
    df = df.withColumn("LONGITUDE", df["LONGITUDE"].cast(FloatType()))
    df = df.na.drop()

    # Find the number of clusters with the elbow method: fit k-means on the
    # (LATITUDE, LONGITUDE) pairs and record the score for each candidate k.
    # (The original fit on latitude alone and scored on longitude, which
    # mixes two unrelated one-dimensional projections.)
    K_clusters = range(3, 10)
    X = df.select('LATITUDE', 'LONGITUDE').toPandas()
    score = [KMeans(n_clusters=k).fit(X).score(X) for k in K_clusters]

    # Visualize k value
    plt.plot(K_clusters, score)
    plt.xlabel('Number of Clusters')
    plt.ylabel('Score')
    plt.title('Elbow Curve')
    plt.show()
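
# With scikit-learn's KMeans, score(X) is the negative inertia (within-cluster
# sum of squares), so the elbow is where the curve flattens. A hedged sketch
# of picking k from those scores automatically (the 10% threshold is an
# arbitrary assumption):
import numpy as np

def pick_elbow(ks, scores, threshold=0.1):
    # Inertia is -score; take the first k whose relative improvement over the
    # previous k falls below the threshold.
    inertia = -np.asarray(scores, dtype=float)
    drops = -np.diff(inertia) / inertia[:-1]
    for k, d in zip(list(ks)[1:], drops):
        if d < threshold:
            return k
    return list(ks)[-1]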
Example #8
def parks_df(filename):
    '''
    Write a Python script using DataFrames that prints the number of trees that are *located in a park*.
    To get the park location information, have a look at the *Nom_parc* column (name of park).
    Test file: tests/test_parks_df.py
    Note: The return value should be an integer
    '''

    spark = init_spark()

    # ADD YOUR CODE HERE
    df = spark.read.csv(filename, header=True)
    df = df.select('Nom_parc').filter(df.Nom_parc != '')
    return df.count()

def kmeans(sp_df):
    df = sp_df
    df = df.select('OBJECTID', 'LATITUDE', 'LONGITUDE')
    df = df.withColumn("LATITUDE", df["LATITUDE"].cast(FloatType()))
    df = df.withColumn("LONGITUDE", df["LONGITUDE"].cast(FloatType()))
    df = df.na.drop()
    X = df.toPandas()
    # Fit once: fit_predict already returns every point's cluster label, so
    # the separate fit/predict passes in the original were redundant refits.
    kmeans = KMeans(n_clusters=6, init='k-means++')
    X['cluster_label'] = kmeans.fit_predict(X[X.columns[1:3]])

    # Visualize the results
    centers = kmeans.cluster_centers_
    X.plot.scatter(x='LATITUDE', y='LONGITUDE', c=X['cluster_label'], s=50,
                   cmap='viridis')
    plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)
    plt.show()


def features_conversion(categorical_Columns, continuous_features,
                        total_indexFeatures, df):
    cols = df.columns
    # Combine continuous and categorical variables
    totalFeatures = [c + "classVec"
                     for c in categorical_Columns] + continuous_features
    tFeatures = VectorAssembler(inputCols=totalFeatures, outputCol="features")
    total_indexFeatures += [tFeatures]

    # Create the pipeline for the transforms
    pipeline = Pipeline(stages=total_indexFeatures)
    df = df.na.drop()
    pipelineModel = pipeline.fit(df)
    df = pipelineModel.transform(df)

    # Keep only the label, the assembled features, and the original columns
    selectedCols = ['label', 'features'] + cols
    df = df.select(selectedCols)
    return df
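
# A sketch of how the frame returned by features_conversion() might be
# consumed downstream; the split ratios, seed, and classifier choice are
# assumptions, not part of the original code:
from pyspark.ml.classification import LogisticRegression

def train_baseline(prepared_df):
    # 70/30 train-test split on the pipeline output, then a simple baseline.
    train, test = prepared_df.randomSplit([0.7, 0.3], seed=42)
    model = LogisticRegression(featuresCol='features',
                               labelCol='label').fit(train)
    return model, test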
Example #11
def uniq_parks_df(filename):
    '''
    Write a Python script using DataFrames that prints the list of unique parks
    where trees were treated. The list must be ordered alphabetically. Every
    element in the list must be printed on a new line.
    Test file: tests/test_uniq_parks_df.py
    Note: The return value should be a CSV string
    '''

    spark = init_spark()

    # ADD YOUR CODE HERE
    df = spark.read.csv(filename, header=True)
    df = df.select('Nom_parc').filter(df.Nom_parc != '').distinct()
    df = df.sort('Nom_parc', ascending=True)
    output = ""
    output = toCSVLine(df)
    return output

    raise Exception("Not implemented yet")
Example #12
def uniq_parks_counts_df(filename):
    '''
    Write a Python script using DataFrames that counts the number of trees
    treated in each park and prints a list of "park,count" pairs in a CSV
    manner ordered alphabetically by the park name. Every element in the list
    must be printed on a new line.
    Test file: tests/test_uniq_parks_counts_df.py
    Note: The return value should be a CSV string
          Have a look at the file *tests/list_parks_count.txt* to get the exact return format.
    '''

    spark = init_spark()

    # ADD YOUR CODE HERE
    df = spark.read.csv(filename, header=True)
    df = df.select('Nom_parc').filter(df.Nom_parc != '')
    df = df.groupBy('Nom_parc').count()
    # Sort after aggregating: groupBy does not preserve the input order.
    df = df.sort('Nom_parc', ascending=True)
    return toCSVLine(df)