import pandas as pd
import data_wrangling.dataframe_manager as dm
import scipy.stats as scs

# Allow up to 200 columns to be shown when a DataFrame is printed.
pd.set_option('display.max_columns', 200)

# CSV location and the column names handed to dm.create_df.
location = "https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DA0101EN/auto.csv"
headers = [
    "symboling", "normalized-losses", "make", "fuel-type", "aspiration",
    "num-of-doors", "body-style", "drive-wheels", "engine-location",
    "wheel-base", "length", "width", "height", "curb-weight", "engine-type",
    "num-of-cylinders", "engine-size", "fuel-system", "bore", "stroke",
    "compression-ratio", "horsepower", "peak-rpm", "city-mpg", "highway-mpg",
    "price",
]
df = dm.create_df(location, headers)
print(dm.print_4_heads(df, ""))

# Clean the two columns used by the correlation.
# NOTE(review): the helpers' return values are discarded, so they presumably
# modify df in place -- confirm against data_wrangling.dataframe_manager.
dm.replace_cols_with_nan(df, ["price", "horsepower"])
dm.replace_cols_with_mean(df, ["price", "horsepower"])

# pearsonr needs numeric input, so cast both columns to float.
for column in ("price", "horsepower"):
    df[column] = df[column].astype("float")

# PEARSON correlation between horsepower and price.
pearson_result = scs.pearsonr(df['horsepower'], df['price'])
pearson_coeff, p_value = pearson_result
print('pearson_coeff: ', pearson_coeff)
print('p_value: ', p_value)
import data_wrangling.dataframe_manager as dm

# Allow up to 200 columns to be shown when a DataFrame is printed.
pd.set_option('display.max_columns', 200)

# CSV location and the column names handed to dm.create_df.
location = "https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DA0101EN/auto.csv"
headers = [
    "symboling", "normalized-losses", "make", "fuel-type", "aspiration",
    "num-of-doors", "body-style", "drive-wheels", "engine-location",
    "wheel-base", "length", "width", "height", "curb-weight", "engine-type",
    "num-of-cylinders", "engine-size", "fuel-system", "bore", "stroke",
    "compression-ratio", "horsepower", "peak-rpm", "city-mpg", "highway-mpg",
    "price",
]
df = dm.create_df(location, headers)
print(dm.print_4_heads(df, ""))

# Clean the missing values in price and peak-rpm.
print('\n Replacing rows where values are not available')
df = dm.replace_cols_with_nan(df, ['price', 'peak-rpm'])
df = dm.replace_cols_with_mean(df, ['price', 'peak-rpm'])

print('\nChanging dtype for price and peak-rpm from object to float64: ')
for column in ("price", "peak-rpm"):
    df[column] = df[column].astype('float')
# Alternative via the project helper (kept for reference):
# df[["price", "peak-rpm"]] = dm.change_col_types(df, ["price", "peak-rpm"], 'float')


# NORMALIZATION

# Simple feature scaling: divide every price by the largest price.
price_max = df['price'].max()
df['price'] = df['price'] / price_max
print(dm.print_4_heads(df, "normalizing the price with Simple Feature Scaling"))

# Min-max scaling: shift peak-rpm by its minimum and divide by its range.
rpm_min = df['peak-rpm'].min()
rpm_range = df['peak-rpm'].max() - rpm_min
df['peak-rpm'] = (df['peak-rpm'] - rpm_min) / rpm_range
# Example #3
# 0
import pandas as pd
import data_wrangling.dataframe_manager as dm

# Allow up to 200 columns to be shown when a DataFrame is printed.
pd.set_option('display.max_columns', 200)

# CSV location and the column names handed to dm.create_df.
location = "https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DA0101EN/auto.csv"
headers = [
    "symboling", "normalized-losses", "make", "fuel-type", "aspiration",
    "num-of-doors", "body-style", "drive-wheels", "engine-location",
    "wheel-base", "length", "width", "height", "curb-weight", "engine-type",
    "num-of-cylinders", "engine-size", "fuel-system", "bore", "stroke",
    "compression-ratio", "horsepower", "peak-rpm", "city-mpg", "highway-mpg",
    "price"
]
df = dm.create_df(location, headers)
print(dm.print_4_heads(df, ""))

# Clean the price column, then cast it to float so it can be averaged.
# NOTE(review): the helpers' return values are discarded, so they presumably
# modify df in place -- confirm against data_wrangling.dataframe_manager.
dm.replace_cols_with_nan(df, ["price"])
dm.replace_cols_with_mean(df, ["price"])
df["price"] = df["price"].astype("float")

# GROUP BY: mean price for every (drive-wheels, body-style) combination.
print('\n')
print("GROUP BY DEMO")
df_test = df[['drive-wheels', 'body-style', 'price']]
df_grp = df_test.groupby(['drive-wheels', 'body-style']).mean()
print(df_grp)

# PIVOT: drive-wheels as rows, body-style as columns.
print('\n')
print("PIVOT Table DEMO")
# groupby() moved the two key columns into the index; DataFrame.pivot looks
# its index/columns arguments up among the columns, so restore them first
# (pivoting df_grp directly raises KeyError).
df_pivot = df_grp.reset_index().pivot(index='drive-wheels', columns='body-style')
print(df_pivot)