Esempio n. 1
0
print('HDB Data Preprocessing')

# --- Train set ---------------------------------------------------------------
df = pd.read_csv("../Input/hdb_train.csv")

# Collect the flat_model labels from the unfiltered data so the categorical
# below covers every label seen in the raw file. The labels are sorted because
# plain set() iteration order for strings can change between interpreter runs,
# which made both the dummy-column order and the column removed by
# drop_first=True vary from run to run. Missing labels are excluded: they
# cannot be sorted against strings and are not usable as categories.
flat_model = sorted(set(df.flat_model.dropna().values))

# Month strings are split on '-' and folded into a single integer index:
# first_part * 12 + second_part (presumably "YYYY-MM" -- confirm against data).
df['monthtx'] = df['month'].map(
    lambda ym: int(ym.split('-')[0]) * 12 + int(ym.split('-')[1]))

# Disabled filter kept for reference:
#df = df.loc[df.monthtx >= 24121]

# Keep only rows within the hard-coded bounds. Rows with a missing value in
# any of these columns compare False and are dropped, as before.
df = df.loc[
    (df.resale_price <= 30000000)
    & (df.floor_area_sqm <= 7000)
    & (df.latitude < 10)
]

# features_train is defined elsewhere in the file; only those columns survive.
df = df.filter(items=features_train)

# Fixed category list first, then one-hot encode, so the dummy columns are
# determined by flat_model rather than by the rows that survived filtering.
df['flat_model'] = df['flat_model'].astype(cattype(categories=flat_model))
df = pd.get_dummies(df, drop_first=True, columns=['flat_model'])
df = df.reset_index(drop=True)

# Mean resale price per month index, as a two-column frame
# ['monthtx', 'mean_resale_price'].
monthly_mean = (
    df.groupby(['monthtx'])['resale_price']
    .mean()
    .rename('mean_resale_price')
    .reset_index()
)

monthly_mean.to_csv("df_hdb_mean_time.csv")

# Attach each row's monthly mean and express the price as a deviation from it.
df = df.merge(monthly_mean, on='monthtx')
df['diff_mean'] = df.resale_price.values - df.mean_resale_price.values

# Use a context manager so the pickle file is flushed and closed
# deterministically instead of relying on garbage collection.
with open("df_hdb_train.p", "wb") as fh:
    pickle.dump(df, fh)
Esempio n. 2
0
	value[town] = {'floor': df.loc[df.town == town].floor.median()}
	df.loc[df.town == town] = df.loc[df.town == town].fillna(value = value[town])
# Fallback: any 'floor' value still missing after the per-town fill gets the
# overall median of the column.
df = df.fillna(value={'floor': df.floor.median()})

# Mean resale price per month index, as a two-column frame
# ['monthtx', 'mean_resale_price'].
monthly_mean = (
    df.groupby(['monthtx'])['resale_price']
    .mean()
    .rename('mean_resale_price')
    .reset_index()
)

monthly_mean.to_csv("df_private_mean_time.csv")

# Attach each row's monthly mean and express the price as a deviation from it.
df = df.merge(monthly_mean, on='monthtx')
df['diff_mean'] = df.resale_price.values - df.mean_resale_price.values

# Cast to categoricals with the externally defined category lists
# (property_type, type_of_sale, type_of_land come from elsewhere in the file)
# so the dummy columns depend on those lists, not on the rows present here.
df = df.astype({
    'property_type': cattype(categories=property_type),
    'type_of_sale': cattype(categories=type_of_sale),
    'type_of_land': cattype(categories=type_of_land),
})
df = pd.get_dummies(
    df,
    drop_first=drop_first,
    columns=['type_of_land', 'type_of_sale', 'property_type'],
)

# Use a context manager so the pickle file is flushed and closed
# deterministically instead of relying on garbage collection.
with open("df_private_train.p", "wb") as fh:
    pickle.dump(df, fh)


# --- Test set ----------------------------------------------------------------
# Load the private test file, add the integer month index
# (first_part * 12 + second_part of the '-'-separated month string), keep the
# columns listed in features_test, join the area -> town lookup, and normalise
# the floor column name.
df = (
    pd.read_csv("../Input/private_test.csv")
    .assign(monthtx=lambda frame: frame['month'].map(
        lambda ym: int(ym.split('-')[0]) * 12 + int(ym.split('-')[1])))
    .filter(items=features_test)
    .merge(area_town_corresp, left_on='area', right_on='area')
    # 'Unnamed: 0' arrives with the merge; presumably a saved index column
    # of the lookup table -- confirm where area_town_corresp is loaded.
    .drop(columns=['area', 'Unnamed: 0'])
    .rename(columns={'floor_num': 'floor'})
)
Esempio n. 3
0
# Keep the training features (features_train is defined elsewhere), join the
# area -> town lookup, and normalise column names.
df = df.filter(items=features_train)
df = df.reset_index(drop=True)
df = df.merge(area_town_corresp, on='area')
# 'Unnamed: 0' arrives with the merge; presumably a saved index column of the
# lookup table -- confirm where area_town_corresp is loaded.
df = df.drop(columns=['area', 'Unnamed: 0'])
df = df.rename(columns={'price': 'resale_price', 'floor_num': 'floor'})

# Fill missing 'floor' values with the median floor of the same town.
# Towns and value stay defined at module level in case later code reuses the
# per-town medians.
Towns = np.sort(list(set(df.town.values)))
value = {}
for town in Towns:
    in_town = df.town == town
    town_median = df.loc[in_town, 'floor'].median()
    value[town] = {'floor': town_median}
    # Only 'floor' needs to change, so assign just that column instead of
    # rewriting every column of the town's rows.
    df.loc[in_town, 'floor'] = df.loc[in_town, 'floor'].fillna(town_median)

# Fallback: anything still missing gets the overall median of the column.
df = df.fillna(value={'floor': df.floor.median()})

# Cast to categoricals with the externally defined category lists
# (property_type, type_of_sale, type_of_land come from elsewhere in the file)
# so the dummy columns depend on those lists, not on the rows present here.
df = df.astype({
    'property_type': cattype(categories=property_type),
    'type_of_sale': cattype(categories=type_of_sale),
    'type_of_land': cattype(categories=type_of_land),
})
df = pd.get_dummies(
    df,
    drop_first=drop_first,
    columns=['type_of_land', 'type_of_sale', 'property_type'],
)

# Use a context manager so the pickle file is flushed and closed
# deterministically instead of relying on garbage collection.
with open("df_private_train.p", "wb") as fh:
    pickle.dump(df, fh)

# --- Test set ----------------------------------------------------------------
# Load the private test file and add the integer month index:
# first_part * 12 + second_part of the '-'-separated month string
# (presumably "YYYY-MM" -- confirm against data).
df = pd.read_csv("../Input/private_test.csv").assign(
    monthtx=lambda frame: frame['month'].map(
        lambda ym: int(ym.split('-')[0]) * 12 + int(ym.split('-')[1])))