def main():
    """Scatter-plot price against living space, one series per district.

    Living space is rounded to the nearest 10 sqm and price to the nearest
    10,000 EUR before plotting. Every district gets its own random colour
    and a marker size derived from the order in which the district first
    appears in the data.
    """
    data = load_immobilien_data()

    round_space = np.vectorize(lambda v: int(round(v / 10.0) * 10))
    data["living_space"] = round_space(data["living_space"])
    round_price = np.vectorize(lambda s: int(round(s / 10000.0) * 10000))
    data["price"] = round_price(data["price"])

    figure = pl.figure(1)
    figure.clf()
    ax = figure.add_subplot(111)

    # NOTE(review): the column key is spelled "district" here, matching the
    # other scripts that read this data set; confirm against
    # load_immobilien_data().
    labels = {d: i for i, d in enumerate(data["district"].unique())}
    for name, group in data.groupby("district"):
        ax.scatter(
            group["living_space"],
            group["price"],
            # A single-row RGB array gives every point of the group the same
            # colour; an (ncolors, 1) array is not a valid colour spec.
            c=np.random.rand(1, 3),
            s=(labels[name] + 2) * 3,
            # District names may be UTF-8 byte strings; leave text untouched.
            label=name.decode("utf-8") if isinstance(name, bytes) else name,
            alpha=0.75,
        )

    ax.set_xlabel("Living space (sqm)")
    ax.set_ylabel("Price (EUR)")
    ax.grid(True)
    pl.legend(shadow=True, fancybox=True, loc=0, scatterpoints=1)
    pl.show()
def main():
    """Plot price against living space per district, grouped by room count.

    The number of rooms is rounded to the nearest integer. One subplot is
    drawn for every entry of ``disticts`` (defined elsewhere in the module)
    on a 3x2 grid; inside each subplot every room count is a separate
    scatter series with its own random colour and a marker size
    proportional to the room count.
    """
    data = load_immobilien_data()
    round_feature = np.vectorize(lambda v: int(round(v)))
    data['number_of_rooms'] = round_feature(data['number_of_rooms'])

    figure = pl.figure(1)
    figure.clf()
    # Positions 1..6 of a 3x2 grid (the same slots as subplot codes 321..326).
    for position, district in enumerate(disticts, 1):
        # The district column is compared against the UTF-8 encoded name.
        sdata = data[data['district'] == district.encode('utf-8')]
        ax = figure.add_subplot(3, 2, position)
        for rooms, group in sdata.groupby('number_of_rooms'):
            ax.scatter(
                group['living_space'],
                group['price'],
                # A single-row RGB array colours the whole series alike; an
                # (ncolors, 1) array is not a valid colour spec.
                c=np.random.rand(1, 3),
                s=rooms * 4,
                label='%s rooms' % rooms,
            )
        ax.set_xlabel('Living space (sqm)')
        ax.set_ylabel('Price (EUR)')
        ax.set_title(district)
        ax.grid(True)
        ax.legend(loc=0, scatterpoints=1)
    pl.show()
def main():
    """Scatter-plot price against living space, one series per room count.

    The number of rooms is rounded to the nearest integer; every room count
    is drawn with its own random colour and a marker size proportional to
    the room count.
    """
    data = load_immobilien_data()
    round_feature = np.vectorize(lambda v: int(round(v)))
    data['number_of_rooms'] = round_feature(data['number_of_rooms'])

    figure = pl.figure(1)
    figure.clf()
    ax = figure.add_subplot(111)

    for rooms, group in data.groupby('number_of_rooms'):
        ax.scatter(
            group['living_space'],
            group['price'],
            # A single-row RGB array colours the whole series alike; an
            # (ncolors, 1) array is not a valid colour spec.
            c=np.random.rand(1, 3),
            s=rooms * 6,
            label='%s rooms' % rooms,
        )

    ax.set_xlabel('Living space (sqm)')
    ax.set_ylabel('Price (EUR)')
    ax.grid(True)
    pl.legend(shadow=True, fancybox=True, loc=0, scatterpoints=1)
    pl.show()
def main():
    """Compare histograms of the raw, standardised and min-max scaled data.

    A 3x3 grid is drawn on figure 1. The first row shows price, living
    space and number of rooms as loaded, the second row the output of
    ``preprocessing.scale`` for those three columns, and the third row the
    output of ``preprocessing.MinMaxScaler`` for the same columns.
    """
    feature_columns = ['price', 'living_space', 'number_of_rooms']
    listings = load_immobilien_data()
    canvas = pl.figure(1)

    # Row 1 (slots 331-333): untouched columns.
    raw_panels = [
        ('price', 'Price', 'EUR'),
        ('living_space', 'Living space', 'sqm'),
        ('number_of_rooms', 'Number of rooms', 'rooms'),
    ]
    for slot, (column, title, unit) in enumerate(raw_panels, 331):
        show_hist(listings[column], title, unit, canvas, slot)

    # Row 2 (slots 334-336): output of preprocessing.scale.
    standardised = preprocessing.scale(listings[feature_columns])
    standardised_panels = [
        ('Price / scaled', 'EUR'),
        ('Living space / scaled', ''),
        ('Number of rooms / scaled', ''),
    ]
    for column_index, (title, unit) in enumerate(standardised_panels):
        show_hist(standardised[:, column_index], title, unit, canvas,
                  334 + column_index)

    # Row 3 (slots 337-339): output of MinMaxScaler.
    scaler = preprocessing.MinMaxScaler()
    rescaled = scaler.fit_transform(listings[feature_columns])
    rescaled_panels = [
        ('Price / min max scaled', 'EUR'),
        ('Living space / min max scaled', ''),
        ('Number of rooms / min max scaled', ''),
    ]
    for column_index, (title, unit) in enumerate(rescaled_panels):
        show_hist(rescaled[:, column_index], title, unit, canvas,
                  337 + column_index)

    pl.show()
def main():
    """Show a 3D scatter plot of price, living space and number of rooms."""
    listings = load_immobilien_data()
    canvas = pl.figure()
    axes = canvas.add_subplot(111, projection='3d')

    # x = price, y = living space, z = number of rooms.
    axes.scatter(
        listings['price'],
        listings['living_space'],
        listings['number_of_rooms'],
    )
    axes.set_xlabel('price')
    axes.set_ylabel('living_space')
    axes.set_zlabel('number_of_rooms')

    pl.show()
def split_data(district=None):
    """Split the listings into a training set and a random 30% test set.

    Args:
        district: optional district value. When truthy, only rows whose
            'district' column equals it are considered.

    Returns:
        A ``(train_set, test_set)`` tuple of data frames. ``test_set`` holds
        the randomly sampled rows (30% of the rows, rounded down) and
        ``train_set`` the remaining ones.
    """
    data = load_immobilien_data()
    if district:
        data = data[data['district'] == district]

    # 30% of the data - test set. Floor division keeps the sample size an
    # integer regardless of the interpreter's division semantics.
    test_size = data.shape[0] * 30 // 100
    # random.sample needs a real sequence, so materialise the index labels.
    test_rows = random.sample(list(data.index), test_size)

    # Label-based selection of the sampled rows.
    test_set = data.loc[test_rows]
    train_set = data.drop(test_rows)
    return train_set, test_set
def main():
    """Visually check whether the listing prices look normally distributed.

    Two panels are drawn side by side:

    * left  - histogram with the best fitting normal curve; for normal data
      it should look like a bell curve;
    * right - normal probability plot; for normal data it should be linear.
    """
    listings = load_immobilien_data()
    prices = listings.price

    histogram_axes = pl.subplot(121)
    plot_histogram(prices, histogram_axes)

    # The probability plot draws on the subplot made current here.
    pl.subplot(122)
    plot_normal_probability(prices)

    pl.show()
def main():
    """Show histograms of each feature: raw, log- and sqrt-transformed.

    A 3x3 grid is drawn on figure 1 with one row per feature (price, living
    space, number of rooms) and one column per transformation (none,
    natural logarithm, square root).
    """
    data = load_immobilien_data()
    figure = pl.figure(1)

    price = data['price']
    show_hist(price, 'Price', 'EUR', figure, 331)
    show_hist(np.log(price), 'log price', 'log(EUR)', figure, 332)
    show_hist(np.sqrt(price), 'sqrt price', 'sqrt(EUR)', figure, 333)

    living_space = data['living_space']
    show_hist(living_space, 'Living space', 'sqm', figure, 334)
    # Only living spaces above 1 sqm are log-transformed.
    show_hist(np.log(living_space[living_space > 1]),
              'log living space', 'log(sqm)', figure, 335)
    show_hist(np.sqrt(living_space),
              'sqrt living space', 'sqrt(sqm)', figure, 336)

    rooms = data['number_of_rooms']
    show_hist(rooms, 'Numbers of rooms', 'rooms', figure, 337)
    # log(0) is -inf, which cannot be binned; drop non-positive room counts
    # before taking the logarithm, as is done for living space.
    show_hist(np.log(rooms[rooms > 0]),
              'log(rooms)', 'log(rooms)', figure, 338)
    show_hist(np.sqrt(rooms), 'sqrt(rooms)', 'sqrt(rooms)', figure, 339)

    pl.show()
def main():
    """Draw one price box plot per district, highest mean price first.

    Prices are grouped by the 'district' column, the groups are ordered by
    descending mean price, and the district names are used as rotated
    x tick labels. Outlier markers are suppressed.
    """
    data = load_immobilien_data()

    # Box-plot statistics and the group mean do not depend on the order of
    # the observations, so the prices are not pre-sorted.
    groups = [
        # District names may be UTF-8 byte strings; leave text untouched.
        (name.decode('utf-8') if isinstance(name, bytes) else name,
         np.array(group))
        for name, group in data.groupby('district')['price']
    ]
    groups.sort(key=lambda item: item[1].mean(), reverse=True)

    labels = [name for name, _ in groups]
    prices = [values for _, values in groups]

    # An empty symbol string hides the outlier markers.
    pl.boxplot(prices, notch=0, sym='')
    pl.ylabel('Price in EUR')
    pl.xlabel('District')
    pl.xticks(np.arange(1, len(labels) + 1), labels, rotation=90)
    pl.title('Berlin apartment prices')
    pl.subplots_adjust(bottom=0.2, right=0.8, top=0.92)
    pl.show()
def main():
    """Draw a single box plot of all listing prices.

    The plot is un-notched and uses the 'gD' symbol for outlier points.
    """
    listings = load_immobilien_data()
    prices = listings['price']
    pl.boxplot(prices, notch=0, sym='gD')
    pl.show()
def mad(data, m=4):
    """Flag values whose logarithm lies within ``m`` standard deviations.

    Despite the function's name this is not a median-absolute-deviation
    filter: the natural logarithm of ``data`` is taken (to bring it closer
    to a normal distribution) and every value is compared against the mean
    and standard deviation of those logarithms.

    Args:
        data: array-like of positive numbers.
        m: number of standard deviations to include (default 4).

    Returns:
        A boolean mask shaped like ``data``; True marks values to keep.
    """
    log_data = np.log(data)
    deviation = abs(log_data - np.mean(log_data))
    # "<=" rather than "<": with identical values the standard deviation is
    # zero and a strict comparison would reject every single row.
    return deviation <= m * np.std(log_data)


def remove_suspicious_data(data):
    """Drop rows with implausible room counts, living space or price.

    Rows need a positive 'number_of_rooms' and a positive 'living_space';
    of the remaining rows only those whose 'price' passes ``mad`` are kept.

    Args:
        data: data frame with 'number_of_rooms', 'living_space' and
            'price' columns.

    Returns:
        The filtered data frame.
    """
    plausible = np.logical_and(
        data['number_of_rooms'] > 0.,
        data['living_space'] > 0.)
    data = data[plausible]
    return data[mad(data['price'])]


if __name__ == '__main__':
    data = load_immobilien_data()
    data = remove_suspicious_data(data)
    # Replace the district values by integer labels and print them.
    encoder = preprocessing.LabelEncoder()
    data['district'] = encoder.fit_transform(data['district'])
    print(data['district'])