def main():
    """Scatter-plot price against living space, one series per district.

    Living space is first snapped to the nearest multiple of 10 and price to
    the nearest multiple of 10000, so nearby listings collapse onto the same
    grid point.  Each district gets its own marker size (derived from the
    order in which the district first appears) and its own legend entry.

    NOTE(review): the column is spelled "distict" here; confirm this matches
    the schema returned by ``load_immobilien_data``.
    """
    data = load_immobilien_data()

    def snap_to(step):
        # Vectorised "round to the nearest multiple of ``step``" -> int.
        return np.vectorize(
            lambda value: int(round(value / float(step)) * step)
        )

    data["living_space"] = snap_to(10)(data["living_space"])
    data["price"] = snap_to(10000)(data["price"])

    figure = pl.figure(1)
    figure.clf()
    ax = figure.add_subplot(111)

    # Position of every district in first-appearance order; drives marker size.
    district_rank = {
        name: rank for rank, name in enumerate(data["distict"].unique())
    }
    ncolors = len(district_rank)

    for name, listings in data.groupby("distict"):
        marker_size = (district_rank[name] + 2) * 3
        # NOTE(review): the colour argument is a random (ncolors, 1) array;
        # confirm the installed matplotlib accepts this shape.
        colour = np.random.rand(ncolors, 1)
        ax.scatter(
            listings["living_space"],
            listings["price"],
            c=colour,
            s=marker_size,
            label=name.decode("utf-8"),
            alpha=0.75,
        )

    ax.set_xlabel("Living space (sqm)")
    ax.set_ylabel("Price (EUR)")
    ax.grid(True)

    pl.legend(shadow=True, fancybox=True, loc=0, scatterpoints=1)
    pl.show()
def main():
    """Draw one price-vs-living-space scatter panel per district.

    Room counts are rounded to whole numbers, then every entry of the
    module-level ``disticts`` sequence gets its own subplot (codes 321, 322,
    ... i.e. a 3x2 grid) in which listings are grouped by room count.
    """
    data = load_immobilien_data()

    to_whole_rooms = np.vectorize(lambda rooms: int(round(rooms)))
    data['number_of_rooms'] = to_whole_rooms(data['number_of_rooms'])

    figure = pl.figure(1)
    figure.clf()

    # Subplot codes start at 321 and advance by one per district.
    for position, distict in enumerate(disticts, 321):
        in_district = data['district'] == distict.encode('utf-8')
        sdata = data[in_district]

        ncolors = len(sdata['number_of_rooms'].unique())

        ax = figure.add_subplot(position)
        for rooms, listings in sdata.groupby('number_of_rooms'):
            # Marker size grows with the room count.
            ax.scatter(
                listings['living_space'],
                listings['price'],
                c=np.random.rand(ncolors, 1),
                s=rooms * 4,
                label='%s rooms' % rooms,
            )

        ax.set_xlabel('Living space (sqm)')
        ax.set_ylabel('Price (EUR)')
        ax.set_title(distict)
        ax.grid(True)
        ax.legend(loc=0, scatterpoints=1)

    pl.show()
def main():
    """Scatter-plot price against living space, grouped by room count.

    Room counts are rounded to whole numbers first; each distinct count is
    drawn as its own series whose marker size is six times the room count.
    """
    data = load_immobilien_data()

    to_whole_rooms = np.vectorize(lambda rooms: int(round(rooms)))
    data['number_of_rooms'] = to_whole_rooms(data['number_of_rooms'])

    figure = pl.figure(1)
    figure.clf()
    ax = figure.add_subplot(111)

    ncolors = len(data['number_of_rooms'].unique())

    for rooms, listings in data.groupby('number_of_rooms'):
        ax.scatter(
            listings['living_space'],
            listings['price'],
            c=np.random.rand(ncolors, 1),
            s=rooms * 6,
            label='%s rooms' % rooms,
        )

    ax.set_xlabel('Living space (sqm)')
    ax.set_ylabel('Price (EUR)')
    ax.grid(True)

    pl.legend(shadow=True, fancybox=True, loc=0, scatterpoints=1)
    pl.show()
def main():
    """Show histograms of the three numeric features, raw and rescaled.

    The 3x3 grid holds, row by row: the raw columns, the output of
    ``preprocessing.scale`` and the output of
    ``preprocessing.MinMaxScaler().fit_transform`` (``preprocessing`` is
    presumably scikit-learn's module -- confirm against the file's imports).
    """
    data = load_immobilien_data()
    figure = pl.figure(1)

    columns = ['price', 'living_space', 'number_of_rooms']

    # Row 1 (subplots 331-333): untouched columns.
    raw_panels = [
        ('Price', 'EUR'),
        ('Living space', 'sqm'),
        ('Number of rooms', 'rooms'),
    ]
    for offset, (title, unit) in enumerate(raw_panels):
        show_hist(data[columns[offset]], title, unit, figure, 331 + offset)

    # Row 2 (subplots 334-336): columns passed through preprocessing.scale.
    standardized = preprocessing.scale(data[columns])
    scaled_panels = [
        ('Price / scaled', 'EUR'),
        ('Living space / scaled', ''),
        ('Number of rooms / scaled', ''),
    ]
    for offset, (title, unit) in enumerate(scaled_panels):
        show_hist(standardized[:, offset], title, unit, figure, 334 + offset)

    # Row 3 (subplots 337-339): columns passed through MinMaxScaler.
    rescaled = preprocessing.MinMaxScaler().fit_transform(data[columns])
    min_max_panels = [
        ('Price / min max scaled', 'EUR'),
        ('Living space / min max scaled', ''),
        ('Number of rooms / min max scaled', ''),
    ]
    for offset, (title, unit) in enumerate(min_max_panels):
        show_hist(rescaled[:, offset], title, unit, figure, 337 + offset)

    pl.show()
def main():
    """Show price, living space and room count as a 3D scatter plot.

    The three columns map to the x, y and z axes in that order, and each
    axis is labelled with its column name.
    """
    data = load_immobilien_data()

    ax = pl.figure().add_subplot(111, projection='3d')

    # Axis order: x, y, z.
    axis_columns = ('price', 'living_space', 'number_of_rooms')
    ax.scatter(*[data[column] for column in axis_columns])

    label_setters = (ax.set_xlabel, ax.set_ylabel, ax.set_zlabel)
    for set_label, column in zip(label_setters, axis_columns):
        set_label(column)

    pl.show()
Beispiel #6
0
def split_data(district=None):
    """Randomly split the loaded data into a training and a test set.

    Args:
        district: optional value to match against the 'district' column;
            when truthy, only matching rows are split.

    Returns:
        A ``(train_set, test_set)`` tuple.  The test set holds 30% of the
        rows (rounded down), drawn without replacement via ``random.sample``;
        the training set is everything else.
    """
    data = load_immobilien_data()

    if district:
        data = data[data['district'] == district]

    m = data.shape[0]

    # 30% of the data - test set.  Floor division keeps the sample size an
    # integer on Python 3 as well, where ``/`` would produce a float.
    test_size = m * 30 // 100

    # random.sample needs a real sequence, so materialise the index labels.
    test_rows = random.sample(list(data.index), test_size)

    # The sampled values are index labels, hence label-based ``.loc``
    # (``.ix`` is no longer available in current pandas).
    test_set = data.loc[test_rows]
    train_set = data.drop(test_rows)

    return train_set, test_set
Beispiel #7
0
def main():
    """
    Visual normality check for the price column.

    Two panels are drawn side by side:

    * left (121): ``plot_histogram`` - per the original notes, a histogram
        with the best fitting normal curve, which should look like a bell
        curve
    * right (122): ``plot_normal_probability`` - per the original notes, a
        normal probability plot, which should be linear
    """
    prices = load_immobilien_data().price

    histogram_axes = pl.subplot(121)
    plot_histogram(prices, histogram_axes)

    # The probability plot draws into whichever subplot is current.
    pl.subplot(122)
    plot_normal_probability(prices)

    pl.show()
Beispiel #8
0
def main():
    """Compare raw, log and square-root histograms of each numeric feature.

    The 3x3 grid has one row per feature (price, living space, number of
    rooms) and one column per transform (identity, ``np.log``, ``np.sqrt``).
    Only the log of living space is restricted to values greater than 1.
    """
    data = load_immobilien_data()
    figure = pl.figure(1)

    # Row 1: price.
    price = data['price']
    show_hist(price, 'Price', 'EUR', figure, 331)
    show_hist(np.log(price), 'log price', 'log(EUR)', figure, 332)
    show_hist(np.sqrt(price), 'sqrt price', 'sqrt(EUR)', figure, 333)

    # Row 2: living space.
    space = data['living_space']
    show_hist(space, 'Living space', 'sqm', figure, 334)
    space_above_one = space[space > 1]
    show_hist(
        np.log(space_above_one), 'log living space', 'log(sqm)', figure, 335)
    show_hist(np.sqrt(space), 'sqrt living space', 'sqrt(sqm)', figure, 336)

    # Row 3: number of rooms.
    rooms = data['number_of_rooms']
    show_hist(rooms, 'Numbers of rooms', 'rooms', figure, 337)
    show_hist(np.log(rooms), 'log(rooms)', 'log(rooms)', figure, 338)
    show_hist(np.sqrt(rooms), 'sqrt(rooms)', 'sqrt(rooms)', figure, 339)

    pl.show()
def main():
    """Draw one price box plot per district, highest mean price first.

    Prices are grouped by the 'district' column, each group is turned into
    an ascending numpy array, and the groups are ordered by descending mean
    before being handed to ``pl.boxplot``.  District names are used as
    rotated x tick labels.
    """
    data = load_immobilien_data()

    grouped = data.groupby('district')['price']

    def as_text(name):
        # District keys may be byte strings (Python 2 / encoded data) or
        # already text; only decode when there is something to decode.
        return name.decode('utf-8') if isinstance(name, bytes) else name

    # Sorting every group's values individually gives the same arrays as
    # sorting the whole frame by price before grouping, without relying on
    # a pandas sorting method whose name differs between versions.
    data = [(as_text(d), np.sort(np.array(g))) for d, g in grouped]

    # Tuple-unpacking lambda parameters do not exist on Python 3, so the
    # (district, prices) pair is indexed instead.
    data.sort(key=lambda entry: entry[1].mean(), reverse=True)

    prices = [p for d, p in data]
    labels = [d for d, p in data]

    # Positional arguments: notch=0, sym='' (an empty symbol is expected to
    # suppress the outlier markers -- confirm against the matplotlib docs).
    pl.boxplot(prices, 0, '')

    pl.ylabel('Price in EUR')
    pl.xlabel('Distict')

    # Box positions are 1-based.
    pl.xticks(np.arange(1, len(labels) + 1), labels, rotation=90)
    pl.title('Berlin appartment prices')

    # Leave room at the bottom for the rotated district names.
    pl.subplots_adjust(bottom=0.2, right=0.8, top=0.92)

    pl.show()
Beispiel #10
0
def main():
    """Show a single box plot of all prices.

    The positional arguments after the data are ``0`` and ``'gD'``;
    presumably notch off and green diamond outlier markers -- confirm
    against the matplotlib ``boxplot`` signature.
    """
    prices = load_immobilien_data()['price']

    pl.boxplot(prices, 0, 'gD')
    pl.show()
Beispiel #11
0
def mad(data, num_std=4):
    """Flag values that are not outliers in log space.

    NOTE: despite the name, this is not a Median Absolute Deviation filter.
    A value is kept when its natural logarithm lies strictly within
    ``num_std`` standard deviations of the mean logarithm.

    Args:
        data: array-like of numbers (list, numpy array, pandas Series, ...).
        num_std: number of standard deviations to include; defaults to 4,
            the value that used to be hard-coded.

    Returns:
        A boolean mask with one entry per input value: True for values to
        keep, False for outliers.  Values whose logarithm is not finite
        (zero, negative, NaN) are always False and are left out of the
        mean / standard deviation so they cannot invalidate the whole mask.
    """
    # The cut-off is applied in log space; the log transform is intended to
    # bring the values closer to a normal distribution.  Non-positive
    # inputs legitimately yield -inf / nan here, so silence those warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        log_data = np.log(data)

    finite = np.isfinite(log_data)
    if not np.any(finite):
        # Nothing usable (empty or entirely non-positive input): the
        # all-False mask is already the answer.
        return finite

    usable = log_data[finite]

    # Non-finite logs compare as False, which marks them as outliers.
    index = np.abs(log_data - np.mean(usable)) < num_std * np.std(usable)
    return index

def remove_suspicious_data(data):
    """Drop rows that look implausible.

    Two filters are applied in order:

    1. keep only rows whose 'number_of_rooms' and 'living_space' are both
       greater than zero;
    2. of those, keep only rows selected by the boolean mask that ``mad``
       returns for the 'price' column.

    Returns the filtered data.
    """
    has_rooms = data['number_of_rooms'] > 0.
    has_space = data['living_space'] > 0.
    plausible = data[np.logical_and(has_rooms, has_space)]

    price_mask = mad(plausible['price'])
    return plausible[price_mask]

if __name__ == '__main__':
    # Demo run: load the data, drop suspicious rows, replace the district
    # names by integer codes and print the encoded column.
    data = load_immobilien_data()

    data = remove_suspicious_data(data)

    # LabelEncoder comes from ``preprocessing`` (presumably scikit-learn's
    # module -- confirm against the file's imports).
    encoder = preprocessing.LabelEncoder()
    data['district'] = encoder.fit_transform(data['district'])

    # Parenthesised form prints the same thing on Python 2 and is valid
    # syntax on Python 3, where the bare print statement is not.
    print(data['district'])