예제 #1
0
import pandas as pd
import numpy as np
from common import date2int, inputdir

# Switch between the earlier cut-off date (True) and the later one (False).
CROSS_VALIDATION = True

# Load the flat table and pass both date columns through date2int.
df = pd.read_csv(inputdir + 'flat.csv', encoding='cp1251')
for _date_col in ('sale', 'date_salestart'):
    df[_date_col] = date2int(df[_date_col])

# Cut-off date as the int64 value of a one-element DatetimeIndex.
_cutoff_day = '2017-10-01' if CROSS_VALIDATION else '2019-10-01'
future_date = pd.DatetimeIndex([_cutoff_day]).astype(np.int64)[0]

# group by 'id_bulk', 'spalen'


def ff(x):
    """Summarise one group of flats relative to the module-level ``future_date``.

    Args:
        x: non-empty ``pandas.DataFrame`` with the columns ``'id_bulk'``,
            ``'spalen'``, ``'sale'`` and ``'square'``. The group identifiers
            are taken from its first row.

    Returns:
        ``pandas.Series`` indexed by
        ``['id_bulk', 'spalen', 'not_sold', 'avg_sale_date']`` where

        * ``not_sold`` is the sum of ``'square'`` over the rows whose
          ``'sale'`` value is greater than or equal to ``future_date``;
        * ``avg_sale_date`` is the median ``'sale'`` value of the rows whose
          ``'sale'`` is below ``future_date``, or NaN when there are none.

    Raises:
        IndexError: if ``x`` has no rows.
    """
    id_bulk = x['id_bulk'].values[0]
    spalen = x['spalen'].values[0]

    sale = x['sale']

    # Rows with sale >= future_date count as not sold at the cut-off.
    not_sold = np.sum(x[sale >= future_date]['square'].values)

    # np.median of an empty array yields NaN but emits a RuntimeWarning;
    # return NaN explicitly so groups with no earlier sales stay quiet.
    sold_dates = sale[sale < future_date].values
    avg_sale_date = np.median(sold_dates) if sold_dates.size else np.nan

    return pd.Series([id_bulk, spalen, not_sold, avg_sale_date],
                     index=['id_bulk', 'spalen', 'not_sold', 'avg_sale_date'])

예제 #2
0
def _month_start(months_since_jan_2015):
    """Return 'YYYY-MM-01' for the month that many months after 2015-01."""
    years, month0 = divmod(months_since_jan_2015, 12)
    return '{}-{:02d}-01'.format(2015 + years, month0 + 1)


# Maps (previous quarter start, quarter start) -> the three month starts of
# that quarter, for quarters starting 2015-08 through 2018-02.
# NOTE(review): the first key has identical endpoints
# ('2015-08-01', '2015-08-01'), unlike the 3-month spans of every other key;
# it is reproduced via the max() clamp below -- confirm that is intended.
q_periods = {
    (_month_start(max(q - 3, 7)), _month_start(q)):
        [_month_start(q + k) for k in range(3)]
    for q in range(7, 38, 3)  # offset 7 is 2015-08, offset 37 is 2018-02
}

# Pass the date columns of both frames through date2int.
for _date_col in ('dateto', 'datefrom'):
    stat_df[_date_col] = date2int(stat_df[_date_col])
flat_df['sale'] = date2int(flat_df['sale'])

# Target layout:
# bulk_id, spalen, date1, ratio of status 03 for bulk_id
# Left join keeps every flat row, matched on 'id_flatwork'.
flat_df = flat_df.merge(stat_df, on='id_flatwork', how='left')

# Distinct (id_bulk, spalen) pairs.
what_we_want = flat_df.loc[:, ['id_bulk', 'spalen']].drop_duplicates()

# Starts out unset.
ff = None

for d1, d123 in q_periods.items():
    for dd in d123:
        new_one = what_we_want.copy()
        new_one['date1'] = pd.DatetimeIndex([dd]).astype(np.int64)[0]
        if ff is None:
예제 #3
0
import pandas as pd
import numpy as np
from common import date2int, inputdir, unique_print


# Load both data sets and pass their 'date1' columns through date2int.
train = pd.read_csv(inputdir + 'train.csv', encoding='cp1251')
test = pd.read_csv(inputdir + 'test.csv', encoding='cp1251')

train['date1'] = date2int(train['date1'])
test['date1'] = date2int(test['date1'])

# Stack the shared columns of train and test, drop exact duplicate rows,
# then order by date, bulk and spalen.
_tt_columns = ['date1', 'bulk_id', 'spalen', 'price']
super_tt = (
    pd.concat([train[_tt_columns], test[_tt_columns]], ignore_index=True)
    .drop_duplicates()
    .sort_values(by=['date1', 'bulk_id', 'spalen'])
)


months_time = [
    '2015-07-01',
    '2015-08-01',
    '2015-09-01',
    '2015-10-01',
    '2015-11-01',
    '2015-12-01',
    '2016-01-01',
    '2016-02-01',
    '2016-03-01',
    '2016-04-01',
    '2016-05-01',
    '2016-06-01',
    '2016-07-01',