import pandas as pd
import numpy as np
from common import date2int, inputdir

# When True, the cut-off date below is 2017-10-01; otherwise 2019-10-01.
CROSS_VALIDATION = True

df = pd.read_csv(inputdir + 'flat.csv', encoding='cp1251')
df['sale'] = date2int(df['sale'])
df['date_salestart'] = date2int(df['date_salestart'])

# Cut-off used by ff(): the date is converted to its int64 representation so
# it can be compared with the date2int()-converted columns.
# NOTE(review): assumes date2int() yields the same integer scale as
# DatetimeIndex.astype(np.int64) - confirm against common.date2int.
if CROSS_VALIDATION:
    future_date = pd.DatetimeIndex(['2017-10-01']).astype(np.int64)[0]
else:
    future_date = pd.DatetimeIndex(['2019-10-01']).astype(np.int64)[0]


# group by 'id_bulk', 'spalen'
def ff(x):
    """Summarise one group of flats relative to ``future_date``.

    Args:
        x: DataFrame with the columns 'id_bulk', 'spalen', 'sale' and
            'square'. Per the comment above, it is meant to be one
            ('id_bulk', 'spalen') group; only the first row's 'id_bulk'
            and 'spalen' are read.

    Returns:
        pd.Series indexed by ['id_bulk', 'spalen', 'not_sold',
        'avg_sale_date'] where
        * 'not_sold' is the sum of 'square' over rows whose 'sale' is at
          or after ``future_date``;
        * 'avg_sale_date' is the median (not the mean, despite the name)
          of 'sale' over rows sold before ``future_date``, or NaN when
          there are no such rows.
    """
    id_bulk = x['id_bulk'].values[0]
    spalen = x['spalen'].values[0]

    sale = x['sale']
    not_sold = np.sum(x.loc[sale >= future_date, 'square'].values)

    sold_dates = x.loc[sale < future_date, 'sale'].values
    # np.median() of an empty array yields NaN but also raises a
    # RuntimeWarning; return NaN explicitly so the result is the same
    # without the warning.
    avg_sale_date = np.median(sold_dates) if sold_dates.size else np.nan

    return pd.Series(
        [id_bulk, spalen, not_sold, avg_sale_date],
        index=['id_bulk', 'spalen', 'not_sold', 'avg_sale_date'],
    )
# Script fragment (truncated: it ends at a dangling `if ff is None:`; stat_df
# and flat_df are loaded outside the visible part).
#
# q_periods maps a pair of date strings to the three consecutive month-start
# date strings that begin at the second date of the pair.
# NOTE(review): the first key is ('2015-08-01', '2015-08-01'), whereas every
# other key has its first date three months before its second - confirm that
# this is intended.
#
# Steps visible here:
#   1. convert stat_df['dateto'], stat_df['datefrom'] and flat_df['sale']
#      with date2int();
#   2. left-merge stat_df onto flat_df by 'id_flatwork';
#   3. take the distinct ('id_bulk', 'spalen') pairs as `what_we_want`;
#   4. for every month string in q_periods, copy `what_we_want` and set
#      'date1' to that month's int64 value; `ff` starts as None and is tested
#      with `if ff is None:`, whose body lies outside the visible part.
q_periods = { ('2015-08-01', '2015-08-01'): ['2015-08-01', '2015-09-01', '2015-10-01'], ('2015-08-01', '2015-11-01'): ['2015-11-01', '2015-12-01', '2016-01-01'], ('2015-11-01', '2016-02-01'): ['2016-02-01', '2016-03-01', '2016-04-01'], ('2016-02-01', '2016-05-01'): ['2016-05-01', '2016-06-01', '2016-07-01'], ('2016-05-01', '2016-08-01'): ['2016-08-01', '2016-09-01', '2016-10-01'], ('2016-08-01', '2016-11-01'): ['2016-11-01', '2016-12-01', '2017-01-01'], ('2016-11-01', '2017-02-01'): ['2017-02-01', '2017-03-01', '2017-04-01'], ('2017-02-01', '2017-05-01'): ['2017-05-01', '2017-06-01', '2017-07-01'], ('2017-05-01', '2017-08-01'): ['2017-08-01', '2017-09-01', '2017-10-01'], ('2017-08-01', '2017-11-01'): ['2017-11-01', '2017-12-01', '2018-01-01'], ('2017-11-01', '2018-02-01'): ['2018-02-01', '2018-03-01', '2018-04-01'] } stat_df['dateto'] = date2int(stat_df['dateto']) stat_df['datefrom'] = date2int(stat_df['datefrom']) flat_df['sale'] = date2int(flat_df['sale']) # what we want: # bulk_id, spalen, date1, ratio of status 03 for bulk_id flat_df = pd.merge(left=flat_df, right=stat_df, on='id_flatwork', how='left') what_we_want = flat_df[['id_bulk', 'spalen']].drop_duplicates() ff = None for d1, d123 in q_periods.items(): for dd in d123: new_one = what_we_want.copy() new_one['date1'] = pd.DatetimeIndex([dd]).astype(np.int64)[0] if ff is None:
# Script fragment (truncated: the months_time list literal continues past the
# visible part).
#
# Steps visible here:
#   1. read train.csv and test.csv from inputdir (cp1251-encoded);
#   2. convert the 'date1' column of both frames with date2int();
#   3. stack the ('date1', 'bulk_id', 'spalen', 'price') columns of train and
#      test into `super_tt`, drop duplicate rows and sort by
#      'date1', 'bulk_id', 'spalen';
#   4. start `months_time`, a list of month-start date strings beginning at
#      '2015-07-01'.
import pandas as pd import numpy as np from common import date2int, inputdir, unique_print train = pd.read_csv(inputdir+'train.csv', encoding='cp1251') test = pd.read_csv(inputdir+'test.csv', encoding='cp1251') train['date1'] = date2int(train['date1']) test['date1'] = date2int(test['date1']) super_tt = pd.concat([train[['date1', 'bulk_id', 'spalen', 'price']], test[['date1', 'bulk_id', 'spalen', 'price']]], ignore_index=True) super_tt = super_tt.drop_duplicates() super_tt = super_tt.sort_values(by=['date1', 'bulk_id', 'spalen']) months_time = [ '2015-07-01', '2015-08-01', '2015-09-01', '2015-10-01', '2015-11-01', '2015-12-01', '2016-01-01', '2016-02-01', '2016-03-01', '2016-04-01', '2016-05-01', '2016-06-01', '2016-07-01',