def _build_bins():
    """Compute (and cache) quantile bin edges for per-group click counts.

    Counts clicks per GROUP key over the concatenated train + test_v0 data,
    then derives BINS quantile edges with ``pd.qcut``. The edges are pickled
    to ``CACHE/binip.pkl`` so subsequent runs load them instead of recomputing.

    Returns:
        numpy array of bin edges as produced by ``pd.qcut(..., retbins=True)``.
    """
    bin_fname = os.path.join(CACHE, 'binip.pkl')
    if os.path.exists(bin_fname):
        with open(bin_fname, 'rb') as f:
            bins = pickle.load(f)
        return bins

    # build full ips from train + test_v0
    df = load_base('train')
    # NOTE: DataFrame.append was removed in pandas 2.0; pd.concat is the
    # drop-in equivalent (ignore_index=False matches append's default).
    df = pd.concat([df, load_base('test_v0')])
    assert len(df) == TEST_ROWS_V0 + TRAIN_ROWS

    group = GROUP
    helper = 'is_attributed'  # any always-present column works; only .count() is used
    out_column = 'binip'
    # Per-key row counts, renamed so the merge below attaches them as `binip`.
    gp = df[group + [helper]].groupby(by=group)[helper] \
        .count() \
        .reset_index() \
        .rename(index=str, columns={helper: out_column})
    count = df[group].merge(gp, on=group, how='left')
    # Only the edges matter; duplicates='drop' collapses ties from the
    # heavily skewed count distribution.
    _, bins = pd.qcut(count[out_column], BINS, retbins=True,
                      labels=False, duplicates='drop')
    info("==== bins:")
    info(bins)
    with open(bin_fname, 'wb') as f:
        pickle.dump(bins, f)
    return bins
def process(kind):
    """Build x4 aggregate features keyed on binip combinations.

    For each feature group, joins the cached binip column onto the base
    frame, restricts the source rows to a leak-friendly slice of the
    training data (days 6/7/9, test-like hours), and writes the result of
    ``_prepare_x4`` to a per-feature feather file. Existing outputs are
    skipped so the function is resumable.
    """
    feature_groups = [
        # ['binip'],
        ['binip', 'hour'],
        ['binip', 'app', 'channel'],
        ['binip', 'app', 'channel', 'hour'],
        ['binip', 'app', 'os'],
        ['binip', 'app', 'os', 'hour'],
    ]
    for grp in feature_groups:
        out_column = "x4_{}".format('_'.join(grp))
        out_fname = os.path.join(CACHE, '{}_{}.feather'.format(kind, out_column))
        if os.path.exists(out_fname):
            continue

        info('loading base')
        base = load_base('train')
        bin_df = feather.read_dataframe(
            os.path.join(CACHE, '{}_binip.feather'.format('train')))
        base['binip'] = bin_df['binip'].values
        del bin_df
        gc.collect()

        # Source rows: train days 6/7/9 restricted to the hours present in test.
        mask = (base.day.isin([6, 7, 9])) & base.hour.isin([4, 5, 9, 10, 13, 14])
        # mask = (base.day.isin([9])) & base.hour.isin([4, 5, 9, 10, 13, 14])
        source_df = base[mask]
        target_df = base
        if kind == 'test':
            # Target switches to the test frame, with its own binip column.
            target_df = load_base('test')
            bin_df = feather.read_dataframe(
                os.path.join(CACHE, '{}_binip.feather'.format(kind)))
            target_df['binip'] = bin_df['binip'].values
            del bin_df
            gc.collect()

        info('source: %d, target: %d' % (len(source_df), len(target_df)))
        print('preparing ', out_column, datetime.now())
        out = _prepare_x4(kind, source_df, target_df, grp, out_column, np.float32)
        info(out[out_column].describe())
        feather.write_dataframe(out, out_fname)
        print('wrote ', out_fname)
        del out
        del base
        del source_df
        del target_df
        gc.collect()
    print('done ', datetime.now())
def process(kind):
    """Build x1 aggregate features from a leaked slice of the training data.

    Source rows come from train day 9 at test-like hours (a deliberate,
    controlled leak); the target is either the full train frame or the test
    frame depending on ``kind``. Each feature group is materialized via
    ``_prepare_x1`` into its own feather file; already-written outputs are
    skipped.
    """
    feature_groups = [
        # single features
        ['ip'],
        # ['app'],
        ['device'],
        ['os'],
        ['channel'],
        # pairs
        ['app', 'channel'],
        ['app', 'os'],
        ['app', 'device'],
        # triple
        ['app', 'channel', 'hour'],
        ['app', 'os', 'hour'],
    ]
    for cols in feature_groups:
        out_column = "x1_{}".format('_'.join(cols))
        out_fname = os.path.join(CACHE, '{}_{}.feather'.format(kind, out_column))
        if os.path.exists(out_fname):
            continue

        info('loading train base')
        train_df = load_base('train')
        # controlled leak ^-^
        # mask = (train_df.day.isin([8, 9])) & train_df.hour.isin([4, 5, 9, 10, 13, 14])
        mask = (train_df.day.isin([9])) & train_df.hour.isin([4, 5, 9, 10, 13, 14])
        source_df = train_df[mask]
        target_df = train_df if kind == 'train' else load_base('test')

        info('source: %d, target: %d' % (len(source_df), len(target_df)))
        print('preparing ', out_column, datetime.now())
        out = _prepare_x1(kind, source_df, target_df, cols, out_column, np.float32)
        info(out[out_column].describe())
        feather.write_dataframe(out, out_fname)
        print('wrote ', out_fname)
        del out
        del train_df
        del source_df
        del target_df
        gc.collect()
    print('done ', datetime.now())
def process(kind):
    """Attach the quantile IP bin (`binip`) to the given dataset split.

    Builds (or loads) the shared bin edges via ``_build_bins``, assigns each
    row of the ``kind`` base frame to a bin with ``_prepare_binip``, and
    writes the resulting single-column frame to
    ``CACHE/<kind>_binip.feather``. Skips work if the output already exists.
    """
    out_column = 'binip'
    out_fname = os.path.join(CACHE, '{}_{}.feather'.format(kind, out_column))
    if os.path.exists(out_fname):
        # Typo fix: message previously read 'skippping.'
        info('%s exists, skipping.' % out_fname)
        return

    info("preparing bins")
    bins = _build_bins()
    gc.collect()

    info("reading df")
    df = load_base(kind)

    info('preparing %s' % out_column)
    out = _prepare_binip(bins, df, out_column, kind)
    feather.write_dataframe(out, out_fname)
    print('wrote ', out_fname)
    del out
    del bins
    del df
    gc.collect()
    print('done ', datetime.now())
def process(kind):
    """Build hour-windowed count features.

    these ones did not improve val score.
    #['ip', 'day', 'in_test_hh'],
    #['ip', 'day', 'app', 'in_test_hh'],
    #['ip', 'day', 'device', 'in_test_hh'],
    #['app', 'day', 'in_test_hh'],
    #['channel', 'day', 'in_test_hh'],
    #['binip', 'os', 'hour']
    """
    feature_groups = [
        # ['ip', 'day', 'hour'],
        ['ip', 'os', 'hour'],
        ['ip', 'app', 'hour'],
        ['ip', 'device', 'hour'],
        ['ip', 'app', 'channel', 'hour'],
        # ['ip', 'day', 'app', 'hour'],
    ]
    for cols in feature_groups:
        out_column = 'count_{}'.format('_'.join(cols))
        out_fname = os.path.join(CACHE, '{}_{}.feather'.format(kind, out_column))
        if os.path.exists(out_fname):
            info("%s exists, skipping." % out_fname)
            continue

        df = load_base(kind)
        # Restrict source rows to test-like hours; for train, additionally
        # to day 9 only.
        mask = df.hour.isin([4, 5, 9, 10, 13, 14])
        if kind == 'train':
            mask = mask & df.day.isin([9])
            source_df = df[mask]
        else:
            source_df = df
        target_df = df

        info('source: %d, target: %d' % (len(source_df), len(target_df)))
        info('preparing %s %s' % (out_column, datetime.now()))
        out = _prepare_count(kind, source_df, target_df, cols, out_column, np.float32)
        info(out.info())
        info(out[out_column].describe())
        feather.write_dataframe(out, out_fname)
        info('wrote %s' % out_fname)
        del out
        del source_df
        del target_df
        gc.collect()
    info('done %s' % datetime.now())
def process(kind):
    """Build x2 groupby-aggregation features.

    For ``kind == 'train'`` the aggregation table is computed from a leaked
    slice of the training data (day 9, test-like hours) and pickled to the
    cache; for ``kind == 'test'`` the pickled table from the train run is
    loaded and reused — so the train pass must run first. The table is then
    left-merged onto the target frame and written as a single float32
    feather column per spec.
    """
    GROUPBY_AGGREGATIONS = [
        # {'groupby': ['ip', 'app', 'channel'], 'select': 'day', 'agg': 'var'},
        {'groupby': ['ip', 'app', 'os'], 'select': 'hour', 'agg': 'var'},
        # {'groupby': ['ip', 'day', 'channel'], 'select': 'hour', 'agg': 'var'},
        # {'groupby': ['ip', 'day', 'hour'], 'select': 'channel', 'agg': 'count'},
        {'groupby': ['ip', 'app'], 'select': 'channel', 'agg': 'count'},
        {'groupby': ['ip', 'app', 'os'], 'select': 'channel', 'agg': 'count'},
        # {'groupby': ['ip', 'app', 'day', 'hour'], 'select': 'channel', 'agg': 'count'},
        {'groupby': ['ip', 'app', 'channel'], 'select': 'hour', 'agg': 'mean'},
        # {'groupby': ['app'],
        #  'select': 'ip',
        #  'agg': lambda x: float(len(x)) / len(x.unique()),
        #  'agg_name': 'AvgViewPerDistinct'},
        {'groupby': ['app'], 'select': 'channel', 'agg': 'count'},
        # {'groupby': ['channel'], 'select': 'app', 'agg': 'count'},
    ]

    # Apply all the groupby transformations
    for spec in GROUPBY_AGGREGATIONS:
        # Name of the aggregation we're applying
        agg_name = spec['agg_name'] if 'agg_name' in spec else spec['agg']
        new_feature = '{}_{}_{}'.format('_'.join(spec['groupby']), agg_name, spec['select'])
        out_column = "x2_{}".format(new_feature)
        out_fname = os.path.join(CACHE, '{}_{}.feather'.format(kind, out_column))
        if os.path.exists(out_fname):
            continue

        df = load_base('train')
        # Source: leaked train slice (day 9, test-like hours).
        source_cond = (df.day.isin([9])) & df.hour.isin([4, 5, 9, 10, 13, 14])
        source_df = df[source_cond]
        target_df = df if kind == 'train' else load_base('test')
        info('source: %d, target: %d' % (len(source_df), len(target_df)))

        # Info
        print("Grouping by {}, and aggregating {} with {}".format(
            spec['groupby'], spec['select'], agg_name))

        # Unique list of features to select
        all_features = list(set(spec['groupby'] + [spec['select']]))

        # BUG FIX: the original built this as '_'.join(new_feature), which
        # joins the *characters* of the string and produced mangled pickle
        # names like 'x2_i_p_a_p_p...'. Caches written under the old names
        # will be recomputed once under the correct name.
        group_fname = 'x2_{}'.format(new_feature)
        group_fpath = os.path.join(CACHE, group_fname)

        if kind == 'train':
            print('preparing ', out_column, datetime.now())
            # Perform the groupby
            gp = source_df[all_features] \
                .groupby(spec['groupby'])[spec['select']] \
                .agg(spec['agg']) \
                .reset_index() \
                .rename(index=str, columns={spec['select']: out_column})
            with open(group_fpath, 'wb') as f:
                pickle.dump(gp, f)
        else:
            with open(group_fpath, 'rb') as f:
                gp = pickle.load(f)

        out = target_df.merge(gp, on=spec['groupby'], how='left')
        out = out[[out_column]].astype(np.float32)
        info(out[out_column].describe())
        feather.write_dataframe(out, out_fname)
        info('wrote %s' % out_fname)
        del out
        # Release the large frames before the next iteration (matches the
        # cleanup done by the sibling process() functions).
        del df
        del source_df
        del target_df
        gc.collect()
    print('done ', datetime.now())