Example #1
0
def _build_bins():
    """Compute (and cache) quantile bin edges for per-GROUP click counts.

    Counts rows per GROUP over the combined train + test_v0 frames,
    broadcasts the count back to row level, and derives BINS quantile
    edges with pd.qcut. The edges are pickled to CACHE/binip.pkl and
    reused on subsequent calls.

    Returns:
        The array of bin edges produced by pd.qcut(..., retbins=True).
    """
    bin_fname = os.path.join(CACHE, 'binip.pkl')
    if os.path.exists(bin_fname):
        # NOTE(review): pickle.load is only safe if the cache dir is trusted.
        with open(bin_fname, 'rb') as f:
            bins = pickle.load(f)
            return bins

    # Build full ips from train + test_v0.
    # FIX: DataFrame.append is deprecated (removed in pandas 2.0);
    # pd.concat is the equivalent, supported form.
    df = pd.concat([load_base('train'), load_base('test_v0')])
    assert len(df) == TEST_ROWS_V0 + TRAIN_ROWS

    group = GROUP
    helper = 'is_attributed'
    out_column = 'binip'

    # Per-group row counts; the count column is renamed to the output name.
    gp = df[group + [helper]].groupby(by=group)[helper] \
        .count() \
        .reset_index() \
        .rename(index=str, columns={helper: out_column})

    # Broadcast each group's count onto every row, then take quantile
    # edges over the row-level distribution.
    count = df[group].merge(gp, on=group, how='left')
    _, bins = pd.qcut(count[out_column],
                      BINS,
                      retbins=True,
                      labels=False,
                      duplicates='drop')

    info("==== bins:")
    info(bins)

    with open(bin_fname, 'wb') as f:
        pickle.dump(bins, f)

    return bins
Example #2
0
def process(kind):
    """Build the x4_* features for *kind* ('train' or 'test').

    Source rows are always drawn from the train base (days 6/7/9,
    test-like hours) with the precomputed ip bin attached; the target
    frame is either train itself or the test base. Each group's output
    is cached as a feather file and skipped if already present.
    """
    group_specs = [
        #['binip'],
        ['binip', 'hour'],
        ['binip', 'app', 'channel'],
        ['binip', 'app', 'channel', 'hour'],
        ['binip', 'app', 'os'],
        ['binip', 'app', 'os', 'hour'],
    ]

    for group in group_specs:
        out_column = "x4_{}".format('_'.join(group))
        out_fname = os.path.join(CACHE, '{}_{}.feather'.format(kind, out_column))
        if os.path.exists(out_fname):
            continue

        info('loading base')
        df = load_base('train')

        # Attach the precomputed ip bin to the train rows.
        binip = feather.read_dataframe(
            os.path.join(CACHE, '{}_binip.feather'.format('train')))
        df['binip'] = binip['binip'].values
        del binip
        gc.collect()

        # Leak window: selected days and test-like hours of the train set.
        source_cond = df.day.isin([6, 7, 9]) & df.hour.isin([4, 5, 9, 10, 13, 14])
        # source_cond = (df.day.isin([9])) & df.hour.isin([4,5,9,10,13,14])
        source_df = df[source_cond]
        target_df = df

        if kind == 'test':
            # Target switches to the test base, with its own ip bin attached.
            target_df = load_base('test')
            binip = feather.read_dataframe(
                os.path.join(CACHE, '{}_binip.feather'.format(kind)))
            target_df['binip'] = binip['binip'].values
            del binip
            gc.collect()

        info('source: %d, target: %d' % (len(source_df), len(target_df)))

        print('preparing ', out_column, datetime.now())
        out = _prepare_x4(kind, source_df, target_df, group, out_column, np.float32)

        info(out[out_column].describe())

        feather.write_dataframe(out, out_fname)
        print('wrote ', out_fname)

        # Release the large frames before the next iteration.
        del out, df, source_df, target_df
        gc.collect()

        print('done ', datetime.now())
Example #3
0
def process(kind):
    """Build the x1_* features for *kind* ('train' or 'test').

    Source rows come from train day 9 at test-like hours (controlled
    leak); the target frame is train itself or the test base. Each
    group's output is cached as a feather file and skipped if present.
    """
    group_specs = [
        # single features
        ['ip'],
        #['app'], ['device'], ['os'], ['channel'],

        # pairs
        ['app', 'channel'],
        ['app', 'os'],
        ['app', 'device'],

        # triples
        ['app', 'channel', 'hour'],
        ['app', 'os', 'hour'],
    ]

    for group in group_specs:
        out_column = "x1_{}".format('_'.join(group))
        out_fname = os.path.join(CACHE,
                                 '{}_{}.feather'.format(kind, out_column))
        if os.path.exists(out_fname):
            continue

        info('loading train base')
        df = load_base('train')

        # controlled leak ^-^
        #source_cond = (df.day.isin([8, 9])) & df.hour.isin([4,5,9,10,13,14])
        mask = df.day.isin([9]) & df.hour.isin([4, 5, 9, 10, 13, 14])
        source_df = df[mask]

        target_df = df if kind == 'train' else load_base('test')
        info('source: %d, target: %d' % (len(source_df), len(target_df)))

        print('preparing ', out_column, datetime.now())
        out = _prepare_x1(kind, source_df, target_df, group, out_column,
                          np.float32)

        info(out[out_column].describe())

        feather.write_dataframe(out, out_fname)
        print('wrote ', out_fname)

        # Release the large frames before the next iteration.
        del out, df, source_df, target_df
        gc.collect()

        print('done ', datetime.now())
Example #4
0
def process(kind):
    """Compute the quantile ip bin ('binip') for *kind* and cache it.

    Loads the shared bin edges via _build_bins(), maps the *kind* base
    frame into bins with _prepare_binip(), and writes the result to
    CACHE/<kind>_binip.feather. Skips all work if the output exists.
    """
    out_column = 'binip'
    out_fname = os.path.join(CACHE, '{}_{}.feather'.format(kind, out_column))
    if os.path.exists(out_fname):
        # FIX: corrected typo in the log message ('skippping' -> 'skipping').
        info('%s exists, skipping.' % out_fname)
        return

    info("preparing bins")
    bins = _build_bins()
    gc.collect()

    info("reading df")
    df = load_base(kind)

    info('preparing %s' % out_column)
    out = _prepare_binip(bins, df, out_column, kind)

    feather.write_dataframe(out, out_fname)
    print('wrote ', out_fname)

    # Release the large intermediates before returning.
    del out
    del bins
    del df
    gc.collect()

    print('done ', datetime.now())
Example #5
0
def process(kind):
    """Build the count_* features for *kind* ('train' or 'test').

    Counts are computed over source rows restricted to the test-like
    hours (and, for train, day 9 only) and applied to every row of the
    *kind* base frame. Each group's output is cached as a feather file
    and skipped if already present.

    These ones did not improve val score:
        #['ip', 'day', 'in_test_hh'],
        #['ip', 'day', 'app', 'in_test_hh'],
        #['ip', 'day', 'device', 'in_test_hh'],
        #['app', 'day', 'in_test_hh'],
        #['channel', 'day', 'in_test_hh'],
        #['binip', 'os', 'hour']
    """
    for group in [
        #['ip', 'day', 'hour'],
        ['ip', 'os', 'hour'],
        ['ip', 'app', 'hour'],
        ['ip', 'device', 'hour'],
        ['ip', 'app', 'channel', 'hour'],
        #['ip', 'day', 'app', 'hour'],
    ]:
        out_column = 'count_{}'.format('_'.join(group))
        out_fname = os.path.join(CACHE, '{}_{}.feather'.format(kind, out_column))
        if os.path.exists(out_fname):
            info("%s exists, skipping." % out_fname)
            continue

        df = load_base(kind)

        # Source rows: test-like hours; for train, also restrict to day 9.
        # (FIX: removed dead `source_df = df` that was immediately overwritten.)
        source_cond = df.hour.isin([4, 5, 9, 10, 13, 14])
        if kind == 'train':
            source_cond = source_cond & df.day.isin([9])
        source_df = df[source_cond]
        target_df = df

        info('source: %d, target: %d' % (len(source_df), len(target_df)))

        info('preparing %s %s' % (out_column, datetime.now()))
        out = _prepare_count(kind, source_df, target_df, group, out_column, np.float32)

        info(out.info())
        info(out[out_column].describe())

        feather.write_dataframe(out, out_fname)
        info('wrote %s' % out_fname)

        # Release the large frames before the next iteration.
        del out
        del source_df
        del target_df
        gc.collect()

        info('done %s' % datetime.now())
Example #6
0
def process(kind):
    """Build the x2_* groupby-aggregation features for *kind*.

    For each spec, aggregates `select` within `groupby` over the leak
    window (train day 9, test-like hours). The train run computes and
    pickles the group table; the test run reloads it. The table is
    left-merged onto the target frame and the single feature column is
    written to CACHE/<kind>_<out_column>.feather (skipped if present).
    """
    GROUPBY_AGGREGATIONS = [
        #{'groupby': ['ip','app','channel'], 'select': 'day', 'agg': 'var'},
        {
            'groupby': ['ip', 'app', 'os'],
            'select': 'hour',
            'agg': 'var'
        },
        #{'groupby': ['ip','day','channel'], 'select': 'hour', 'agg': 'var'},
        #{'groupby': ['ip','day','hour'], 'select': 'channel', 'agg': 'count'},
        {
            'groupby': ['ip', 'app'],
            'select': 'channel',
            'agg': 'count'
        },
        {
            'groupby': ['ip', 'app', 'os'],
            'select': 'channel',
            'agg': 'count'
        },
        #{'groupby': ['ip','app','day','hour'], 'select': 'channel', 'agg': 'count'},
        {
            'groupby': ['ip', 'app', 'channel'],
            'select': 'hour',
            'agg': 'mean'
        },

        #{'groupby': ['app'],
        # 'select': 'ip',
        # 'agg': lambda x: float(len(x)) / len(x.unique()),
        # 'agg_name': 'AvgViewPerDistinct'
        #},
        {
            'groupby': ['app'],
            'select': 'channel',
            'agg': 'count'
        },
        #{'groupby': ['channel'], 'select': 'app', 'agg': 'count'}
    ]

    # Apply all the groupby transformations.
    for spec in GROUPBY_AGGREGATIONS:
        # Name of the aggregation we're applying.
        agg_name = spec['agg_name'] if 'agg_name' in spec else spec['agg']

        new_feature = '{}_{}_{}'.format('_'.join(spec['groupby']), agg_name,
                                        spec['select'])
        out_column = "x2_{}".format(new_feature)
        out_fname = os.path.join(CACHE,
                                 '{}_{}.feather'.format(kind, out_column))
        if os.path.exists(out_fname):
            continue

        df = load_base('train')

        # Controlled leak window: train day 9, test-like hours.
        source_cond = (df.day.isin([9])) & df.hour.isin([4, 5, 9, 10, 13, 14])
        source_df = df[source_cond]

        target_df = df if kind == 'train' else load_base('test')
        info('source: %d, target: %d' % (len(source_df), len(target_df)))

        # Info
        print("Grouping by {}, and aggregating {} with {}".format(
            spec['groupby'], spec['select'], agg_name))

        # Unique list of features to select.
        all_features = list(set(spec['groupby'] + [spec['select']]))

        # Pickle path for the group table, shared between train and test runs.
        # FIX: the original used '_'.join(new_feature), which joins the
        # *characters* of the string with underscores and produced a garbled
        # filename; use the feature name directly.
        group_fname = 'x2_{}'.format(new_feature)
        group_fpath = os.path.join(CACHE, group_fname)

        if kind == 'train':
            print('preparing ', out_column, datetime.now())

            # Perform the groupby on the leak window only.
            gp = source_df[all_features]. \
                groupby(spec['groupby'])[spec['select']]. \
                agg(spec['agg']). \
                reset_index(). \
                rename(index=str, columns={spec['select']: out_column})

            with open(group_fpath, 'wb') as f:
                pickle.dump(gp, f)
        else:
            # The test run reuses the table computed by the train run.
            with open(group_fpath, 'rb') as f:
                gp = pickle.load(f)

        out = target_df.merge(gp, on=spec['groupby'], how='left')
        out = out[[out_column]].astype(np.float32)

        info(out[out_column].describe())

        feather.write_dataframe(out, out_fname)
        info('wrote %s' % out_fname)

        # FIX: also release df/source_df/target_df each iteration (the
        # original only deleted `out`, holding several large frames alive).
        del out
        del df
        del source_df
        del target_df
        gc.collect()

        print('done ', datetime.now())