Example 1
def main():
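    # Build the item metadata features: expand the raw property strings
    # (featurize), derive rating and star counts, count the properties per
    # item (the 'list' column is presumably created by featurize), and write
    # the preprocessed file. featurize, count_rating, count_stars, ensure_dir
    # and the RAW/PREP path constants are defined elsewhere in the project.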

    meta = pd.read_csv(RAW + 'item_metadata.csv')
    meta = featurize(meta, 'properties')
    meta['rating'] = count_rating(meta)
    meta['stars'] = count_stars(meta)
    meta['features'] = meta['list'].apply(len)

    ensure_dir(PREP)
    meta.to_csv(PREP + 'item_metadata.csv')
Example 2
def main():
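    # Convert a solution file into the submission format: parse the
    # stringified recommendation and confidence lists, map internal user
    # and session ids back to their original values, flatten the item
    # recommendations into space-separated strings, and keep only the
    # required columns. load_csv, load_maps, check, ensure_dir and the
    # path constants are defined elsewhere in the project.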

    keep = [
        'user_id', 'session_id', 'timestamp', 'step', 'item_recommendations'
    ]

    solution = load_csv(BASE_PATH + SET + 'solution_' + KEY +
                        '.csv').sort_values(['session_id'])
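    # the list columns were serialized as array-like strings; normalize the
    # whitespace to commas so that eval can parse them back into lists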
    solution['recommendations'] = solution['recommendations'].apply(
        lambda x: eval(
            x.replace('\n', '').replace('\r', '').replace(' ', ',').replace(
                ',,', ',').replace(',,', ',').replace(',,', ',').replace(
                    '[,', '[')))
    solution['confidences'] = solution['confidences'].apply(lambda x: eval(
        x.replace('\n', '').replace('\r', '').replace(' ', ',').replace(
            ',,', ',').replace(',,', ',').replace(',,', ',').replace(
                ',,', ',').replace('[,', '[')))

    example = load_csv(BASE_PATH + RAW + 'submission_popular.csv')
    umap, smap = load_maps(BASE_PATH + PREPROCESS)

    solution['item_recommendations'] = solution[
        'recommendations']  #.apply( lambda x: eval(x.replace('\n','').replace('\r','').replace(' ',',').replace(',,',',').replace(',,',',').replace(',,',',').replace('[,','[')) )
    del solution['recommendations']

    def okay(s):
        # convert a list of item ids into the space-separated submission
        # string; scalar entries are printed for debugging before joining
        if isinstance(s, (int, str)):
            print(s)
        return ' '.join(map(str, s))

    print(solution['item_recommendations'])
    solution['item_recommendations'] = solution['item_recommendations'].apply(
        okay)
    print(solution['item_recommendations'])
    solution['user_id'] = solution['user_id'].apply(lambda x: umap[x])
    solution['session_id'] = solution['session_id'].apply(lambda x: smap[x])
    solution = solution[keep]

    if check(solution, example):
        ensure_dir(BASE_PATH + OUT)
        solution.to_csv(BASE_PATH + OUT + 'sub_' + KEY + '.csv')
Example 3
def clean_and_map(data):
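    # Sort the joined log by user, timestamp and step, repair broken and
    # duplicated sessions, derive a numeric item_id from the reference
    # column, map the string identifiers (map_strings) and cache the result
    # as feather. fix_sessions, fix_sessions_duplicate, check_sessions,
    # ensure_dir and PATH_PROCESSED are defined elsewhere in the project.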

    #map cats
    data = data.sort_values(['user_id', 'timestamp',
                             'step']).reset_index(drop=True)
    data = fix_sessions(data)
    data = fix_sessions_duplicate(data)
    ok = check_sessions(data)

    if not ok:
        print('check data')
        exit()

    data['item_id'] = pd.to_numeric(data.reference,
                                    errors='coerce').fillna(-1).astype(int)
    data = map_strings(data)
    #check_sessions(data)
    ensure_dir(PATH_PROCESSED)
    data.to_feather(PATH_PROCESSED + 'joined_tmp.fthr')

    return data
Example 4
def main():
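    # Cross-validated LightGBM lambdarank training: for each fold, the
    # train and validation splits are dumped to CSV, converted into binary
    # LightGBM datasets and cached on disk, a ranker is trained with early
    # stopping, and the test set is scored. The per-fold scores are then
    # averaged (raw and min-max normalized) into the final solution files.
    # create_set, train_test_cv, ensure_dir, evaluate and the configuration
    # constants are defined elsewhere in the project.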

    train = create_set(base_path=BASE_PATH + SET,
                       conf=CONF,
                       key=DSKEY,
                       redo=False)
    len_test = len(train[train.train == 0])
    train.query('train == 1', inplace=True)

    indices = train_test_cv(train, train.label, splits=SPLITS, shuffle=SHUFFLE)

    del train
    gc.collect()

    score = np.zeros((SPLITS, len_test))
    i = 0

    for train_idx, val_idx in indices:

        dataset_file_key = DSKEY + 'SPLIT' + str(i) + 'of' + str(
            SPLITS) + 'SHFL' + str(SHUFFLE)
        ensure_dir(BASE_PATH + SET + 'tmp/')
        file = BASE_PATH + SET + 'tmp/' + dataset_file_key + '_train.lgbm'

        if not Path(file).is_file():

            tstart = time.time()

            train = create_set(base_path=BASE_PATH + SET,
                               conf=CONF,
                               key=DSKEY,
                               redo=False)

            print('loaded in {}'.format((time.time() - tstart)))
            tstart = time.time()

            train.query('train == 1', inplace=True)

            file = BASE_PATH + SET + 'tmp/' + dataset_file_key + '_train.csv'
            #dump_svmlight_file( X_train[KEEP_FEATURES], y_train, file, zero_based=True, multilabel=False )
            train.loc[train_idx, ['label'] + FEATURES].to_csv(file,
                                                              index=False,
                                                              header=False)
            #num_cols = len(X_train.columns)
            q_train = train.loc[train_idx].groupby('session_id').size().values

            d_train = lgbm.Dataset(file)  #, categorical_feature=CAT_FEATURES )
            d_train.set_group(q_train)
            d_train.set_label(train.loc[train_idx, 'label'])
            d_train.set_feature_name(FEATURES)

            #del q_train
            gc.collect()

            file = BASE_PATH + SET + 'tmp/' + dataset_file_key + '_valid.csv'
            #dump_svmlight_file( X_valid[KEEP_FEATURES], y_valid, file, zero_based=True, multilabel=False )
            train.loc[val_idx, ['label'] + FEATURES].to_csv(file,
                                                            index=False,
                                                            header=False)

            q_valid = train.loc[val_idx].groupby('session_id').size().values
            y_val = train.loc[val_idx, 'label']
            del train
            gc.collect()

            d_valid = d_train.create_valid(
                file)  #, categorical_feature=CAT_FEATURES )
            file = BASE_PATH + SET + 'tmp/' + dataset_file_key + '_valid.lgbm'
            d_valid.set_group(q_valid)
            d_valid.set_label(y_val)
            d_valid.set_feature_name(FEATURES)
            d_valid.save_binary(file)
            file = BASE_PATH + SET + 'tmp/' + dataset_file_key + '_train.lgbm'
            d_train.save_binary(file)
            #del q_valid, d_valid, d_train
            gc.collect()

            file = BASE_PATH + SET + 'tmp/' + dataset_file_key + '_train.lgbm'
            d_train = lgbm.Dataset(file)
            file = BASE_PATH + SET + 'tmp/' + dataset_file_key + '_valid.lgbm'
            d_valid = lgbm.Dataset(file)

        else:

            tstart = time.time()

            print('load binary lgbm sets')
            file = BASE_PATH + SET + 'tmp/' + dataset_file_key + '_train.lgbm'
            d_train = lgbm.Dataset(file)
            file = BASE_PATH + SET + 'tmp/' + dataset_file_key + '_valid.lgbm'
            d_valid = lgbm.Dataset(file)

            print('loaded sets in {}'.format((time.time() - tstart)))

        watchlist = [d_train, d_valid]

        params = {}
        params['boosting'] = 'dart'
        params['learning_rate'] = 0.1
        params['application'] = 'lambdarank'
        params['metric'] = 'ndcg'
        params['eval_at'] = '30'
        #params['max_depth'] = -1
        #params['num_leaves'] = 64
        #params['max_bin'] = 512
        params['feature_fraction'] = 0.5
        params['bagging_fraction'] = 0.5
        #params['min_data_in_leaf'] = 20
        #params['verbosity'] = 0

        evals_result = {}
        model = lgbm.train(params,
                           train_set=d_train,
                           num_boost_round=MAX_EPOCHS,
                           valid_sets=watchlist,
                           early_stopping_rounds=STOPPING,
                           evals_result=evals_result,
                           verbose_eval=10)

        ensure_dir(BASE_PATH + SET + 'lgbm/')
        model.save_model(BASE_PATH + SET + 'lgbm/' + ALGKEY + '.' + str(i) +
                         '.txt',
                         num_iteration=model.best_iteration)

        del params, watchlist, d_train, d_valid, evals_result
        gc.collect()

        test = create_set(base_path=BASE_PATH + SET,
                          conf=CONF,
                          key=DSKEY,
                          redo=False)
        test.query('train == 0', inplace=True)
        X_test = test[FEATURES].values.astype(np.float32)
        del test
        gc.collect()

        y_test = model.predict(X_test, num_iteration=model.best_iteration)
        score[i] = y_test
        i += 1

        del y_test, model, X_test
        gc.collect()

    test = create_set(base_path=BASE_PATH + SET,
                      conf=CONF,
                      key=DSKEY,
                      redo=False)
    test.query('train == 0', inplace=True)

    test['prob_norm'] = 0
    test['prob_direct'] = 0
    for i in range(SPLITS):
        test['prob_direct_' + str(i)] = score[i]
        test['prob_norm' + str(i)] = (test['prob_direct_' + str(i)] -
                                      test['prob_direct_' + str(i)].min()) / (
                                          test['prob_direct_' + str(i)].max() -
                                          test['prob_direct_' + str(i)].min())
        test['prob_direct'] += test['prob_direct_' + str(i)]
        test['prob_norm'] += test['prob_norm' + str(i)]

    test['prob_norm'] = test['prob_norm'] / SPLITS
    test['prob_direct'] = test['prob_direct'] / SPLITS

    #truth = pd.read_csv( self.folder + 'truth.csv' )
    #truth['label2'] = 1
    #test = test.merge( truth[['session_id','reference','label2']], left_on=['session_id','impressions'], right_on=['session_id','reference'], how='left' )
    #test['label'] =  test['label2'].fillna(0)
    #del test['label2']

    test = test.sort_values(['session_id', 'prob_norm'], ascending=False)
    #test.to_csv( BASE_PATH + SET + 'test_debugcv.csv' )

    solution = pd.DataFrame()
    solution['recommendations'] = test.groupby('session_id').impressions.apply(
        list)
    solution['confidences'] = test.groupby('session_id').prob_norm.apply(list)
    solution = solution.reset_index()
    solution = solution.merge(
        test[['session_id', 'user_id', 'timestamp',
              'step']].drop_duplicates(keep='last'),
        on='session_id',
        how='inner')
    solution.to_csv(BASE_PATH + '/' + SET + '/solution_' + ALGKEY +
                    '_norm.csv')

    test = test.sort_values(['session_id', 'prob_direct'], ascending=False)
    solution = pd.DataFrame()
    solution['recommendations'] = test.groupby('session_id').impressions.apply(
        list)
    solution['confidences'] = test.groupby('session_id').prob_direct.apply(
        list)
    solution = solution.reset_index()
    solution = solution.merge(
        test[['session_id', 'user_id', 'timestamp',
              'step']].drop_duplicates(keep='last'),
        on='session_id',
        how='inner')
    solution.to_csv(BASE_PATH + '/' + SET + '/solution_' + ALGKEY +
                    '_direct.csv')

    result = evaluate(solution, base=BASE_PATH, dataset=SET)
    print(result.T)
Example 5
def main():
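    # Same cross-validated LightGBM training as above, but the per-fold
    # datasets are built in memory and the test split is cached as feather.
    # The LTR flag switches between a lambdarank and a binary objective.
    # create_set, train_test_cv, load_feather, evaluate and the
    # configuration constants are defined elsewhere in the project.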

    train = create_set(base_path=BASE_PATH + SET,
                       conf=CONF,
                       key=DSKEY,
                       redo=False)
    test = train.query('train == 0')

    test_file_key = DSKEY
    ensure_dir(BASE_PATH + SET + 'tmp/')
    test_file = BASE_PATH + SET + 'tmp/' + test_file_key + '_test.fthr'

    if not Path(test_file).is_file():
        test = test.reset_index(drop=True)
        test.to_feather(test_file)

    test_len = len(test)
    del test
    gc.collect()

    train.query('train == 1', inplace=True)

    X = train[FEATURES + ['session_id']]
    y = train['label']

    del train
    gc.collect()

    score = np.zeros((SPLITS, test_len))
    i = 0

    for train_idx, val_idx in train_test_cv(X,
                                            y,
                                            splits=SPLITS,
                                            shuffle=SHUFFLE):

        X_train = X.loc[train_idx]
        X_valid = X.loc[val_idx]
        y_train = y.loc[train_idx]
        y_valid = y.loc[val_idx]

        if LTR:
            q_train = X_train.groupby(['session_id'
                                       ]).size().values.astype(np.float32)
            q_valid = X_valid.groupby(['session_id'
                                       ]).size().values.astype(np.float32)
            xtrain = X_train[FEATURES].values.astype(np.float32)
            ytrain = y_train.values.astype(np.float32)
            del X_train, y_train
            gc.collect()
            d_train = lgbm.Dataset(
                xtrain, label=ytrain, group=q_train,
                feature_name=FEATURES)  #, categorical_feature=CAT_FEATURES )
            del q_train
            gc.collect()
            xval = X_valid[FEATURES].values.astype(np.float32)
            yval = y_valid.values.astype(np.float32)
            del X_valid, y_valid
            gc.collect()
            d_valid = lgbm.Dataset(
                xval, label=yval, group=q_valid,
                feature_name=FEATURES)  #, categorical_feature=CAT_FEATURES )
            del q_valid
            gc.collect()
        else:
            d_train = lgbm.Dataset(
                X_train[FEATURES], label=y_train, feature_name=FEATURES
            )  #+ ['session_id'])#, categorical_feature=CAT_FEATURES )
            d_valid = lgbm.Dataset(
                X_valid[FEATURES], label=y_valid, feature_name=FEATURES
            )  #+ ['session_id'])#, categorical_feature=CAT_FEATURES )

        watchlist = [d_train, d_valid]

        params = {}
        params['boosting'] = 'dart'
        params['learning_rate'] = 0.1
        if LTR:
            params['application'] = 'lambdarank'
            params['metric'] = 'ndcg'
            params['eval_at'] = '30'
        else:
            params['application'] = 'binary'
            params['metric'] = 'binary_logloss'
        #params['max_depth'] = -1
        #params['num_leaves'] = 64
        #params['max_bin'] = 512
        params['feature_fraction'] = 0.5
        params['bagging_fraction'] = 0.5
        #params['min_data_in_leaf'] = 20
        #params['verbosity'] = 0

        evals_result = {}
        model = lgbm.train(params,
                           train_set=d_train,
                           num_boost_round=MAX_EPOCHS,
                           valid_sets=watchlist,
                           early_stopping_rounds=STOPPING,
                           evals_result=evals_result,
                           verbose_eval=10)

        ensure_dir(BASE_PATH + SET + 'lgbm/')
        model.save_model(
            BASE_PATH + SET + 'lgbm/' + ALGKEY + '.' + str(i) + '.txt',
            num_iteration=model.best_iteration,
        )

        del params, watchlist, d_train, d_valid, evals_result
        gc.collect()

        test = load_feather(test_file)

        X_test = test[FEATURES].values.astype(np.float32)

        y_test = model.predict(X_test, num_iteration=model.best_iteration)
        score[i] = y_test
        i += 1

        del y_test, model, X_test, test
        gc.collect()

    test = load_feather(test_file)

    test['prob_norm'] = 0
    test['prob_direct'] = 0
    for i in range(SPLITS):
        test['prob_direct_' + str(i)] = score[i]
        test['prob_norm' + str(i)] = (test['prob_direct_' + str(i)] -
                                      test['prob_direct_' + str(i)].min()) / (
                                          test['prob_direct_' + str(i)].max() -
                                          test['prob_direct_' + str(i)].min())
        test['prob_direct'] += test['prob_direct_' + str(i)]
        test['prob_norm'] += test['prob_norm' + str(i)]

    test['prob_norm'] = test['prob_norm'] / SPLITS
    test['prob_direct'] = test['prob_direct'] / SPLITS

    #truth = pd.read_csv( self.folder + 'truth.csv' )
    #truth['label2'] = 1
    #test = test.merge( truth[['session_id','reference','label2']], left_on=['session_id','impressions'], right_on=['session_id','reference'], how='left' )
    #test['label'] =  test['label2'].fillna(0)
    #del test['label2']

    test = test.sort_values(['session_id', 'prob_norm'], ascending=False)
    #test.to_csv( BASE_PATH + SET + 'test_debugcv.csv' )

    solution = pd.DataFrame()
    solution['recommendations'] = test.groupby('session_id').impressions.apply(
        list)
    solution['confidences'] = test.groupby('session_id').prob_norm.apply(list)
    solution = solution.reset_index()
    solution = solution.merge(
        test[['session_id', 'user_id', 'timestamp',
              'step']].drop_duplicates(keep='last'),
        on='session_id',
        how='inner')
    solution.to_csv(BASE_PATH + '/' + SET + '/solution_' + ALGKEY +
                    '_norm.csv')

    test = test.sort_values(['session_id', 'prob_direct'], ascending=False)
    solution = pd.DataFrame()
    solution['recommendations'] = test.groupby('session_id').impressions.apply(
        list)
    solution['confidences'] = test.groupby('session_id').prob_direct.apply(
        list)
    solution = solution.reset_index()
    solution = solution.merge(
        test[['session_id', 'user_id', 'timestamp',
              'step']].drop_duplicates(keep='last'),
        on='session_id',
        how='inner')
    solution.to_csv(BASE_PATH + '/' + SET + '/solution_' + ALGKEY +
                    '_direct.csv')

    result = evaluate(solution, base=BASE_PATH, dataset=SET)
    print(result.T)
Example 6
def crawl(folder, url):
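    # Fetch every item that has not been processed yet from the given url
    # and periodically pickle the partial results to disk. Failed requests
    # are retried with exponential back-off. get_items, get_processed,
    # get_item, ensure_dir and the RETRY/DUMP_AFTER constants are defined
    # elsewhere in the project.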

    items = get_items(BASE_PATH + RAW_FOLDER)
    items_done = get_processed(folder)

    items = list(filter(lambda x: x not in items_done, items))

    result_map = {}

    if len(items) == 0:
        print('nothing left to crawl')
        return

    first = items[0]

    togo = len(items)
    tstart = time.time()

    try:

        retries = RETRY
        problem = False
        dump = DUMP_AFTER

        i = 0

        while not problem:

            if i == len(items):
                break

            try:

                item = int(items[i])
                result_map[item] = get_item(url, item)
                #time.sleep(1)
                i += 1
                retries = RETRY
                togo -= 1
                dump -= 1

                if dump == 0:
                    ensure_dir(folder)
                    pickle.dump(
                        result_map,
                        open(folder + 'from_' + str(first) + '.pkl', 'wb'))
                    dump = DUMP_AFTER
                    first = item
                    result_map = {}

                if togo % 100 == 0:
                    spent = time.time() - tstart
                    done = i  # i items have been fetched in this run so far
                    each = spent / done
                    left = each * togo
                    eta = datetime.timedelta(seconds=left)
                    spent = datetime.timedelta(seconds=spent)

                    print('done {} of {} in {}, {} left'.format(
                        done, len(items), spent, eta))

            except Exception:
                retries -= 1
                print('retries ', retries)
                if retries <= 0:
                    raise
                wait = RETRY - retries + 1
                time.sleep(pow(2, wait))

    except Exception:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        print("*** print_tb:")
        traceback.print_tb(exc_traceback)
        ensure_dir(folder)
        pickle.dump(result_map,
                    open(folder + 'from_' + str(first) + '.pkl', 'wb'))

    ensure_dir(folder)
    pickle.dump(result_map, open(folder + 'from_' + str(first) + '.pkl', 'wb'))
Example 7
def create_latent_factors( full, size=32, actions=None, values=None, key='all', method='bpr' ):
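    # Build a sparse item x session interaction matrix from the selected
    # action types and factorize it with BPR or ALS (implicit) or NMF
    # (dc is presumably sklearn.decomposition). The item and session
    # factors are written to CSV; the final loop only prints a dot-product
    # reconstruction of the observed values as a sanity check.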
    
    start = time.time()
    
    full = full[full.action_type.isin(actions)]
    
    if IMPRESSION in actions:
        full = extend_clicks( full )
    
    full = full.drop_duplicates( ['session_id','reference','action_type'], keep='last' )
    full = full[~full.reference.isnull() & (full.exclude == 0)]
        
    items = full['reference'].unique()
    item_map = pd.Series( index=items, data=range(len(items)) )
    full['item_idx'] = full['reference'].map( item_map )
    
    sessions = full['session_id'].unique()
    session_map = pd.Series( index=sessions, data=range(len(sessions)) )
    full['session_idx'] = full['session_id'].map( session_map )
    
    full['value'] = 1
    for i,action in enumerate(actions):
        # .ix was removed from pandas; .loc is the equivalent label-based setter
        full.loc[full.action_type == action, 'value'] = values[i]
    
    full['value'] = full.groupby( ['session_id','reference'] ).value.transform(max)
    full = full.drop_duplicates( ['session_id','reference','action_type'], keep='last' )
    
    SPM = sparse.csr_matrix(( full['value'].tolist(), (full.item_idx, full.session_idx)), shape=( full.item_idx.nunique(), full.session_idx.nunique() ))
    
    print( 'created user features in ',(time.time() - start) )
    
    start = time.time()
    
    if method == 'bpr':
        model = implicit.bpr.BayesianPersonalizedRanking( factors=size-1, iterations=200, use_gpu=False )
        model.fit(SPM)
        If = model.item_factors
        Sf =  model.user_factors
    elif method == 'als':
        model = implicit.als.AlternatingLeastSquares( factors=size, iterations=200, use_gpu=False, calculate_training_loss=True )
        model.fit(SPM)
        If = model.item_factors
        Sf =  model.user_factors
    elif method == 'nmf':
        nmf = dc.NMF(n_components=size, init='random', random_state=0, max_iter=500, verbose=1)
        If = nmf.fit_transform( SPM )
        Sf = nmf.components_.T

    # train the model on a sparse matrix of item/user/confidence weights

    IF = ['if_'+str(i) for i in range(size)]
    SF = ['sf_'+str(i) for i in range(size)]
    
    Sf = pd.DataFrame( Sf, index=full.session_idx.unique() )
    Sf.columns = SF
    Sf['session_id'] = session_map.index
    If = pd.DataFrame( If, index=full.item_idx.unique() )
    If.columns = IF
    If['item_id'] = item_map.index
    
    item_emb = If.sort_values('item_id')[IF].values
    session_emb = Sf.sort_values('session_id')[SF].values
    
    ensure_dir( DATA_FOLDER + 'latent/' )
    If.to_csv( DATA_FOLDER + 'latent/' + method + 'c_'+key+'_item_features.'+str(size)+'.csv', index=False)
    Sf.to_csv( DATA_FOLDER + 'latent/' + method + 'c_'+key+'_session_features.'+str(size)+'.csv', index=False)
    
    print('created latent features in ',(time.time() - start))
    
    res = []
    
    for row in full.itertuples():
        session = session_emb[int(row.session_idx)]
        item = item_emb[row.item_idx]
        
        res.append( np.dot( item, session.T ) )
        
    full['reconst'] = res
    
    print( full[['item_idx','session_idx','value','reconst']] )
Example 8
File: price.py Project: rn5l/rsc19
def create_features(base_path,
                    log,
                    examples,
                    min_occurences=None,
                    hidden=False,
                    train_only=False,
                    fillna_mean=False):
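    # Derive price features for the ranking examples: click-price aggregates
    # grouped by item, city, platform and impression list, plus price
    # statistics derived from the full action log. price_by_group_imp,
    # price_by_group_action, expand and ensure_dir are defined elsewhere in
    # the project.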

    tstart = time.time()
    print('create_features price')

    cols_pre = examples.columns.values

    mask_log = log.hidden > -1
    mask_examples = examples.train > -1
    if train_only:
        mask_log = mask_log & (log.train == 1)
        mask_examples = mask_examples & (examples.train == 1)
    if not hidden:
        mask_log = mask_log & (log.hidden < 1)

    clicks = log[log.action_type == CLICK][[
        'train', 'session_id', 'prices', 'impressions', 'city', 'platform'
    ]].copy()
    clicks = expand(clicks, ['impressions', 'prices'])
    clicks = clicks.drop_duplicates(['session_id', 'impressions'], keep='last')
    mask_clicks = clicks.train > -1 if not train_only else clicks.train == 1

    ensure_dir(base_path + 'tmp/')
    clicks.groupby('city').prices.mean().to_csv(base_path + 'tmp/' +
                                                'city_price.csv')

    examples = price_by_group_imp(examples,
                                  clicks,
                                  mask_clicks,
                                  group=['impressions'],
                                  key='item',
                                  min_occurences=min_occurences,
                                  fillna_mean=fillna_mean)
    examples = price_by_group_imp(examples,
                                  clicks,
                                  mask_clicks,
                                  group=['city'],
                                  key='city',
                                  min_occurences=min_occurences,
                                  fillna_mean=fillna_mean)
    examples = price_by_group_imp(examples,
                                  clicks,
                                  mask_clicks,
                                  group=['platform'],
                                  key='platform',
                                  min_occurences=min_occurences,
                                  fillna_mean=fillna_mean)
    examples = price_by_group_imp(examples,
                                  clicks,
                                  mask_clicks,
                                  group=['city', 'platform'],
                                  key='city_platform',
                                  min_occurences=min_occurences,
                                  fillna_mean=fillna_mean)
    examples = price_by_group_imp(examples,
                                  examples,
                                  mask_examples,
                                  group=['session_id'],
                                  key='list',
                                  min_occurences=min_occurences,
                                  fillna_mean=fillna_mean)
    del clicks
    gc.collect()

    #     clickprice = pd.DataFrame()
    #     clickprice['prices_click'] = examples[(examples.train==1) & (examples.label==1)].groupby( 'session_id' ).prices.min()
    #     examples = examples.merge( clickprice, left_on='session_id', right_index=True, how='left' )
    #     del clickprice

    print(sum(mask_log))
    examples = price_by_group_action(log,
                                     examples,
                                     mask_log,
                                     group=['item_id'],
                                     group_examples=['impressions'],
                                     key='item',
                                     hidden=hidden,
                                     min_occurences=min_occurences,
                                     fillna_mean=fillna_mean)
    examples = price_by_group_action(log,
                                     examples,
                                     mask_log,
                                     group=['platform'],
                                     key='platform',
                                     hidden=hidden,
                                     min_occurences=min_occurences,
                                     fillna_mean=fillna_mean)
    #examples = price_by_group_action(log, examples, mask_log, group=['device'], key='device', hidden=hidden, min_occurences=min_occurences, fillna_mean=fillna_mean)
    examples = price_by_group_action(log,
                                     examples,
                                     mask_log,
                                     group=['city'],
                                     key='city',
                                     hidden=hidden,
                                     min_occurences=min_occurences,
                                     fillna_mean=fillna_mean)
    examples = price_by_group_action(log,
                                     examples,
                                     mask_log,
                                     group=['city', 'platform'],
                                     key='city_platform',
                                     hidden=hidden,
                                     min_occurences=min_occurences,
                                     fillna_mean=fillna_mean)

    #     del examples['prices_click']

    new_cols = np.setdiff1d(examples.columns.values, cols_pre)

    print('create_features price in {}s'.format((time.time() - tstart)))

    return examples, new_cols
Example 9
def main():
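    # Single train/validation split LightGBM training with optional
    # importance-based feature filtering (FS_IMP), optional feature
    # selection (FS) and optional stacking output. The trained model is
    # saved, the test split is scored and the resulting submission is
    # evaluated. create_set, get_features_by_importance, feature_selection,
    # export_importance, evaluate and the configuration constants are
    # defined elsewhere in the project.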

    tstart = time.time()

    train = create_set(base_path=BASE_PATH + SET,
                       conf=CONF,
                       key=DSKEY,
                       redo=False)
    #train = resolve_na(train)

    print('loaded in {}'.format((time.time() - tstart)))
    tstart = time.time()

    #test = train.query('train == 0')
    train.query('train == 1', inplace=True)

    print('split in {}'.format((time.time() - tstart)))
    tstart = time.time()

    if FS_IMP is not None:
        FEATURES_IMP = get_features_by_importance(FS_IMP)
    else:
        FEATURES_IMP = FEATURES

    print([item for item, count in Counter(FEATURES).items() if count > 1])

    y = train['label']
    X = train[FEATURES_IMP + ['session_id']]

    #input("Press Enter to continue...")

    print('FEATURES in in {}'.format((time.time() - tstart)))
    tstart = time.time()

    if STACK:
        train_stack = train[[
            'user_id', 'session_id', 'step', 'timestamp', 'impressions'
        ]].copy()
    del train
    gc.collect()

    print('gc collect in in {}'.format((time.time() - tstart)))
    tstart = time.time()

    if FS is not None:
        check_cols(X)
        keep = feature_selection(X[FEATURES_IMP], y, FS)
        KEEP_FEATURES = [FEATURES_IMP[i] for i in keep]
    else:
        KEEP_FEATURES = FEATURES_IMP

    X_train, X_valid, y_train, y_valid = train_test_split(X,
                                                          y,
                                                          test_size=VALID,
                                                          shuffle=SHUFFLE)

    print('split in in {}'.format((time.time() - tstart)))
    tstart = time.time()

    if LTR:
        q_train = X_train.groupby(['session_id'
                                   ]).size().values.astype(np.float32)
        q_valid = X_valid.groupby(['session_id'
                                   ]).size().values.astype(np.float32)
        xtrain = X_train[KEEP_FEATURES].values.astype(np.float32)
        ytrain = y_train.values.astype(np.float32)
        del X_train, y_train
        gc.collect()
        d_train = lgbm.Dataset(
            xtrain, label=ytrain, group=q_train,
            feature_name=KEEP_FEATURES)  #, categorical_feature=CAT_FEATURES )
        del q_train
        gc.collect()
        xval = X_valid[KEEP_FEATURES].values.astype(np.float32)
        yval = y_valid.values.astype(np.float32)
        del X_valid, y_valid
        gc.collect()
        d_valid = lgbm.Dataset(
            xval, label=yval, group=q_valid,
            feature_name=KEEP_FEATURES)  #, categorical_feature=CAT_FEATURES )
        del q_valid
        gc.collect()
    else:
        xtrain = X_train[KEEP_FEATURES].values.astype(np.float32)
        ytrain = y_train.values.astype(np.float32)
        del X_train, y_train
        gc.collect()
        d_train = lgbm.Dataset(
            xtrain, label=ytrain, feature_name=KEEP_FEATURES
        )  #+ ['session_id'])#, categorical_feature=CAT_FEATURES )
        del xtrain, ytrain
        gc.collect()
        xval = X_valid[KEEP_FEATURES].values.astype(np.float32)
        yval = y_valid.values.astype(np.float32)
        del X_valid, y_valid
        gc.collect()
        d_valid = lgbm.Dataset(
            xval, label=yval, feature_name=KEEP_FEATURES
        )  #+ ['session_id'])#, categorical_feature=CAT_FEATURES )
        del xval, yval
        gc.collect()

    print('create sets in {}'.format((time.time() - tstart)))
    tstart = time.time()

    watchlist = [d_train, d_valid]

    params = {}
    params['boosting'] = 'dart'
    params['learning_rate'] = 0.1
    if LTR:
        params['application'] = 'lambdarank'
        params['metric'] = 'ndcg'
        params['eval_at'] = '30'
        #params['group_column'] = 'name:session_id'
    else:
        params['application'] = 'binary'
        params['metric'] = 'binary_logloss'
#     params['max_depth'] = 34
#     params['num_leaves'] = 234
#     params['max_bin'] = 485
#     params['feature_fraction'] = 0.202505
#     params['bagging_fraction'] = 0.823505
#     params['min_data_in_leaf'] = 15
    params['feature_fraction'] = 0.5
    params['bagging_fraction'] = 0.5
    #params['bagging_freq'] = 5
    #params['verbosity'] = 0

    evals_result = {}
    model = lgbm.train(params,
                       train_set=d_train,
                       num_boost_round=10000,
                       valid_sets=watchlist,
                       early_stopping_rounds=STOPPING,
                       evals_result=evals_result,
                       verbose_eval=10)  #, feval=mrr )

    print('train in in {}'.format((time.time() - tstart)))
    tstart = time.time()

    #     ax = lgbm.plot_metric(evals_result, metric='auc')
    #     plt.show()

    export_importance(model,
                      KEEP_FEATURES,
                      export=FS is None and FS_IMP is None)

    ensure_dir(BASE_PATH + SET + 'lgbm/')
    model.save_model(BASE_PATH + SET + 'lgbm/' + ALGKEY + '.txt',
                     num_iteration=model.best_iteration)

    test = create_set(base_path=BASE_PATH + SET,
                      conf=CONF,
                      key=DSKEY,
                      redo=False)
    test.query('train == 0', inplace=True)

    X_test = test[KEEP_FEATURES]
    y_test = model.predict(X_test, num_iteration=model.best_iteration)

    print('predict in {}'.format((time.time() - tstart)))
    tstart = time.time()

    test['prob'] = y_test

    if STACK:
        test[[
            'user_id', 'session_id', 'step', 'timestamp', 'impressions', 'prob'
        ]].to_csv(BASE_PATH + '/' + SET + '/stacking/teprobs_' + ALGKEY +
                  '.csv')

        y_pred = model.predict(X[KEEP_FEATURES])
        train_stack['prob'] = y_pred
        train_stack[[
            'user_id', 'session_id', 'step', 'timestamp', 'impressions', 'prob'
        ]].to_csv(BASE_PATH + '/' + SET + '/stacking/trprobs_' + ALGKEY +
                  '.csv')


#     truth = pd.read_csv( BASE_PATH + SET + 'truth.csv' )
#     truth['label2'] = 1
#     test = test.merge( truth[['session_id','reference','label2']], left_on=['session_id','impressions'], right_on=['session_id','reference'], how='left' )
#     test['label'] =  test['label2'].fillna(0)
#     del test['label2']

    test = test.sort_values(['session_id', 'prob'], ascending=False)

    #     test.to_csv( BASE_PATH + SET + '/test_examine.csv' )

    solution = pd.DataFrame()
    solution['recommendations'] = test.groupby('session_id').impressions.apply(
        list)
    solution['confidences'] = test.groupby('session_id').prob.apply(list)
    solution = solution.reset_index()
    solution = solution.merge(
        test[['session_id', 'user_id', 'timestamp',
              'step']].drop_duplicates(keep='last'),
        on='session_id',
        how='inner')
    solution.to_csv(BASE_PATH + '/' + SET + '/solution_' + ALGKEY + '.csv')

    result = evaluate(solution, base=BASE_PATH, dataset=SET)
    print(result.T)
Example 10
def create_latent_factors( full, size=32, actions=ACTIONS_CLICK, key=KEY_CLICK ):
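    # Train a Doc2Vec model (gensim) on the per-session sequences of item
    # references. The learned document vectors are stored as session
    # features and the word vectors as item features, both written to CSV.
    # ACTIONS_CLICK, KEY_CLICK, ITERATIONS, DATA_FOLDER and ensure_dir are
    # defined elsewhere in the project.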
    
    start = time.time()
    
    full = full[full.action_type.isin(actions)]
    
    full = full.drop_duplicates( ['session_id','reference','action_type'], keep='last' )
    full = full[~full.reference.isnull() & (full.exclude == 0)]
    
    items = set( full.reference.unique() )
    print( len(items) )
    
    lists = pd.DataFrame()
    lists['session_id'] = full.groupby('session_id').session_id.min()
    lists['sequence'] = full.groupby('session_id').reference.apply( list )
    del full
    
    sequences = []
    
    for row in lists.itertuples():
        props = [str(i) for i in row.sequence]
        sequences.append( TaggedDocument(words=props, tags=[str(row.session_id)]) )
    
    print( 'created sequences in ',(time.time() - start) )
    
    print('ITEM2VEC FEATURES')
    start = time.time()
    
    model = Doc2Vec(vector_size=size, window=5, min_count=1, workers=4)
    model.build_vocab(sequences)
    print('vocab build')
    
    for i in range(ITERATIONS):
        model.train(sequences, epochs=1, total_examples=model.corpus_count)
        print('trained {} in {}'.format( i, ( time.time() - start ) ))
    
    d = {}  
    for item in lists.session_id.values:
        # session vectors are stored as document vectors under the session tag
        d[str(item)] = model.docvecs[str(item)]
    
    frame = pd.DataFrame( d )
    frame = frame.T
    frame.columns = ['sf_'+str(i) for i in range(size)]
    frame['session_id'] = pd.to_numeric( frame.index ).astype(np.int32)
    
    ensure_dir( DATA_FOLDER + 'latent/' )
    frame.to_csv( DATA_FOLDER + 'latent/' + 'd2v_'+key+'_session_features.'+str(size)+'.csv', index=False)
    
    d = {}  
    for item in items:
        d[str(item)] = model.wv[str(item)]
    
    frame = pd.DataFrame( d )
    frame = frame.T
    frame.columns = ['if_'+str(i) for i in range(size)]
    frame['item_id'] = pd.to_numeric( frame.index ).astype(np.int32)
    
    frame.to_csv( DATA_FOLDER + 'latent/' + 'd2v_'+key+'_item_features.'+str(size)+'.csv', index=False)
    
    print('created latent features in ',(time.time() - start))