Example #1
0
def create_dataset(mode, cluster):
    """Build and persist the stacking train/test datasets.

    Merges the label, position and score features for ``mode`` and
    ``cluster`` and writes, under
    ``dataset/preprocessed/<cluster>/<mode>/stacking/``:

    * ``X_train`` / ``X_test``: CSR feature matrices (via ``save_npz``)
    * ``y_train.csv`` / ``y_test.csv``: the ``label`` column
    * ``group_train`` / ``group_test``: the output of ``create_groups``
    * ``train_indices``: the third value returned by ``merge_features``

    NOTE(review): ``DataFrame.to_sparse`` only exists in pandas < 1.0, so
    this function needs an older pandas to run.
    """
    stacking_features = [
        ImpressionLabel,
        ImpressionPositionSession,
        ScoresRNN,
        ScoresXGB,
    ]

    train_df, test_df, train_idxs, _unused = merge_features(
        mode, cluster, stacking_features, merge_kind='left')

    # -1 values are turned into NaN in both frames before saving
    train_df = train_df.replace(-1, np.nan)
    test_df = test_df.replace(-1, np.nan)

    out_dir = 'dataset/preprocessed/{}/{}/stacking/'.format(cluster, mode)
    check_folder(out_dir)

    # columns that are identifiers/targets and not model features
    non_feature_cols = ['index', 'user_id', 'session_id', 'item_id', 'label']

    for split, frame in (('train', train_df), ('test', test_df)):
        features = frame.drop(non_feature_cols, axis=1)
        matrix = (features.to_sparse(fill_value=0)
                  .astype(np.float64)
                  .to_coo()
                  .tocsr())
        save_npz(join(out_dir, 'X_{}'.format(split)), matrix)
        print('X_{} saved'.format(split))

        labels = frame[['label']]
        labels.to_csv(join(out_dir, 'y_{}.csv'.format(split)))
        print('y_{} saved'.format(split))

        groups = create_groups(frame)
        print(len(groups))
        np.save(join(out_dir, 'group_{}'.format(split)), groups)
        print('{} groups saved'.format(split))

        # the row indices are only stored for the training split
        if split == 'train':
            np.save(join(out_dir, 'train_indices'), train_idxs)

        print('{} data completed'.format(split))
Example #2
0
def create_dataset(mode, cluster):
    """Create the lightGBM dataset files for ``mode`` and ``cluster``.

    Output goes to ``../dataset/preprocessed/<cluster>/<mode>/lightGBM/``
    relative to this file's directory:

    * ``svmlight_train.txt`` is written only when it does not exist yet.
    * ``test.csv`` is written only when it does not exist yet.
    * ``svmlight_test.txt`` is always (re)written.
    """
    # training
    features_array = [
        ActionsInvolvingImpressionSession,
        ImpressionLabel,
        ImpressionPriceInfoSession,
        TimingFromLastInteractionImpression,
        TimesUserInteractedWithImpression,
        ImpressionPositionSession,
        LastInteractionInvolvingImpression,
        TimesImpressionAppearedInClickoutsSession,
        MeanPriceClickout,
        SessionLength,
        TimeFromLastActionBeforeClk,
        FrenzyFactorSession,
        PricePositionInfoInteractedReferences,
        SessionDevice,
        SessionFilterActiveWhenClickout,
        SessionSortOrderWhenClickout,
        ImpressionFeature,
    ]

    here = Path(__file__).absolute().parent
    data_dir = here.joinpath(
        '..', 'dataset/preprocessed/{}/{}/lightGBM/'.format(cluster, mode))
    print(data_dir)
    out_prefix = str(data_dir)
    check_folder(out_prefix)

    train_df, test_df = merge_features(mode, cluster, features_array)

    train_path = out_prefix + '/svmlight_train.txt'
    if not os.path.isfile(train_path):
        to_queries_dataset(train_df, path=train_path)
    else:
        print('Train File già presente')

    test_csv_path = out_prefix + '/test.csv'
    if os.path.isfile(test_csv_path):
        print('Test File già presente')
    else:
        test_df.to_csv(test_csv_path, index=False)

    # the svmlight test file is regenerated whether or not test.csv existed
    to_queries_dataset(test_df, path=out_prefix + '/svmlight_test.txt')
Example #3
0
def create_dataset(mode, cluster):
    """Create the catboost ``train.csv`` / ``test.csv`` for ``mode``/``cluster``.

    Features are merged with ``merge_kind='left'``, ``onehot=False`` and
    ``create_not_existing_features=True``; missing values are filled with -1
    and both frames are written (without the index) to
    ``../dataset/preprocessed/<cluster>/<mode>/catboost/`` relative to this
    file's directory.
    """
    features_array = [
        ImpressionPositionSession,
        ImpressionPriceInfoSessionOld,
        ImpressionRatingNumeric,
        ImpressionLabel,
        LastActionInvolvingImpression,
        MeanPriceClickout,
        AvgPriceInteractions,
        SessionDevice,
        NumImpressionsInClickout,
        SessionLengthOld,
        TimesImpressionAppearedInClickoutsSession,
        TimesUserInteractedWithImpression,
        TimingFromLastInteractionImpression,
        TopPopPerImpression,
        TopPopInteractionClickoutPerImpression,
        ChangeImpressionOrderPositionInSession,
        FrenzyFactorSession,
        DayOfWeekAndMomentInDay,
        LastClickoutFiltersSatisfaction,
        TimePerImpression,
        PersonalizedTopPop,
        PriceQuality,
        PlatformFeaturesSimilarity,
        LastActionBeforeClickout,
        ImpressionStarsNumeric,
        StepsBeforeLastClickout,
        LocationReferencePercentageOfClickouts,
        LocationReferencePercentageOfInteractions,
        NumTimesItemImpressed,
        PercClickPerImpressions,
        PlatformReferencePercentageOfClickouts,
        PlatformReferencePercentageOfInteractions,
        PlatformSession,
        User2ItemOld,
        LazyUser,
        PastFutureSessionFeatures,
        SessionSortOrderWhenClickout,
        SessionActionNumRefDiffFromImpressions,
        ActionsInvolvingImpressionSession,
        SessionNumClickouts,
    ]

    here = Path(__file__).absolute().parent
    data_dir = here.joinpath(
        '..', 'dataset/preprocessed/{}/{}/catboost/'.format(cluster, mode))
    print(data_dir)
    out_prefix = str(data_dir)
    check_folder(out_prefix)

    merge_options = dict(merge_kind='left',
                         onehot=False,
                         create_not_existing_features=True)
    train_df, test_df, _, __ = merge_features(mode, cluster, features_array,
                                              **merge_options)

    # missing values are encoded as -1 in the saved CSVs
    train_df = train_df.fillna(-1)
    test_df = test_df.fillna(-1)

    # (an export through to_pool_dataset to catboost_train.txt is disabled)
    train_df.to_csv(out_prefix + '/train.csv', index=False)
    print('Train saved')

    test_df.to_csv(out_prefix + '/test.csv', index=False)
def create_dataset(mode, cluster, class_weights=False):
    """Build and persist an xgboost train/test dataset for a chosen feature set.

    The feature set name (``kind``) is read interactively from stdin.
    Output is written under
    ``dataset/preprocessed/<cluster>/<mode>/xgboost/<kind>/``:

    * ``X_train`` / ``X_test``: CSR feature matrices (via ``save_npz``)
    * ``user_session_item_{train,test}.csv``: the identifier columns
    * ``y_{train,test}.csv``: the ``label`` column
    * ``group_{train,test}``: the output of ``create_groups``
    * ``train_indices``: the third value returned by ``merge_features``
    * ``class_weights`` (only when ``class_weights`` is true)

    Args:
        mode: forwarded to ``merge_features`` and used in the output path.
        cluster: forwarded to ``merge_features`` and used in the output path.
        class_weights: when true, the merged frames must contain a
            ``weights`` column; one weight per distinct
            (user_id, session_id, weights) row is saved and the column is
            excluded from the feature matrices.

    Raises:
        ValueError: if the entered kind is not one of the known feature sets
            (raised before anything is merged or written).

    NOTE(review): ``DataFrame.to_sparse`` only exists in pandas < 1.0, so
    this function needs an older pandas to run.
    """
    # training
    kind = input('insert the kind: ')

    # Kinds are mutually exclusive; an elif chain keeps only the selected
    # list evaluated and lets unknown kinds fail loudly below.
    if kind == 'label':
        features_array = [ImpressionLabel]

    elif kind == 'no_bias':
        features_array = [
            PlatformSession,
            PlatformFeaturesSimilarity,
            AvgPriceInteractions,
            ChangeImpressionOrderPositionInSession,
            DayOfWeekAndMomentInDay,
            FrenzyFactorSession,
            ImpressionPositionSession,
            ImpressionPriceInfoSessionOld,
            ImpressionRatingNumeric,
            ActionsInvolvingImpressionSession,
            SessionNumClickouts,
            ImpressionStarsNumeric,
            ImpressionLabel,
            LastActionInvolvingImpression,
            LastActionBeforeClickout,
            TimesImpressionAppearedInClickoutsSession,
            LastClickoutFiltersSatisfaction,
            StepsBeforeLastClickout,
            LazyUser,
            MeanPriceClickout,
            NumImpressionsInClickout,
            SessionLengthOld,
            NumTimesItemImpressed,
            PercClickPerImpressions,
            PlatformReferencePercentageOfClickouts,
            PriceQuality,
            SessionDevice,
            SessionSortOrderWhenClickout,
            TimePerImpression,
            TimesUserInteractedWithImpression,
            TimingFromLastInteractionImpression,
            TopPopInteractionClickoutPerImpression,
            TopPopPerImpression,
            User2ItemOld,
        ]

    elif kind == 'content':
        features_array = [
            ImpressionPriceInfoSessionOld,
            ImpressionRatingNumeric,
            ImpressionLabel,
            MeanPriceClickout,
            SessionDevice,
            NumImpressionsInClickout,
            SessionLengthOld,
            TopPopPerImpression,
            TopPopInteractionClickoutPerImpression,
            ChangeImpressionOrderPositionInSession,
            FrenzyFactorSession,
            DayOfWeekAndMomentInDay,
            LastClickoutFiltersSatisfaction,
            PersonalizedTopPop,
            PriceQuality,
            PlatformFeaturesSimilarity,
            LastActionBeforeClickout,
            ImpressionStarsNumeric,
            StepsBeforeLastClickout,
            LocationReferencePercentageOfClickouts,
            LocationReferencePercentageOfInteractions,
            NumTimesItemImpressed,
            PercClickPerImpressions,
            PlatformReferencePercentageOfClickouts,
            PlatformReferencePercentageOfInteractions,
            User2ItemOld,
            SessionSortOrderWhenClickout,
            SessionActionNumRefDiffFromImpressions,
            SessionNumClickouts,
        ]

    elif kind == 'session_filters':
        features_array = [SessionFilterActiveWhenClickout, ImpressionLabel]

    elif kind == 'impression_feature':
        features_array = [ImpressionFeatureCleaned, ImpressionLabel]

    elif kind == 'kind2':
        features_array = [
            (LazyUser, False),
            PriceQuality,
            PlatformFeaturesSimilarity,
            PersonalizedTopPop,
            TimePerImpression,
            DayOfWeekAndMomentInDay,
            LastClickoutFiltersSatisfaction,
            FrenzyFactorSession,
            ChangeImpressionOrderPositionInSession,
            User2Item,
            PlatformSession,
            PlatformReferencePercentageOfInteractions,
            PercClickPerImpressions,
            PlatformReferencePercentageOfClickouts,
            NumImpressionsInClickout,
            NumTimesItemImpressed,
            LocationReferencePercentageOfClickouts,
            LocationReferencePercentageOfInteractions,
            StepsBeforeLastClickout,
            ImpressionStarsNumeric,
            LastActionBeforeClickout,
            TopPopPerImpression,
            TopPopInteractionClickoutPerImpression,
            ImpressionRatingNumeric,
            ActionsInvolvingImpressionSession,
            ImpressionLabel,
            ImpressionPriceInfoSession,
            TimingFromLastInteractionImpression,
            TimesUserInteractedWithImpression,
            ImpressionPositionSession,
            LastActionInvolvingImpression,
            SessionDevice,
            SessionSortOrderWhenClickout,
            MeanPriceClickout,
            PriceInfoSession,
            SessionLength,
            TimesImpressionAppearedInClickoutsSession,
        ]

    elif kind == 'kind1':
        features_array = [
            ImpressionPositionSession,
            ImpressionPriceInfoSessionOld,
            ImpressionRatingNumeric,
            ImpressionLabel,
            LastActionInvolvingImpression,
            MeanPriceClickout,
            AvgPriceInteractions,
            SessionDevice,
            NumImpressionsInClickout,
            SessionLengthOld,
            TimesImpressionAppearedInClickoutsSession,
            TimesUserInteractedWithImpression,
            TimingFromLastInteractionImpression,
            TopPopPerImpression,
            TopPopInteractionClickoutPerImpression,
            ChangeImpressionOrderPositionInSession,
            FrenzyFactorSession,
            DayOfWeekAndMomentInDay,
            LastClickoutFiltersSatisfaction,
            TimePerImpression,
            PersonalizedTopPop,
            PriceQuality,
            PlatformFeaturesSimilarity,
            LastActionBeforeClickout,
            ImpressionStarsNumeric,
            StepsBeforeLastClickout,
            LocationReferencePercentageOfClickouts,
            LocationReferencePercentageOfInteractions,
            NumTimesItemImpressed,
            PercClickPerImpressions,
            PlatformReferencePercentageOfClickouts,
            PlatformReferencePercentageOfInteractions,
            PlatformSession,
            User2ItemOld,
            LazyUser,
            PastFutureSessionFeatures,
            SessionSortOrderWhenClickout,
            SessionActionNumRefDiffFromImpressions,
            ActionsInvolvingImpressionSession,
            SessionNumClickouts,
        ]

    else:
        # Previously an unknown kind surfaced as an UnboundLocalError on
        # features_array at the merge_features call.
        raise ValueError(
            f"unknown kind {kind!r}; expected one of 'label', 'no_bias', "
            "'content', 'session_filters', 'impression_feature', 'kind2', "
            "'kind1'")

    # Optional score files (under scores/) merged in as extra columns.
    # All entries are currently disabled.
    scores_array = [
        # 'rnn_classifier.csv.gz',
        # 'rnn_no_bias_balanced.csv.gz',
        # 'scores_softmax_loss.csv.gz',
        # 'xgboost_impr_features.csv.gz',
        # 'rnn_GRU_2layers_64units_2dense_noclass0.csv.gz',
        # 'scores_pairwise_soft_zero_one_loss.csv.gz',
        # 'xgb_forte_700.csv.gz',
    ]

    train_df, test_df, train_idxs, _ = merge_features(
        mode, cluster, features_array, merge_kind='left', multithread=False)

    for path in scores_array:
        score = pd.read_csv('scores/{}'.format(path))

        # Scores are joined per item when the file has an item_id column,
        # otherwise per (user, session).
        if 'item_id' in score.columns:
            print('item_id found')
            keys = ['user_id', 'session_id', 'item_id']
        else:
            print('item_id not found')
            keys = ['user_id', 'session_id']

        cols = [c for c in score.columns if c in keys or 'score' in c]
        score = score[cols]
        # keep the last row for each key so the left merge adds no rows
        score = score.groupby(keys, as_index=False).last()
        train_df = train_df.merge(score, on=keys, how='left')
        test_df = test_df.merge(score, on=keys, how='left')
        print(f'train_shape: {train_df.shape}\n vali_shape: {test_df.shape}')

    # -1 values are turned into NaN in both frames before saving
    train_df = train_df.replace(-1, np.nan)
    test_df = test_df.replace(-1, np.nan)

    bp = 'dataset/preprocessed/{}/{}/xgboost/{}/'.format(cluster, mode, kind)
    check_folder(bp)

    # identifier/target columns that must not end up in the feature matrix
    drop_cols = ['index', 'user_id', 'session_id', 'item_id', 'label']

    if class_weights:
        weights = train_df[['user_id', 'session_id',
                            'weights']].drop_duplicates().weights.values
        print(len(weights))
        np.save(join(bp, 'class_weights'), weights)
        print('class weights saved')
        drop_cols.append('weights')

    for split, frame in (('train', train_df), ('test', test_df)):
        X = frame.drop(drop_cols, axis=1)
        if split == 'train':
            # log the feature column order of the saved matrix
            print(','.join(X.columns.values))
        X = X.to_sparse(fill_value=0)
        X = X.astype(np.float64)
        X = X.to_coo().tocsr()
        save_npz(join(bp, 'X_{}'.format(split)), X)
        print('X_{} saved'.format(split))

        user_session_item = frame[['user_id', 'session_id', 'item_id']]
        user_session_item.to_csv(
            join(bp, 'user_session_item_{}.csv'.format(split)), index=False)

        y = frame[['label']]
        y.to_csv(join(bp, 'y_{}.csv'.format(split)))
        print('y_{} saved'.format(split))

        group = create_groups(frame)
        print(len(group))
        np.save(join(bp, 'group_{}'.format(split)), group)
        print('{} groups saved'.format(split))

        # the row indices are only stored for the training split
        if split == 'train':
            np.save(join(bp, 'train_indices'), train_idxs)

        print('{} data completed'.format(split))
def create_dataset(mode, cluster, class_weights=False, weights_position=True, log_weights=True):
    """Build and persist an xgboost train/test dataset, with optional weights.

    The feature set name (``kind``) is read interactively from stdin.
    Output is written under
    ``dataset/preprocessed/<cluster>/<mode>/xgboost/<kind>/``:

    * ``train_df.csv``: the full merged training frame
    * ``X_train`` / ``X_test``: CSR feature matrices (via ``save_npz``)
    * ``user_session_item_{train,test}.csv``: the identifier columns
    * ``y_{train,test}.csv``: the ``label`` column
    * ``group_{train,test}``: the output of ``create_groups``
    * ``train_indices``: the third value returned by ``merge_features``
    * ``class_weights`` / ``weights_position`` / ``log_weights`` when the
      corresponding flag is true

    Args:
        mode: forwarded to ``merge_features`` and used in the output path.
        cluster: forwarded to ``merge_features`` and used in the output path.
        class_weights: when true, the merged frames must contain a
            ``weights`` column; it is saved and excluded from the features.
        weights_position: when true, save the result of
            ``create_weights_position(train_df, mode, cluster)``.
        log_weights: when true, save the result of
            ``create_log_weights(train_df)``.

    Raises:
        ValueError: if the entered kind is not one of the known feature sets
            (raised before anything is merged or written).

    NOTE(review): ``DataFrame.to_sparse`` only exists in pandas < 1.0, so
    this function needs an older pandas to run.
    """
    # training
    kind = input('insert the kind: ')

    # The original guard was `cluster == 'no_cluster' or True`, i.e. always
    # true, so the feature set depends on `kind` only.
    if kind == 'kind2':
        # (translated) this scores 0.6755 locally; adding
        # NormalizedPlatformFeaturesSimilarity and SessionNumClickouts
        # scores 0.67588
        # NOTE(review): this note looks stale for the two-feature list below.
        features_array = [
            ImpressionLabel,
            PastFutureSessionFeatures,
        ]
    elif kind == 'kind3':
        # (translated) this is the set dani uses on cat (presumably catboost)
        features_array = [
            ActionsInvolvingImpressionSession,
            ImpressionPositionSession,
            ImpressionPriceInfoSessionOld,
            ImpressionRatingNumeric,
            ImpressionLabel,
            LastActionInvolvingImpression,
            MeanPriceClickout,
            AvgPriceInteractions,
            SessionDevice,
            NumImpressionsInClickout,
            SessionLengthOld,
            TimesImpressionAppearedInClickoutsSession,
            TimesUserInteractedWithImpression,
            TimingFromLastInteractionImpression,
            TopPopPerImpression,
            TopPopInteractionClickoutPerImpression,
            ChangeImpressionOrderPositionInSession,
            FrenzyFactorSession,
            DayOfWeekAndMomentInDay,
            LastClickoutFiltersSatisfaction,
            TimePerImpression,
            PersonalizedTopPop,
            PriceQuality,
            PlatformFeaturesSimilarity,
            LastActionBeforeClickout,
            ImpressionStarsNumeric,
            StepsBeforeLastClickout,
            LocationReferencePercentageOfClickouts,
            LocationReferencePercentageOfInteractions,
            NumTimesItemImpressed,
            PercClickPerImpressions,
            PlatformReferencePercentageOfClickouts,
            PlatformReferencePercentageOfInteractions,
            PlatformSession,
            User2ItemOld,
            LazyUser,
            PastFutureSessionFeatures,
        ]
    elif kind == 'kind1':
        # (translated) this scores 0.6755 locally with the "magic" params and
        # without NormalizedPlatformFeaturesSimilarity and SessionNumClickouts
        # it scores 0.67566 with the following params:
        # learning_rate=0.1366 min_child_weight=1 n_estimators=499
        # max_depth=10 subsample=1 colsample_bytree=1 reg_lambda=4.22 reg_alpha=10.72
        # it scores 0.67588 when NormalizedPlatformFeaturesSimilarity and
        # SessionNumClickouts are added as well
        features_array = [
            ImpressionPositionSession,
            ImpressionPriceInfoSessionOld,
            ImpressionRatingNumeric,
            ImpressionLabel,
            LastActionInvolvingImpression,
            MeanPriceClickout,
            AvgPriceInteractions,
            SessionDevice,
            NumImpressionsInClickout,
            SessionLengthOld,
            TimesImpressionAppearedInClickoutsSession,
            TimesUserInteractedWithImpression,
            TimingFromLastInteractionImpression,
            TopPopPerImpression,
            TopPopInteractionClickoutPerImpression,
            ChangeImpressionOrderPositionInSession,
            FrenzyFactorSession,
            DayOfWeekAndMomentInDay,
            LastClickoutFiltersSatisfaction,
            TimePerImpression,
            PersonalizedTopPop,
            PriceQuality,
            PlatformFeaturesSimilarity,
            LastActionBeforeClickout,
            ImpressionStarsNumeric,
            StepsBeforeLastClickout,
            LocationReferencePercentageOfClickouts,
            LocationReferencePercentageOfInteractions,
            NumTimesItemImpressed,
            PercClickPerImpressions,
            PlatformReferencePercentageOfClickouts,
            PlatformReferencePercentageOfInteractions,
            PlatformSession,
            User2ItemOld,
            LazyUser,
            PastFutureSessionFeatures,
            SessionSortOrderWhenClickout,
            SessionActionNumRefDiffFromImpressions,
            ActionsInvolvingImpressionSession,
            SessionNumClickouts,
        ]
    else:
        # Previously an unknown kind surfaced as an UnboundLocalError on
        # features_array at the merge_features call.
        raise ValueError(
            f"unknown kind {kind!r}; expected one of 'kind1', 'kind2', 'kind3'")

    train_df, test_df, train_idxs, _ = merge_features(
        mode, cluster, features_array, merge_kind='left')

    # -1 values are turned into NaN in both frames before saving
    train_df = train_df.replace(-1, np.nan)
    test_df = test_df.replace(-1, np.nan)

    bp = 'dataset/preprocessed/{}/{}/xgboost/{}/'.format(cluster, mode, kind)
    check_folder(bp)
    train_df.to_csv(join(bp, 'train_df.csv'))

    # identifier/target columns that must not end up in the feature matrix
    drop_cols = ['index', 'user_id', 'session_id', 'item_id', 'label']

    if class_weights:
        weights = train_df[['user_id', 'session_id',
                            'weights']].drop_duplicates().weights.values
        print(len(weights))
        np.save(join(bp, 'class_weights'), weights)
        print('class weights saved')
        drop_cols.append('weights')

    # The feature frame is extracted before the weight helpers run, exactly
    # as in the original ordering, so they cannot affect its columns.
    X_train = train_df.drop(drop_cols, axis=1)

    if weights_position:
        weights = create_weights_position(train_df, mode, cluster)
        print(len(weights))
        np.save(join(bp, 'weights_position'), weights)
        print('weights_position saved')

    if log_weights:
        lg_w = create_log_weights(train_df)
        print(len(lg_w))
        np.save(join(bp, 'log_weights'), lg_w)
        print('log_weights saved')

    # log the feature column order of the saved matrix
    print(','.join(X_train.columns.values))
    X_train = X_train.to_sparse(fill_value=0)
    X_train = X_train.astype(np.float64)
    X_train = X_train.to_coo().tocsr()
    save_npz(join(bp, 'X_train'), X_train)
    print('X_train saved')

    user_session_item = train_df[['user_id', 'session_id', 'item_id']]
    user_session_item.to_csv(join(bp, 'user_session_item_train.csv'), index=False)

    y_train = train_df[['label']]
    y_train.to_csv(join(bp, 'y_train.csv'))
    print('y_train saved')

    group = create_groups(train_df)
    print(len(group))
    np.save(join(bp, 'group_train'), group)
    print('train groups saved')

    np.save(join(bp, 'train_indices'), train_idxs)

    print('train data completed')

    X_test = test_df.drop(drop_cols, axis=1)
    X_test = X_test.to_sparse(fill_value=0)
    X_test = X_test.astype(np.float64)
    X_test = X_test.to_coo().tocsr()
    save_npz(join(bp, 'X_test'), X_test)
    print('X_test saved')

    user_session_item = test_df[['user_id', 'session_id', 'item_id']]
    user_session_item.to_csv(join(bp, 'user_session_item_test.csv'), index=False)

    y_test = test_df[['label']]
    y_test.to_csv(join(bp, 'y_test.csv'))
    print('y_test saved')

    group = create_groups(test_df)
    print(len(group))
    np.save(join(bp, 'group_test'), group)

    print('test groups saved')

    print('test data completed')
def create_dataset(mode, cluster, class_weights=False):
    """Build and persist the xgboost dataset that stacks other models' scores.

    The feature set name (``kind``) is read interactively from stdin. Only
    ``cluster == 'no_cluster'`` together with ``kind == 'kind3'`` has a
    feature set defined. Output is written under
    ``dataset/preprocessed/<cluster>/<mode>/xgboost/<kind>/``:

    * ``X_train`` / ``X_test``: CSR feature matrices (via ``save_npz``)
    * ``user_session_item_{train,test}.csv``: the identifier columns
    * ``y_{train,test}.csv``: the ``label`` column
    * ``group_{train,test}``: the output of ``create_groups``
    * ``train_indices``: the third value returned by ``merge_features``
    * ``class_weights`` (only when ``class_weights`` is true)

    Args:
        mode: forwarded to ``merge_features`` and used in the output path.
        cluster: forwarded to ``merge_features`` and used in the output path.
        class_weights: when true, the merged frames must contain a
            ``weights`` column; it is saved and excluded from the features.

    Raises:
        ValueError: if no feature set is defined for the given cluster/kind
            (raised before anything is merged or written).

    NOTE(review): ``DataFrame.to_sparse`` only exists in pandas < 1.0, so
    this function needs an older pandas to run.
    """
    # training
    kind = input('insert the kind: ')

    if cluster == 'no_cluster' and kind == 'kind3':
        features_array = [
            (ImpressionPositionSession, False),
            ImpressionLabel,
            #TopPopPerImpression,
            PersonalizedTopPop,
            #LastActionBeforeClickout,
            (LazyUser, False),
            (ScoresXGBoostDanParameter, False),
            (ClassifierPiccio, False),
            (ScoresCatboost, False),
            (ScoresXGBoostAccomodation, False),
            (ScoresRNN, False),
            AdjustedPlatformReferencePercentageOfClickouts,
            AdjustedLocationReferencePercentageOfInteractions,
            AdjustedPercClickPerImpressions,
            AdjustedPlatformReferencePercentageOfInteractions,
            AdjustedLocationReferencePercentageOfClickouts,
            PercClickPerPos,
            RefPopAfterFirstPosition,
            SessionNumClickouts,
            SessionNumFilterSel,
            SessionNumInterItemImage,
            SessionNumNotNumeric,
        ]
    else:
        # Previously any other combination surfaced as an UnboundLocalError
        # on features_array at the merge_features call.
        raise ValueError(
            f"no feature set defined for cluster={cluster!r}, kind={kind!r}; "
            "only cluster='no_cluster' with kind='kind3' is supported")

    train_df, test_df, train_idxs, _ = merge_features(mode,
                                                      cluster,
                                                      features_array,
                                                      merge_kind='left')

    # -1 values are turned into NaN in both frames before saving
    train_df = train_df.replace(-1, np.nan)
    test_df = test_df.replace(-1, np.nan)

    bp = 'dataset/preprocessed/{}/{}/xgboost/{}/'.format(cluster, mode, kind)
    check_folder(bp)

    # identifier/target columns that must not end up in the feature matrix
    drop_cols = ['index', 'user_id', 'session_id', 'item_id', 'label']

    if class_weights:
        weights = train_df[['user_id', 'session_id',
                            'weights']].drop_duplicates().weights.values
        print(len(weights))
        np.save(join(bp, 'class_weights'), weights)
        print('class weights saved')
        drop_cols.append('weights')

    for split, frame in (('train', train_df), ('test', test_df)):
        X = frame.drop(drop_cols, axis=1)
        if split == 'train':
            # log the feature column order of the saved matrix
            print(','.join(X.columns.values))
        X = X.to_sparse(fill_value=0)
        X = X.astype(np.float64)
        X = X.to_coo().tocsr()
        save_npz(join(bp, 'X_{}'.format(split)), X)
        print('X_{} saved'.format(split))

        user_session_item = frame[['user_id', 'session_id', 'item_id']]
        user_session_item.to_csv(
            join(bp, 'user_session_item_{}.csv'.format(split)), index=False)

        y = frame[['label']]
        y.to_csv(join(bp, 'y_{}.csv'.format(split)))
        print('y_{} saved'.format(split))

        group = create_groups(frame)
        print(len(group))
        np.save(join(bp, 'group_{}'.format(split)), group)
        print('{} groups saved'.format(split))

        # the row indices are only stored for the training split
        if split == 'train':
            np.save(join(bp, 'train_indices'), train_idxs)

        print('{} data completed'.format(split))