def create_dataset(mode, cluster):
    """Build and persist the stacking train/test data for ``cluster``/``mode``.

    The features listed in ``features_array`` are merged through
    ``merge_features`` (left merge), every -1 is replaced by NaN, and the
    following files are written under
    ``dataset/preprocessed/<cluster>/<mode>/stacking/``:

    * ``X_train`` / ``X_test``: feature matrices in CSR form (``save_npz``)
    * ``y_train.csv`` / ``y_test.csv``: the ``label`` column
    * ``group_train`` / ``group_test``: output of ``create_groups``
    * ``train_indices``: third value returned by ``merge_features``
    """
    features_array = [
        ImpressionLabel,
        ImpressionPositionSession,
        ScoresRNN,
        ScoresXGB,
    ]
    train_frame, test_frame, train_rows, _ = merge_features(
        mode, cluster, features_array, merge_kind='left')

    # -1 is turned into NaN in both splits before anything is written.
    train_frame = train_frame.replace(-1, np.nan)
    test_frame = test_frame.replace(-1, np.nan)

    out_dir = 'dataset/preprocessed/{}/{}/stacking/'.format(cluster, mode)
    check_folder(out_dir)

    # Identifier and target columns that must not end up in the matrices.
    non_feature_cols = ['index', 'user_id', 'session_id', 'item_id', 'label']

    def _export(frame, split):
        # Write the X / y / group files of one split ('train' or 'test').
        # NOTE(review): DataFrame.to_sparse is only available in
        # pandas < 1.0 -- confirm the pinned pandas version.
        features = frame.drop(non_feature_cols, axis=1)
        matrix = (features.to_sparse(fill_value=0)
                  .astype(np.float64)
                  .to_coo()
                  .tocsr())
        save_npz(join(out_dir, 'X_{}'.format(split)), matrix)
        print('X_{} saved'.format(split))

        frame[['label']].to_csv(join(out_dir, 'y_{}.csv'.format(split)))
        print('y_{} saved'.format(split))

        groups = create_groups(frame)
        print(len(groups))
        np.save(join(out_dir, 'group_{}'.format(split)), groups)
        print('{} groups saved'.format(split))

    _export(train_frame, 'train')
    np.save(join(out_dir, 'train_indices'), train_rows)
    print('train data completed')

    _export(test_frame, 'test')
    print('test data completed')
def create_dataset(mode, cluster):
    """Create the lightGBM input files for ``cluster``/``mode``.

    Output goes to ``../dataset/preprocessed/<cluster>/<mode>/lightGBM/``
    relative to this file's directory:

    * ``svmlight_train.txt`` is written only when it does not exist yet;
    * ``test.csv`` is written only when it does not exist yet;
    * ``svmlight_test.txt`` is (re)written on every call.
    """
    features_array = [
        ActionsInvolvingImpressionSession,
        ImpressionLabel,
        ImpressionPriceInfoSession,
        TimingFromLastInteractionImpression,
        TimesUserInteractedWithImpression,
        ImpressionPositionSession,
        LastInteractionInvolvingImpression,
        TimesImpressionAppearedInClickoutsSession,
        MeanPriceClickout,
        SessionLength,
        TimeFromLastActionBeforeClk,
        FrenzyFactorSession,
        PricePositionInfoInteractedReferences,
        SessionDevice,
        SessionFilterActiveWhenClickout,
        SessionSortOrderWhenClickout,
        ImpressionFeature,
    ]

    here = Path(__file__).absolute().parent
    data_dir = here / '..' / 'dataset/preprocessed/{}/{}/lightGBM/'.format(
        cluster, mode)
    print(data_dir)
    base = str(data_dir)
    check_folder(base)

    train_df, test_df = merge_features(mode, cluster, features_array)

    # Training file: never overwritten once it exists.
    train_svm = base + '/svmlight_train.txt'
    if not os.path.isfile(train_svm):
        to_queries_dataset(train_df, path=train_svm)
    else:
        print('Train File già presente')

    # Test csv: written once; the svmlight test file is always regenerated.
    test_csv = base + '/test.csv'
    if os.path.isfile(test_csv):
        print('Test File già presente')
    else:
        test_df.to_csv(test_csv, index=False)
    to_queries_dataset(test_df, path=base + '/svmlight_test.txt')
def create_dataset(mode, cluster):
    """Create the catboost ``train.csv`` / ``test.csv`` for ``cluster``/``mode``.

    The features in ``features_array`` are merged with a left merge, without
    one-hot encoding and with ``create_not_existing_features=True``; missing
    values are filled with -1 and both frames are written (without the index)
    to ``../dataset/preprocessed/<cluster>/<mode>/catboost/`` relative to this
    file's directory.
    """
    features_array = [
        ImpressionPositionSession,
        ImpressionPriceInfoSessionOld,
        ImpressionRatingNumeric,
        ImpressionLabel,
        LastActionInvolvingImpression,
        MeanPriceClickout,
        AvgPriceInteractions,
        SessionDevice,
        NumImpressionsInClickout,
        SessionLengthOld,
        TimesImpressionAppearedInClickoutsSession,
        TimesUserInteractedWithImpression,
        TimingFromLastInteractionImpression,
        TopPopPerImpression,
        TopPopInteractionClickoutPerImpression,
        ChangeImpressionOrderPositionInSession,
        FrenzyFactorSession,
        DayOfWeekAndMomentInDay,
        LastClickoutFiltersSatisfaction,
        TimePerImpression,
        PersonalizedTopPop,
        PriceQuality,
        PlatformFeaturesSimilarity,
        LastActionBeforeClickout,
        ImpressionStarsNumeric,
        StepsBeforeLastClickout,
        LocationReferencePercentageOfClickouts,
        LocationReferencePercentageOfInteractions,
        NumTimesItemImpressed,
        PercClickPerImpressions,
        PlatformReferencePercentageOfClickouts,
        PlatformReferencePercentageOfInteractions,
        PlatformSession,
        User2ItemOld,
        LazyUser,
        PastFutureSessionFeatures,
        SessionSortOrderWhenClickout,
        SessionActionNumRefDiffFromImpressions,
        ActionsInvolvingImpressionSession,
        SessionNumClickouts,
    ]

    here = Path(__file__).absolute().parent
    data_dir = here / '..' / 'dataset/preprocessed/{}/{}/catboost/'.format(
        cluster, mode)
    print(data_dir)
    out = str(data_dir)
    check_folder(out)

    # Only the two frames are needed; the remaining return values are ignored.
    raw_train, raw_test, _, __ = merge_features(
        mode,
        cluster,
        features_array,
        merge_kind='left',
        onehot=False,
        create_not_existing_features=True,
    )

    # Missing values are encoded as -1 in the exported csv files.
    filled_train = raw_train.fillna(-1)
    filled_test = raw_test.fillna(-1)

    filled_train.to_csv(out + '/train.csv', index=False)
    print('Train saved')
    filled_test.to_csv(out + '/test.csv', index=False)
def create_dataset(mode, cluster, class_weights=False):
    """Build and persist the XGBoost train/test data for one feature set.

    The feature set ("kind") is read interactively from stdin. The selected
    features are merged through ``merge_features`` (left merge, single
    thread), optional score files listed in ``scores_array`` are joined in,
    every -1 is replaced by NaN, and the following files are written under
    ``dataset/preprocessed/<cluster>/<mode>/xgboost/<kind>/``:

    * ``class_weights`` (only when ``class_weights`` is true)
    * ``X_<split>``: feature matrix in CSR form (``save_npz``)
    * ``user_session_item_<split>.csv``, ``y_<split>.csv``, ``group_<split>``
    * ``train_indices``: third value returned by ``merge_features``

    Args:
        mode: forwarded to ``merge_features`` and used in the output path.
        cluster: forwarded to ``merge_features`` and used in the output path.
        class_weights: when true, the merged frames must contain a
            ``weights`` column; its per-(user_id, session_id) de-duplicated
            values are saved and the column is excluded from the matrices.

    Raises:
        ValueError: if the kind typed by the user is not a known feature set
            (previously this surfaced later as an ``UnboundLocalError``).
    """
    kind = input('insert the kind: ')

    # Exactly one feature set is selected; an unknown kind fails fast instead
    # of leaving ``features_array`` unbound.
    if kind == 'label':
        features_array = [ImpressionLabel]
    elif kind == 'no_bias':
        features_array = [
            PlatformSession,
            PlatformFeaturesSimilarity,
            AvgPriceInteractions,
            ChangeImpressionOrderPositionInSession,
            DayOfWeekAndMomentInDay,
            FrenzyFactorSession,
            ImpressionPositionSession,
            ImpressionPriceInfoSessionOld,
            ImpressionRatingNumeric,
            ActionsInvolvingImpressionSession,
            SessionNumClickouts,
            ImpressionStarsNumeric,
            ImpressionLabel,
            LastActionInvolvingImpression,
            LastActionBeforeClickout,
            TimesImpressionAppearedInClickoutsSession,
            LastClickoutFiltersSatisfaction,
            StepsBeforeLastClickout,
            LazyUser,
            MeanPriceClickout,
            NumImpressionsInClickout,
            SessionLengthOld,
            NumTimesItemImpressed,
            PercClickPerImpressions,
            PlatformReferencePercentageOfClickouts,
            PriceQuality,
            SessionDevice,
            SessionSortOrderWhenClickout,
            TimePerImpression,
            TimesUserInteractedWithImpression,
            TimingFromLastInteractionImpression,
            TopPopInteractionClickoutPerImpression,
            TopPopPerImpression,
            User2ItemOld,
        ]
    elif kind == 'content':
        features_array = [
            ImpressionPriceInfoSessionOld,
            ImpressionRatingNumeric,
            ImpressionLabel,
            MeanPriceClickout,
            SessionDevice,
            NumImpressionsInClickout,
            SessionLengthOld,
            TopPopPerImpression,
            TopPopInteractionClickoutPerImpression,
            ChangeImpressionOrderPositionInSession,
            FrenzyFactorSession,
            DayOfWeekAndMomentInDay,
            LastClickoutFiltersSatisfaction,
            PersonalizedTopPop,
            PriceQuality,
            PlatformFeaturesSimilarity,
            LastActionBeforeClickout,
            ImpressionStarsNumeric,
            StepsBeforeLastClickout,
            LocationReferencePercentageOfClickouts,
            LocationReferencePercentageOfInteractions,
            NumTimesItemImpressed,
            PercClickPerImpressions,
            PlatformReferencePercentageOfClickouts,
            PlatformReferencePercentageOfInteractions,
            User2ItemOld,
            SessionSortOrderWhenClickout,
            SessionActionNumRefDiffFromImpressions,
            SessionNumClickouts,
        ]
    elif kind == 'session_filters':
        features_array = [SessionFilterActiveWhenClickout, ImpressionLabel]
    elif kind == 'impression_feature':
        features_array = [ImpressionFeatureCleaned, ImpressionLabel]
    elif kind == 'kind2':
        features_array = [
            (LazyUser, False),
            PriceQuality,
            PlatformFeaturesSimilarity,
            PersonalizedTopPop,
            TimePerImpression,
            DayOfWeekAndMomentInDay,
            LastClickoutFiltersSatisfaction,
            FrenzyFactorSession,
            ChangeImpressionOrderPositionInSession,
            User2Item,
            PlatformSession,
            PlatformReferencePercentageOfInteractions,
            PercClickPerImpressions,
            PlatformReferencePercentageOfClickouts,
            NumImpressionsInClickout,
            NumTimesItemImpressed,
            LocationReferencePercentageOfClickouts,
            LocationReferencePercentageOfInteractions,
            StepsBeforeLastClickout,
            ImpressionStarsNumeric,
            LastActionBeforeClickout,
            TopPopPerImpression,
            TopPopInteractionClickoutPerImpression,
            ImpressionRatingNumeric,
            ActionsInvolvingImpressionSession,
            ImpressionLabel,
            ImpressionPriceInfoSession,
            TimingFromLastInteractionImpression,
            TimesUserInteractedWithImpression,
            ImpressionPositionSession,
            LastActionInvolvingImpression,
            SessionDevice,
            SessionSortOrderWhenClickout,
            MeanPriceClickout,
            PriceInfoSession,
            SessionLength,
            TimesImpressionAppearedInClickoutsSession,
        ]
    elif kind == 'kind1':
        features_array = [
            ImpressionPositionSession,
            ImpressionPriceInfoSessionOld,
            ImpressionRatingNumeric,
            ImpressionLabel,
            LastActionInvolvingImpression,
            MeanPriceClickout,
            AvgPriceInteractions,
            SessionDevice,
            NumImpressionsInClickout,
            SessionLengthOld,
            TimesImpressionAppearedInClickoutsSession,
            TimesUserInteractedWithImpression,
            TimingFromLastInteractionImpression,
            TopPopPerImpression,
            TopPopInteractionClickoutPerImpression,
            ChangeImpressionOrderPositionInSession,
            FrenzyFactorSession,
            DayOfWeekAndMomentInDay,
            LastClickoutFiltersSatisfaction,
            TimePerImpression,
            PersonalizedTopPop,
            PriceQuality,
            PlatformFeaturesSimilarity,
            LastActionBeforeClickout,
            ImpressionStarsNumeric,
            StepsBeforeLastClickout,
            LocationReferencePercentageOfClickouts,
            LocationReferencePercentageOfInteractions,
            NumTimesItemImpressed,
            PercClickPerImpressions,
            PlatformReferencePercentageOfClickouts,
            PlatformReferencePercentageOfInteractions,
            PlatformSession,
            User2ItemOld,
            LazyUser,
            PastFutureSessionFeatures,
            SessionSortOrderWhenClickout,
            SessionActionNumRefDiffFromImpressions,
            ActionsInvolvingImpressionSession,
            SessionNumClickouts,
        ]
    else:
        raise ValueError(
            f'unknown kind {kind!r}; expected one of: label, no_bias, '
            'content, session_filters, impression_feature, kind2, kind1')

    # Score files (read from 'scores/<name>') to join as extra columns.
    # All entries are currently disabled.
    scores_array = [
        # 'rnn_classifier.csv.gz',
        # 'rnn_no_bias_balanced.csv.gz',
        # 'scores_softmax_loss.csv.gz',
        # 'xgboost_impr_features.csv.gz',
        # 'rnn_GRU_2layers_64units_2dense_noclass0.csv.gz',
        # 'scores_pairwise_soft_zero_one_loss.csv.gz',
        # 'xgb_forte_700.csv.gz',
    ]

    train_df, test_df, train_idxs, _ = merge_features(
        mode, cluster, features_array, merge_kind='left', multithread=False)

    for path in scores_array:
        score = pd.read_csv('scores/{}'.format(path))
        # Scores are joined per item when the file has an item_id column,
        # per session otherwise.
        keys = ['user_id', 'session_id']
        if 'item_id' in score.columns:
            print('item_id found')
            keys.append('item_id')
        else:
            print('item_id not found')
        cols = [c for c in score.columns if c in keys or 'score' in c]
        # Keep the last row for each key so the merge cannot duplicate rows.
        score = score[cols].groupby(keys, as_index=False).last()
        train_df = train_df.merge(score, on=keys, how='left')
        test_df = test_df.merge(score, on=keys, how='left')
        print(f'train_shape: {train_df.shape}\n vali_shape: {test_df.shape}')

    train_df = train_df.replace(-1, np.nan)
    test_df = test_df.replace(-1, np.nan)

    bp = 'dataset/preprocessed/{}/{}/xgboost/{}/'.format(cluster, mode, kind)
    check_folder(bp)

    # Identifier/target columns that must not end up in the feature matrices.
    dropped = ['index', 'user_id', 'session_id', 'item_id', 'label']
    if class_weights:
        weights = train_df[['user_id', 'session_id',
                            'weights']].drop_duplicates().weights.values
        print(len(weights))
        np.save(join(bp, 'class_weights'), weights)
        print('class weights saved')
        dropped.append('weights')

    def _export(frame, split, show_columns=False):
        # Write the X / ids / y / group files of one split ('train'/'test').
        features = frame.drop(dropped, axis=1)
        if show_columns:
            print(','.join(features.columns.values))
        # NOTE(review): DataFrame.to_sparse is only available in
        # pandas < 1.0 -- confirm the pinned pandas version.
        matrix = (features.to_sparse(fill_value=0)
                  .astype(np.float64)
                  .to_coo()
                  .tocsr())
        save_npz(join(bp, 'X_{}'.format(split)), matrix)
        print('X_{} saved'.format(split))

        frame[['user_id', 'session_id', 'item_id']].to_csv(
            join(bp, 'user_session_item_{}.csv'.format(split)), index=False)

        frame[['label']].to_csv(join(bp, 'y_{}.csv'.format(split)))
        print('y_{} saved'.format(split))

        groups = create_groups(frame)
        print(len(groups))
        np.save(join(bp, 'group_{}'.format(split)), groups)
        print('{} groups saved'.format(split))

    _export(train_df, 'train', show_columns=True)
    np.save(join(bp, 'train_indices'), train_idxs)
    print('train data completed')

    _export(test_df, 'test')
    print('test data completed')
def create_dataset(mode, cluster, class_weights=False, weights_position=True, log_weights=True):
    """Build and persist the XGBoost train/test data, plus sample weights.

    The feature set ("kind") is read interactively from stdin; the same
    feature sets are used for every cluster. The selected features are merged
    through ``merge_features`` (left merge), every -1 is replaced by NaN, and
    the following files are written under
    ``dataset/preprocessed/<cluster>/<mode>/xgboost/<kind>/``:

    * ``train_df.csv``: the full merged training frame
    * ``class_weights`` (only when ``class_weights`` is true)
    * ``weights_position`` (only when ``weights_position`` is true)
    * ``log_weights`` (only when ``log_weights`` is true)
    * ``X_<split>``: feature matrix in CSR form (``save_npz``)
    * ``user_session_item_<split>.csv``, ``y_<split>.csv``, ``group_<split>``
    * ``train_indices``: third value returned by ``merge_features``

    Args:
        mode: forwarded to ``merge_features`` and used in the output path.
        cluster: forwarded to ``merge_features`` and used in the output path.
        class_weights: when true, the merged frames must contain a
            ``weights`` column; its per-(user_id, session_id) de-duplicated
            values are saved and the column is excluded from the matrices.
        weights_position: when true, save the result of
            ``create_weights_position(train_df, mode, cluster)``.
        log_weights: when true, save the result of
            ``create_log_weights(train_df)``.

    Raises:
        ValueError: if the kind typed by the user is not a known feature set
            (previously this surfaced later as an ``UnboundLocalError``).
    """
    kind = input('insert the kind: ')

    # The original guard was `cluster == 'no_cluster' or True`, i.e. always
    # true, so the selection below applies to every cluster.
    if kind == 'kind2':
        # NOTE(review): the original note on this set reported 0.6755 locally
        # (0.67588 when NormalizedPlatformFeaturesSimilarity and
        # SessionNumClickouts are added); it may refer to an earlier, larger
        # feature list -- confirm.
        features_array = [
            ImpressionLabel,
            PastFutureSessionFeatures,
        ]
    elif kind == 'kind3':
        # This is the set dani uses on catboost.
        features_array = [
            ActionsInvolvingImpressionSession,
            ImpressionPositionSession,
            ImpressionPriceInfoSessionOld,
            ImpressionRatingNumeric,
            ImpressionLabel,
            LastActionInvolvingImpression,
            MeanPriceClickout,
            AvgPriceInteractions,
            SessionDevice,
            NumImpressionsInClickout,
            SessionLengthOld,
            TimesImpressionAppearedInClickoutsSession,
            TimesUserInteractedWithImpression,
            TimingFromLastInteractionImpression,
            TopPopPerImpression,
            TopPopInteractionClickoutPerImpression,
            ChangeImpressionOrderPositionInSession,
            FrenzyFactorSession,
            DayOfWeekAndMomentInDay,
            LastClickoutFiltersSatisfaction,
            TimePerImpression,
            PersonalizedTopPop,
            PriceQuality,
            PlatformFeaturesSimilarity,
            LastActionBeforeClickout,
            ImpressionStarsNumeric,
            StepsBeforeLastClickout,
            LocationReferencePercentageOfClickouts,
            LocationReferencePercentageOfInteractions,
            NumTimesItemImpressed,
            PercClickPerImpressions,
            PlatformReferencePercentageOfClickouts,
            PlatformReferencePercentageOfInteractions,
            PlatformSession,
            User2ItemOld,
            LazyUser,
            PastFutureSessionFeatures,
        ]
    elif kind == 'kind1':
        # Scores 0.6755 locally with the "magic" params and without
        # NormalizedPlatformFeaturesSimilarity and SessionNumClickouts.
        # Scores 0.67566 with the following params:
        #   learning_rate=0.1366 min_child_weight=1 n_estimators=499
        #   max_depth=10 subsample=1 colsample_bytree=1 reg_lambda=4.22
        #   reg_alpha=10.72
        # Scores 0.67588 when NormalizedPlatformFeaturesSimilarity and
        # SessionNumClickouts are included as well.
        features_array = [
            ImpressionPositionSession,
            ImpressionPriceInfoSessionOld,
            ImpressionRatingNumeric,
            ImpressionLabel,
            LastActionInvolvingImpression,
            MeanPriceClickout,
            AvgPriceInteractions,
            SessionDevice,
            NumImpressionsInClickout,
            SessionLengthOld,
            TimesImpressionAppearedInClickoutsSession,
            TimesUserInteractedWithImpression,
            TimingFromLastInteractionImpression,
            TopPopPerImpression,
            TopPopInteractionClickoutPerImpression,
            ChangeImpressionOrderPositionInSession,
            FrenzyFactorSession,
            DayOfWeekAndMomentInDay,
            LastClickoutFiltersSatisfaction,
            TimePerImpression,
            PersonalizedTopPop,
            PriceQuality,
            PlatformFeaturesSimilarity,
            LastActionBeforeClickout,
            ImpressionStarsNumeric,
            StepsBeforeLastClickout,
            LocationReferencePercentageOfClickouts,
            LocationReferencePercentageOfInteractions,
            NumTimesItemImpressed,
            PercClickPerImpressions,
            PlatformReferencePercentageOfClickouts,
            PlatformReferencePercentageOfInteractions,
            PlatformSession,
            User2ItemOld,
            LazyUser,
            PastFutureSessionFeatures,
            SessionSortOrderWhenClickout,
            SessionActionNumRefDiffFromImpressions,
            ActionsInvolvingImpressionSession,
            SessionNumClickouts,
        ]
    else:
        raise ValueError(
            'unknown kind {!r}; expected one of: kind1, kind2, kind3'.format(
                kind))

    train_df, test_df, train_idxs, _ = merge_features(
        mode, cluster, features_array, merge_kind='left')

    train_df = train_df.replace(-1, np.nan)
    test_df = test_df.replace(-1, np.nan)

    bp = 'dataset/preprocessed/{}/{}/xgboost/{}/'.format(cluster, mode, kind)
    check_folder(bp)

    train_df.to_csv(join(bp, 'train_df.csv'))

    # Identifier/target columns that must not end up in the feature matrices.
    dropped = ['index', 'user_id', 'session_id', 'item_id', 'label']
    if class_weights:
        weights = train_df[['user_id', 'session_id',
                            'weights']].drop_duplicates().weights.values
        print(len(weights))
        np.save(join(bp, 'class_weights'), weights)
        print('class weights saved')
        dropped.append('weights')

    if weights_position:
        weights = create_weights_position(train_df, mode, cluster)
        print(len(weights))
        np.save(join(bp, 'weights_position'), weights)
        print('weights_position saved')

    if log_weights:
        lg_w = create_log_weights(train_df)
        print(len(lg_w))
        np.save(join(bp, 'log_weights'), lg_w)
        print('log_weights saved')

    def _export(frame, split, show_columns=False):
        # Write the X / ids / y / group files of one split ('train'/'test').
        features = frame.drop(dropped, axis=1)
        if show_columns:
            print(','.join(features.columns.values))
        # NOTE(review): DataFrame.to_sparse is only available in
        # pandas < 1.0 -- confirm the pinned pandas version.
        matrix = (features.to_sparse(fill_value=0)
                  .astype(np.float64)
                  .to_coo()
                  .tocsr())
        save_npz(join(bp, 'X_{}'.format(split)), matrix)
        print('X_{} saved'.format(split))

        frame[['user_id', 'session_id', 'item_id']].to_csv(
            join(bp, 'user_session_item_{}.csv'.format(split)), index=False)

        frame[['label']].to_csv(join(bp, 'y_{}.csv'.format(split)))
        print('y_{} saved'.format(split))

        groups = create_groups(frame)
        print(len(groups))
        np.save(join(bp, 'group_{}'.format(split)), groups)
        print('{} groups saved'.format(split))

    _export(train_df, 'train', show_columns=True)
    np.save(join(bp, 'train_indices'), train_idxs)
    print('train data completed')

    _export(test_df, 'test')
    print('test data completed')
def create_dataset(mode, cluster, class_weights=False):
    """Build and persist the XGBoost stacking data (``kind3``, no cluster).

    The feature set ("kind") is read interactively from stdin. Only the
    combination ``cluster == 'no_cluster'`` and kind ``'kind3'`` is defined.
    The features are merged through ``merge_features`` (left merge), every -1
    is replaced by NaN, and the following files are written under
    ``dataset/preprocessed/<cluster>/<mode>/xgboost/<kind>/``:

    * ``class_weights`` (only when ``class_weights`` is true)
    * ``X_<split>``: feature matrix in CSR form (``save_npz``)
    * ``user_session_item_<split>.csv``, ``y_<split>.csv``, ``group_<split>``
    * ``train_indices``: third value returned by ``merge_features``

    Args:
        mode: forwarded to ``merge_features`` and used in the output path.
        cluster: must be ``'no_cluster'``; also used in the output path.
        class_weights: when true, the merged frames must contain a
            ``weights`` column; its per-(user_id, session_id) de-duplicated
            values are saved and the column is excluded from the matrices.

    Raises:
        ValueError: if the cluster/kind combination has no feature set
            (previously this surfaced later as an ``UnboundLocalError``).
    """
    kind = input('insert the kind: ')

    # Fail fast: no other cluster/kind combination defines a feature set.
    if cluster != 'no_cluster' or kind != 'kind3':
        raise ValueError(
            "no feature set for cluster {!r} and kind {!r}; only cluster "
            "'no_cluster' with kind 'kind3' is supported".format(
                cluster, kind))

    # Entries written as (Feature, False) are passed through to
    # merge_features as tuples. Commented-out entries are disabled features.
    features_array = [
        (ImpressionPositionSession, False),
        ImpressionLabel,
        # TopPopPerImpression,
        PersonalizedTopPop,
        # LastActionBeforeClickout,
        (LazyUser, False),
        (ScoresXGBoostDanParameter, False),
        (ClassifierPiccio, False),
        (ScoresCatboost, False),
        (ScoresXGBoostAccomodation, False),
        (ScoresRNN, False),
        AdjustedPlatformReferencePercentageOfClickouts,
        AdjustedLocationReferencePercentageOfInteractions,
        AdjustedPercClickPerImpressions,
        AdjustedPlatformReferencePercentageOfInteractions,
        AdjustedLocationReferencePercentageOfClickouts,
        PercClickPerPos,
        RefPopAfterFirstPosition,
        SessionNumClickouts,
        SessionNumFilterSel,
        SessionNumInterItemImage,
        SessionNumNotNumeric,
    ]

    train_df, test_df, train_idxs, _ = merge_features(
        mode, cluster, features_array, merge_kind='left')

    train_df = train_df.replace(-1, np.nan)
    test_df = test_df.replace(-1, np.nan)

    bp = 'dataset/preprocessed/{}/{}/xgboost/{}/'.format(cluster, mode, kind)
    check_folder(bp)

    # Identifier/target columns that must not end up in the feature matrices.
    dropped = ['index', 'user_id', 'session_id', 'item_id', 'label']
    if class_weights:
        weights = train_df[['user_id', 'session_id',
                            'weights']].drop_duplicates().weights.values
        print(len(weights))
        np.save(join(bp, 'class_weights'), weights)
        print('class weights saved')
        dropped.append('weights')

    def _export(frame, split, show_columns=False):
        # Write the X / ids / y / group files of one split ('train'/'test').
        features = frame.drop(dropped, axis=1)
        if show_columns:
            print(','.join(features.columns.values))
        # NOTE(review): DataFrame.to_sparse is only available in
        # pandas < 1.0 -- confirm the pinned pandas version.
        matrix = (features.to_sparse(fill_value=0)
                  .astype(np.float64)
                  .to_coo()
                  .tocsr())
        save_npz(join(bp, 'X_{}'.format(split)), matrix)
        print('X_{} saved'.format(split))

        frame[['user_id', 'session_id', 'item_id']].to_csv(
            join(bp, 'user_session_item_{}.csv'.format(split)), index=False)

        frame[['label']].to_csv(join(bp, 'y_{}.csv'.format(split)))
        print('y_{} saved'.format(split))

        groups = create_groups(frame)
        print(len(groups))
        np.save(join(bp, 'group_{}'.format(split)), groups)
        print('{} groups saved'.format(split))

    _export(train_df, 'train', show_columns=True)
    np.save(join(bp, 'train_indices'), train_idxs)
    print('train data completed')

    _export(test_df, 'test')
    print('test data completed')