Beispiel #1
0
def run_final_models(df, continuous_vars, categorical_vars, outcome_var, train_indicator=""):
    """Average three model predictions, write them to CSV and return them.

    The continuous/categorical, binary/categorical and SVM model runners are
    each called with the same arguments, in that order, and their outputs are
    averaged element-wise.

    Args:
        df: DataFrame passed to every model runner. Its index supplies the
            ``bidder_id`` column for rows where ``train_indicator`` is False.
        continuous_vars: Forwarded unchanged to each model runner.
        categorical_vars: Forwarded unchanged to each model runner.
        outcome_var: Forwarded unchanged to each model runner.
        train_indicator: Name of the column in ``df`` that flags training
            rows; also forwarded to each model runner.

    Returns:
        The element-wise mean of the three model outputs.

    Note:
        ``derived_data_path`` is not a parameter; it is read from the
        enclosing module scope and is concatenated directly with
        ``'predictions.csv'``.
    """
    model_args = (df, continuous_vars, categorical_vars, outcome_var, train_indicator)
    model_outputs = [
        run_continuous_categorical_models(*model_args),
        run_binary_categorical_models(*model_args),
        run_svm_prediction(*model_args),
    ]
    predictions = np.mean(model_outputs, axis=0)

    # Rows not flagged as training are the ones the predictions are paired with.
    held_out_ids = df.index[df[train_indicator] == False]
    predictions_df = pd.DataFrame({'bidder_id': held_out_ids,
                                   'prediction': predictions})
    predictions_df.to_csv(derived_data_path + 'predictions.csv', index=False)

    encode_predictions(derived_data_path, 'SoVeryClassy')

    return predictions
Beispiel #2
0
def facebook_modelling(derived_data_path):
    """Build the bidder-level modelling frame and run the model pipeline.

    Reads the ``train`` and ``test`` tables from ``data.h5`` and the
    ``bidder_level`` table from ``derived_data.h5``, inner-joins them on
    ``bidder_id``, derives a few features, and then passes the result to the
    random-forest and prediction helpers.

    Args:
        derived_data_path: Directory prefix holding the HDF5 files. It is
            concatenated directly with the file names, so it must already
            end with a path separator.

    Returns:
        None. Any output is produced by the helpers called at the end
        (``generate_predictions`` and ``encode_predictions``).
    """
    # Context managers make sure each store is closed even if a key is
    # missing or the join fails.
    with pd.HDFStore(derived_data_path + "data.h5") as store:
        train = store['train']
        test = store['test']

    # Set indices to merge on.
    train.set_index("bidder_id", inplace=True)
    test.set_index("bidder_id", inplace=True)

    # Merge everything. Only keeping inner (will need to later remerge on
    # the original files).
    with pd.HDFStore(derived_data_path + "derived_data.h5") as store:
        bidder = store['bidder_level'].join(pd.concat([train, test]), how="inner")

    # Indicator for whether a row is in the train or test dataset: rows with
    # a non-null outcome are flagged as train, all others as test.
    bidder['train'] = bidder['outcome'].notnull()
    bidder['test'] = ~bidder['train']

    # Replace the raw category values with integer codes.
    for var in ['most_freq_country', 'most_freq_merchandise']:
        bidder[var] = LabelEncoder().fit_transform(bidder[var])

    # Ratio features; the +1 in each denominator presumably guards against
    # division by zero -- confirm against how the counts are built.
    bidder['bids_per_auction'] = bidder['num_bids'] / (bidder['num_auctions'] + 1)
    bidder['bids_per_time'] = bidder['num_bids'] / (bidder['length_activity'] + 1)

    # Recode tiny groups: any level seen in fewer than 1% of the rows is
    # collapsed into the single code -1.
    threshold = len(bidder) / 100
    for var in ['most_freq_device', 'most_freq_merchandise']:
        level_counts = bidder[var].value_counts()
        rare_levels = level_counts[level_counts < threshold].index
        bidder.loc[bidder[var].isin(rare_levels), var] = -1

    # Assign the filled column back explicitly: an in-place fillna on a
    # column accessor does not reliably update the parent frame.
    for var in ['std_change_time', 'mean_change_time']:
        bidder[var] = bidder[var].fillna(-1)

    # TODO: feature ideas not yet implemented
    #   - a model for each time period?
    #   - // 10 for 1st-3rd ip
    #   - in period 1 / in period 2 / in period 3
    #   - off-peak or peak times
    #   - change time
    #   - dummy for more than one for all of the max

    continuous_vars = ['num_auctions',
                       'num_bids',
                       'num_url',
                       'num_country',
                       'num_device',
                       'num_merchandise',
                       'num_ip',
                       'num_first_bid',
                       'pct_first_bid',
                       'num_winner',
                       'pct_winner',
                       'num_outbid_self',
                       'num_outbid_self_diff_time',
                       'first_bid',
                       'last_bid',
                       'length_activity',
                       'num_periods',
                       'max_simul_actions',
                       'max_simul_auction',
                       'max_simul_country',
                       'max_simul_device',
                       'max_simul_ip',
                       'bids_per_auction',
                       'bids_per_time',
                       'most_common_auction_type',
                       'periods_seen',
                       'mean_change_time',
                       'std_change_time',
                       'std_pct_auction_bids',
                       'mean_pct_auction_bids',
                       'mean_num_bidders_auction',
                       'mean_num_bids_auction',
                       'mean_bids_timestamp',
                       'std_bids_timestamp']

    categorical_vars = ['most_freq_device',
                        'primary_period',
                        'most_freq_merchandise',
                        'most_freq_country']

    predictor_vars = continuous_vars + categorical_vars

    outcome_var = 'outcome'

    # NOTE(review): the split returned here is never used below; the call is
    # kept only in case test_train_data has side effects. Remove if it is pure.
    test_train_data(bidder[bidder['train']], continuous_vars, categorical_vars,
                    outcome_var, categorical='binary', seed=56, test_pct=0)

    run_random_forest_model(bidder[bidder['train']], continuous_vars, categorical_vars, outcome_var)
    # NOTE(review): this call's argument shape differs from the one above
    # (combined predictor list, then the outcome name, then 999). Confirm it
    # matches run_random_forest_model's signature.
    run_random_forest_model(bidder, predictor_vars, outcome_var, 999)

    generate_predictions(bidder, continuous_vars, categorical_vars, outcome_var, 'train', derived_data_path)
    encode_predictions(derived_data_path, 'that was unexpected binary gbm')