Ejemplo n.º 1
0
                                            'F3': 'Counts',
                                            'F6': 'Sequent Clicks',
                                            'F7': 'Time Difference'
                                        })
    else:
        what_to_buy = extract_what_to_buy(clicks_grouped_by_session_id)
        features_to_csv(file_what_to_buy_features, what_to_buy.values())

    # Buy-or-not features are cached on disk: load them when the cache file
    # exists, otherwise compute them from the grouped clicks and the
    # what-to-buy features and save them for the next run.
    # NOTE(review): assuming np is numpy, np.save appends ".npy" when the
    # target name lacks that extension, so this isfile() check can only hit
    # if file_buy_or_not_features already ends in ".npy" -- confirm.
    if os.path.isfile(file_buy_or_not_features):
        buy_or_not = np.load(file_buy_or_not_features)
    else:
        buy_or_not = extract_buy_or_not(clicks_grouped_by_session_id,
                                        what_to_buy)
        np.save(file_buy_or_not_features, buy_or_not)

    # Read the buys, keep only the session-ID keys of the grouping, and derive
    # the training targets from the click-session keys and buy-session keys.
    # Presumably one label per click session -- verify against extract_buys.
    buys = read_buys(file_buys, usecols=effective_columns_names)
    _, buys_grouped_by_session_id_keys = df_group_by_session_id(buys)
    buys_result = extract_buys(clicks_grouped_by_session_id_keys,
                               buys_grouped_by_session_id_keys)

    # Hold out 20% (test_size=0.2) of features and targets for validation.
    # NOTE(review): no random_state is passed; if this is scikit-learn's
    # train_test_split the split changes from run to run -- confirm intended.
    buy_or_not_train, buy_or_not_val, buys_result_train, buys_result_val = train_test_split(
        buy_or_not, buys_result, test_size=0.2)

    # Fit the classifier with fixed hyperparameters on the training split.
    classifier = XGBClassifier(n_estimators=500,
                               subsample=0.8,
                               colsample_bytree=0.5,
                               max_depth=4,
                               min_child_weight=3)
    classifier.fit(buy_or_not_train, buys_result_train)

    # Predict on the held-out split.
    predictions_val = classifier.predict(buy_or_not_val)
Ejemplo n.º 2
0
    # Group the clicks by session ID; the helper returns both the grouping and
    # its keys, and both are used below.
    clicks_grouped_by_session_id, clicks_grouped_by_session_id_keys = df_group_by_session_id(clicks)

    # What-to-buy features are cached as CSV: read them back when the file
    # exists, otherwise extract them from the grouped clicks and write them out.
    # Only .values() is written, so what_to_buy is presumably dict-like --
    # verify against extract_what_to_buy / features_from_csv.
    if os.path.isfile(file_what_to_buy_features):
        what_to_buy = features_from_csv(file_what_to_buy_features, ['Session ID', 'Item ID'],
                                        {'F3': 'Counts', 'F6': 'Sequent Clicks', 'F7': 'Time Difference'})
    else:
        what_to_buy = extract_what_to_buy(clicks_grouped_by_session_id)
        features_to_csv(file_what_to_buy_features, what_to_buy.values())

    # Buy-or-not features are cached the same way, via np.load / np.save.
    # NOTE(review): assuming np is numpy, np.save appends ".npy" when the
    # target name lacks that extension, so this isfile() check can only hit
    # if file_buy_or_not_features already ends in ".npy" -- confirm.
    if os.path.isfile(file_buy_or_not_features):
        buy_or_not = np.load(file_buy_or_not_features)
    else:
        buy_or_not = extract_buy_or_not(clicks_grouped_by_session_id, what_to_buy)
        np.save(file_buy_or_not_features, buy_or_not)

    # Read the buys, keep only the session-ID keys of the grouping, and derive
    # the training targets from the click-session keys and buy-session keys.
    # Presumably one label per click session -- verify against extract_buys.
    buys = read_buys(file_buys, usecols=effective_columns_names)
    _, buys_grouped_by_session_id_keys = df_group_by_session_id(buys)
    buys_result = extract_buys(clicks_grouped_by_session_id_keys, buys_grouped_by_session_id_keys)

    # Hold out 20% (test_size=0.2) of features and targets for validation.
    # NOTE(review): no random_state is passed; if this is scikit-learn's
    # train_test_split the split changes from run to run -- confirm intended.
    buy_or_not_train, buy_or_not_val, buys_result_train, buys_result_val = train_test_split(buy_or_not, buys_result,
                                                                                            test_size=0.2)

    # Fit the classifier with fixed hyperparameters on the training split.
    classifier = XGBClassifier(n_estimators=500, subsample=0.8, colsample_bytree=0.5, max_depth=4, min_child_weight=3)
    classifier.fit(buy_or_not_train, buys_result_train)

    # Score the held-out predictions against the held-out targets and write
    # the resulting scores to file_scores.
    predictions_val = classifier.predict(buy_or_not_val)
    scores = metrics(buys_result_val, predictions_val)
    write_metrics(file_scores, scores)

    # Load the test clicks and group them by session ID; the keys returned by
    # the grouping helper are discarded here.
    test = read_clicks(file_test, usecols=effective_columns_names)
    test_grouped_by_session_id, _ = df_group_by_session_id(test)
Ejemplo n.º 3
0
if __name__ == "__main__":
    # Script entry point: read a clicks file and a buys file from one
    # directory, slice them with slice_data(), and write the results next to
    # the originals as "<name>-<frac><ext>".
    #
    # Usage: <script> /dir/with/data clicks_file buys_file frac
    #   frac <= 1 is passed to slice_data() as a float; frac > 1 is truncated
    #   to an int first.
    #
    # Arguments are validated with explicit checks rather than `assert`:
    # assertions are removed under `python -O`, which would let a malformed
    # command line through to an IndexError or a meaningless slice.
    if len(sys.argv) != 5:
        sys.exit(r"Incorrect argument list, names also shouldn't contain spaces" +
                 "\nusage: argv[0] /dir/with/data clicks_file buys_file frac")

    path = sys.argv[1]
    file_clicks_basename = sys.argv[2]
    file_buys_basename = sys.argv[3]
    char_frac = sys.argv[4]
    try:
        float_frac = float(char_frac)
    except ValueError:
        # A non-numeric frac gets the same message as a non-positive one
        # instead of a raw traceback.
        sys.exit('Frac must be a positive number')

    # Written as "not ... > 0" so that NaN is rejected as well.
    if not float_frac > 0:
        sys.exit('Frac must be a positive number')

    frac = float_frac if float_frac <= 1 else int(float_frac)
    # frac == 1 exits without writing anything. Values strictly between 1 and
    # 2 truncate to 1 above and therefore also end here.
    if frac == 1:
        # sys.exit() rather than the bare exit(): the latter is injected by
        # the `site` module for interactive use and is not always available.
        sys.exit(0)

    # Output names carry the frac exactly as typed on the command line,
    # inserted between the base name and its extension.
    file_clicks_parts = os.path.splitext(file_clicks_basename)
    file_buys_parts = os.path.splitext(file_buys_basename)
    file_clicks_sliced_basename = file_clicks_parts[0] + '-' + char_frac + file_clicks_parts[1]
    file_buys_sliced_basename = file_buys_parts[0] + '-' + char_frac + file_buys_parts[1]
    file_clicks = os.path.join(path, file_clicks_basename)
    file_buys = os.path.join(path, file_buys_basename)
    file_clicks_sliced = os.path.join(path, file_clicks_sliced_basename)
    file_buys_sliced = os.path.join(path, file_buys_sliced_basename)

    # Load both inputs, slice them together, and write the sliced copies.
    clicks = read_clicks(file_clicks)
    buys = read_buys(file_buys)
    clicks, buys = slice_data(clicks, buys, frac=frac)

    write_df(file_clicks_sliced, clicks)
    write_df(file_buys_sliced, buys)
Ejemplo n.º 4
0
                               "\nusage: argv[0] /dir/with/data clicks_file buys_file frac"

    # Positional arguments: data directory, clicks file name, buys file name,
    # and the frac value (kept as text for the output names, parsed as float
    # for the checks below).
    path = sys.argv[1]
    file_clicks_basename = sys.argv[2]
    file_buys_basename = sys.argv[3]
    char_frac = sys.argv[4]
    float_frac = float(char_frac)

    # NOTE(review): assert statements are skipped under `python -O`, so this
    # check does not run in optimized mode.
    assert float_frac > 0, 'Frac must be a positive number'

    # frac <= 1 stays a float; larger values are truncated to an int.
    # frac == 1 exits without writing anything; values strictly between 1 and
    # 2 truncate to 1 and therefore also end here.
    # NOTE(review): exit() is the helper injected by the `site` module;
    # sys.exit() is the form intended for programs.
    frac = float_frac if float_frac <= 1 else int(float_frac)
    if frac == 1:
        exit(0)

    # Output names carry the frac exactly as typed on the command line,
    # inserted between the base name and its extension.
    file_clicks_parts = os.path.splitext(file_clicks_basename)
    file_buys_parts = os.path.splitext(file_buys_basename)
    file_clicks_sliced_basename = file_clicks_parts[
        0] + '-' + char_frac + file_clicks_parts[1]
    file_buys_sliced_basename = file_buys_parts[
        0] + '-' + char_frac + file_buys_parts[1]
    file_clicks = os.path.join(path, file_clicks_basename)
    file_buys = os.path.join(path, file_buys_basename)
    file_clicks_sliced = os.path.join(path, file_clicks_sliced_basename)
    file_buys_sliced = os.path.join(path, file_buys_sliced_basename)

    # Load both inputs, slice them together, and write the sliced copies.
    clicks = read_clicks(file_clicks)
    buys = read_buys(file_buys)
    clicks, buys = slice_data(clicks, buys, frac=frac)

    write_df(file_clicks_sliced, clicks)
    write_df(file_buys_sliced, buys)