dtypes = { 'ip': 'uint32', 'app': 'uint16', 'device': 'uint16', 'os': 'uint16', 'channel': 'uint16', 'is_attributed': 'uint8', 'click_id': 'uint32' } import feature_engineerer import gc train_df = pd.read_csv('../input/train.csv', nrows=1000 * 100, dtype=dtypes) feature_engineerer.do_feature_engineering(train_df) len_train = len(train_df) val_df = train_df[(len_train - 300000):len_train] train_df = train_df[:(len_train - 300000)] target = 'is_attributed' predictors = ['app', 'device', 'os', 'channel', 'hour', 'hourly_click_count'] categorical = ['app', 'device', 'os', 'channel', 'hour'] print("Training...") params = { 'learning_rate': 0.1, #'is_unbalance': 'true', # replaced with scale_pos_weight argument 'num_leaves': 1400, # we should let it be smaller than 2^(max_depth) 'max_depth': 3, # -1 means no limit 'min_child_samples':
"""Load the first-1000k training slice, run feature engineering, and inspect it.

Currently used as an inspection script: it prints summary statistics and the
number of distinct IPs, then stops. The engineered CSV is only written when
INSPECT_ONLY is flipped to False.
"""
import pandas as pd

import feature_engineerer

# True reproduces the current behavior (print diagnostics, then stop).
# The original code reached this point via a bare `exit(0)`, which made the
# to_csv call below unreachable dead code; the flag makes that intent explicit.
INSPECT_ONLY = True

train = pd.read_csv('../input/train_first_1000k.csv')
feature_engineerer.do_feature_engineering(train)

# Quick sanity checks on the engineered frame.
print(train.describe())
print(train["ip"].nunique())

if INSPECT_ONLY:
    # SystemExit is the script-safe equivalent of the site helper exit().
    raise SystemExit(0)

train.to_csv("../output/engineered_first_1000k.csv", float_format='%.6f', index=False)
"""Smoke-check feature engineering against a slice of the test set.

Loads the first 100k rows of test.csv with the project's column dtypes,
applies feature engineering in place, previews the result, and dumps it
to ../output/checking.csv for manual inspection.
"""
import pandas as pd
import csv_loader
import feature_engineerer

# Column dtypes come from the shared loader so every script agrees on them.
dtypes = csv_loader.get_dtypes()

test_slice = pd.read_csv('../input/test.csv', dtype=dtypes, nrows=100000)

# do_feature_engineering mutates the frame in place (no reassignment needed).
feature_engineerer.do_feature_engineering(test_slice)
print(test_slice.head())

test_slice.to_csv("../output/checking.csv", index=False)
"""K-fold cross-validation of the LightGBM model on one day of training data.

Loads 1M rows of train_day3.csv, engineers features, then trains one model
per KFold split, accumulating the validation AUC of each fold and keeping a
handle on the first fold's model.
"""
import pandas as pd
import numpy as np
import pocket_lgb
from sklearn import model_selection
import feature_engineerer
import holdout_validator
import csv_loader

dtypes = csv_loader.get_dtypes()
input_df = pd.read_csv('../input/train_day3.csv', nrows=1000000, dtype=dtypes)
feature_engineerer.do_feature_engineering(input_df)
print(input_df.describe())

split_number = 5
skf = model_selection.KFold(n_splits=split_number)
lgb = pocket_lgb.GoldenLgb()

first_model = None
total_score = 0
# NOTE(review): total_score is accumulated and first_model is captured but
# neither is used in this chunk — presumably averaged/printed further down
# the file; confirm before refactoring.
for fold_train_idx, fold_test_idx in skf.split(input_df):
    fold_train = input_df.iloc[fold_train_idx]
    fold_test = input_df.iloc[fold_test_idx]
    # Wrapping in pd.DataFrame mirrors the original code; it produces a fresh
    # frame per fold (possibly to shield input_df from in-place edits by
    # do_train — TODO confirm whether the copy is actually needed).
    train_df = pd.DataFrame(fold_train)
    test_df = pd.DataFrame(fold_test)

    model = lgb.do_train(train_df, test_df)
    score = model.best_score["valid_0"]["auc"]
    total_score += score
    if first_model is None:
        first_model = model
"""Verify that the engineered test frame exposes every predictor column.

Loads a 10k-row slice of test.csv, prints its stats before and after feature
engineering, then projects onto the predictor columns to confirm they all
exist and look sane.
"""
import pandas as pd
import feature_engineerer
import csv_loader

# Model feature lists (kept in sync with the training scripts).
predictors = ['app', 'device', 'os', 'channel', 'hour', 'hourly_click_count']
categorical = ['app', 'device', 'os', 'channel', 'hour']

dtypes = csv_loader.get_dtypes()
test = pd.read_csv('../input/test.csv', dtype=dtypes, nrows=10000)

# Before/after comparison: engineering mutates `test` in place.
print(test.describe())
feature_engineerer.do_feature_engineering(test)
print(test.describe())

# Selecting the predictor columns raises KeyError if any is missing,
# which is exactly the check this script exists for.
predictor_frame = test[predictors]
print(predictor_frame.describe())