def __init__(self, model, filename=None, num_rows=None):
    """Load the hold-out data set and keep the model to be validated.

    Args:
        model: Model object; stored unchanged as ``self.model``.
        filename: Path of the CSV file to read. ``None`` selects the
            module-level ``HOLDOUT_DATA`` path.
        num_rows: Maximum number of leading rows to load. ``None`` loads
            the whole file.
    """
    self.logger = pocket_logger.get_my_logger()
    self.model = model

    use_col = feature_engineerer.get_necessary_col()
    dtypes = csv_loader.get_featured_dtypes()
    if filename is None:
        filename = HOLDOUT_DATA

    # Build the lazy dask frame once; both branches below only differ in
    # how much of it is materialised into pandas.
    lazy_df = dd.read_csv(filename, dtype=dtypes, usecols=use_col)
    if num_rows is None:
        self.holdout_df = lazy_df.compute()
    else:
        # dd.read_csv raises on the pandas-only ``nrows`` keyword, so the
        # row limit is applied with head(). npartitions=-1 lets head()
        # draw from every partition instead of stopping at the first one
        # when that partition holds fewer than num_rows rows.
        self.holdout_df = lazy_df.head(n=num_rows, npartitions=-1)

    print(self.holdout_df.info())
    print("Initialized validator.")
import gc
import time

import numpy as np
import pandas as pd
from sklearn import model_selection

from talkingdata.common import csv_loader, feature_engineerer, pocket_lgb, pocket_timer

# Input and output files, all located under OUTPUT_DIR.
TRAIN_DATA = os.path.join(OUTPUT_DIR, "full_train_day3_featured.csv")
TEST_DATA = os.path.join(OUTPUT_DIR, "full_test_featured.csv")
OUTPUT_FILE = os.path.join(OUTPUT_DIR, "submission_full.csv")

timer = pocket_timer.GoldenTimer()

dtypes = csv_loader.get_featured_dtypes()
use_col = feature_engineerer.get_necessary_col()
# Parse only the columns that are kept anyway instead of loading the whole
# file and discarding the rest afterwards. read_csv ignores the order of
# ``usecols``, so the frame is indexed with the same list again to keep the
# column order the original selection produced.
train = pd.read_csv(TRAIN_DATA, dtype=dtypes, usecols=use_col)
train = train[use_col]
print(train.info())

# Separate the target column from the features.
train_y = train["is_attributed"]
train_x = train.drop("is_attributed", axis=1)

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
    train_x, train_y, test_size=0.2, random_state=99)
timer.time("prepare train in ")

lgb = pocket_lgb.GoldenLgb()
model = lgb.do_train_sk(X_train, X_valid, y_train, y_valid)
lgb.show_feature_importance(model)
timer.time("end train in ")

# Drop the references to the training frames and trigger a collection so
# their memory can be reclaimed before the script continues.
del train, X_train, X_valid, y_train, y_valid
gc.collect()
import gc

import numpy as np
import pandas as pd
from dask import dataframe as dd
from sklearn import model_selection

from talkingdata.common import (
    csv_loader,
    feature_engineerer,
    pocket_lgb,
    pocket_logger,
    pocket_timer,
)

# Locations of the reduced ("short") data set and of the submission file.
OUTPUT_DIR = os.path.join(APP_ROOT, "output")
TRAIN_DATA = os.path.join(OUTPUT_DIR, "short_train_day9.csv")
TEST_DATA = os.path.join(OUTPUT_DIR, "short_merged_test_vanilla.csv")
OUTPUT_FILE = os.path.join(OUTPUT_DIR, "submission_merged.csv")

logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)

# Read just the required columns through dask, then materialise the result
# as a pandas frame.
use_col = feature_engineerer.get_necessary_col()
dtypes = csv_loader.get_featured_dtypes()
full_frame = dd.read_csv(TRAIN_DATA, dtype=dtypes, usecols=use_col).compute()
print(full_frame.info())

# Target column on one side, every remaining column on the other.
train_y = full_frame["is_attributed"]
train_x = full_frame.drop(labels="is_attributed", axis=1)

# 80/20 train/validation split with a fixed seed.
fit_x, eval_x, fit_y, eval_y = model_selection.train_test_split(
    train_x,
    train_y,
    test_size=0.2,
    random_state=99,
)
timer.time("prepare train in ")

lgb = pocket_lgb.GoldenLgb()
model = lgb.do_train_sk(fit_x, eval_x, fit_y, eval_y)
lgb.show_feature_importance(model)
timer.time("end train in ")

# The loaded frame and the split pieces are not needed past this point.
del full_frame, fit_x, eval_x, fit_y, eval_y
import gc

import numpy as np
import pandas as pd
from sklearn import model_selection

from talkingdata.common import (
    csv_loader,
    feature_engineerer,
    holdout_validator,
    pocket_lgb,
    pocket_timer,
)

OUTPUT_DIR = os.path.join(APP_ROOT, "output")
TRAIN_DATA = os.path.join(OUTPUT_DIR, "full_train_day3_featured.csv")

timer = pocket_timer.GoldenTimer()

dtypes = csv_loader.get_featured_dtypes()
# NOTE: for a quick run on a sample, pass e.g. ``nrows=1000 * 100`` to
# read_csv below.
input_df = pd.read_csv(TRAIN_DATA, dtype=dtypes)
print(input_df.info())
# Keep only the columns the feature engineering step declares necessary.
input_df = input_df[feature_engineerer.get_necessary_col()]

# Plain K-fold over the rows.
split_number = 5
skf = model_selection.KFold(n_splits=split_number)

lgb = pocket_lgb.GoldenLgb()
first_model = None
total_score = 0
for train_index, test_index in skf.split(input_df):
    # Positional row selection for this fold's training and validation part.
    train_df = pd.DataFrame(input_df.iloc[train_index])
    test_df = pd.DataFrame(input_df.iloc[test_index])

    model = lgb.do_train(train_df, test_df)
    # Accumulate the AUC reported for the first validation set.
    score = model.best_score["valid_0"]["auc"]
    total_score = total_score + score