    def __init__(self, model, filename=None, num_rows=None):
        self.logger = pocket_logger.get_my_logger()
        self.model = model

        # Load the featured holdout set with the columns and dtypes the model expects.
        use_col = feature_engineerer.get_necessary_col()
        dtypes = csv_loader.get_featured_dtypes()
        if filename is None:
            filename = HOLDOUT_DATA  # module-level default holdout path
        if num_rows is None:
            self.holdout_df = dd.read_csv(filename, dtype=dtypes, usecols=use_col).compute()
        else:
            # dask's read_csv does not accept nrows, so take the first rows after the lazy read instead.
            self.holdout_df = dd.read_csv(filename, dtype=dtypes, usecols=use_col).head(num_rows, npartitions=-1)

        print(self.holdout_df.info())
        print("Initialized validator.")
Example #2
TRAIN_DATA = os.path.join(OUTPUT_DIR, "full_train_day3_featured.csv")
TEST_DATA = os.path.join(OUTPUT_DIR, "full_test_featured.csv")
OUTPUT_FILE = os.path.join(OUTPUT_DIR, "submission_full.csv")

import pandas as pd
import numpy as np
from sklearn import model_selection
import gc
import time
from talkingdata.common import csv_loader, feature_engineerer, pocket_lgb, pocket_timer

timer = pocket_timer.GoldenTimer()
dtypes = csv_loader.get_featured_dtypes()
train = pd.read_csv(TRAIN_DATA, dtype=dtypes)

train = train[feature_engineerer.get_necessary_col()]
print(train.info())

train_y = train["is_attributed"]
train_x = train.drop("is_attributed", axis=1)

X_train, X_valid, y_train, y_valid = model_selection.train_test_split(train_x, train_y, test_size=0.2, random_state=99)
timer.time("prepare train in ")

lgb = pocket_lgb.GoldenLgb()
model = lgb.do_train_sk(X_train, X_valid, y_train, y_valid)
lgb.show_feature_importance(model)
timer.time("end train in ")
del train, X_train, X_valid, y_train, y_valid
gc.collect()
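The excerpt stops after training, but the unused TEST_DATA and OUTPUT_FILE constants point at a prediction step. A hedged sketch of what that step usually looks like for this competition (click_id and is_attributed are the TalkingData submission columns; the repository's exact code is not shown):

# Illustrative continuation, not the repository's verbatim code.
test = pd.read_csv(TEST_DATA, dtype=dtypes)
click_ids = test["click_id"]
test_x = test.drop("click_id", axis=1)

# predict() for a Booster, predict_proba()[:, 1] for an sklearn-style wrapper.
pred = model.predict(test_x)

submission = pd.DataFrame({"click_id": click_ids, "is_attributed": pred})
submission.to_csv(OUTPUT_FILE, index=False)
timer.time("end prediction in ")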
OUTPUT_DIR = os.path.join(APP_ROOT, "output")
TRAIN_DATA = os.path.join(OUTPUT_DIR, "short_train_day9.csv")
TEST_DATA = os.path.join(OUTPUT_DIR, "short_merged_test_vanilla.csv")
OUTPUT_FILE = os.path.join(OUTPUT_DIR, "submission_merged.csv")

import pandas as pd
import numpy as np
from sklearn import model_selection
import gc
from dask import dataframe as dd
from talkingdata.common import csv_loader, feature_engineerer, pocket_lgb, pocket_timer, pocket_logger

logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)

use_col = feature_engineerer.get_necessary_col()
dtypes = csv_loader.get_featured_dtypes()
train = dd.read_csv(TRAIN_DATA, dtype=dtypes, usecols=use_col).compute()
print(train.info())

train_y = train["is_attributed"]
train_x = train.drop("is_attributed", axis=1)
X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
    train_x, train_y, test_size=0.2, random_state=99)
timer.time("prepare train in ")

lgb = pocket_lgb.GoldenLgb()
model = lgb.do_train_sk(X_train, X_valid, y_train, y_valid)
lgb.show_feature_importance(model)
timer.time("end train in ")
del train, X_train, X_valid, y_train, y_valid
OUTPUT_DIR = os.path.join(APP_ROOT, "output")
TRAIN_DATA = os.path.join(OUTPUT_DIR, "full_train_day3_featured.csv")

import pandas as pd
import numpy as np
import gc
from sklearn import model_selection
from talkingdata.common import csv_loader, feature_engineerer, holdout_validator, pocket_lgb, pocket_timer

timer = pocket_timer.GoldenTimer()
dtypes = csv_loader.get_featured_dtypes()
#num_row = 1000 * 100
#train = pd.read_csv(TRAIN_DATA, dtype=dtypes, nrows=num_row)
input_df = pd.read_csv(TRAIN_DATA, dtype=dtypes)
print(input_df.info())
input_df = input_df[feature_engineerer.get_necessary_col()]

split_number = 5
skf = model_selection.KFold(n_splits=split_number)
lgb = pocket_lgb.GoldenLgb()
first_model = None
total_score = 0
for train_index, test_index in skf.split(input_df):
    # iloc already returns DataFrames here, so no extra pd.DataFrame conversion is needed.
    train_df = input_df.iloc[train_index]
    test_df = input_df.iloc[test_index]

    model = lgb.do_train(train_df, test_df)
    score = model.best_score["valid_0"]["auc"]
    total_score += score
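    # The excerpt cuts off here; first_model is initialised above but never assigned in what is
    # shown. Keeping the first fold's model, e.g. for the imported holdout_validator, is an
    # assumption, not the repository's verbatim code.
    if first_model is None:
        first_model = model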