def get_data(n=1000):
    """Load a training subset, preprocessed metadata and a test set.

    Args:
        n: Value forwarded to ``DataLoader.load_first_users`` (presumably
            the number of users to load -- confirm against DataLoader).

    Returns:
        A ``(train, questions, lectures, test)`` tuple, where ``questions``
        and ``lectures`` have been passed through ``preprocess_questions``
        and ``preprocess_lectures`` and ``test`` comes from ``tests_1.pkl``.
    """
    data_loader = DataLoader(INPUT_PATH)
    train, raw_questions, raw_lectures = data_loader.load_first_users(n)

    # Preprocess in the same order as before: questions first, then lectures.
    questions = preprocess_questions(raw_questions)
    lectures = preprocess_lectures(raw_lectures)

    test = data_loader.load_tests('tests_1.pkl')
    return train, questions, lectures, test
def score_params(params, n_users=30000):
    """Fit the LightGBM stage of a RiiidModel and report its score.

    Args:
        params: Parameters handed to ``RiiidModel``.
        n_users: Value forwarded to ``DataLoader.load_first_users``.

    Returns:
        A ``(best_score, best_iteration)`` tuple read from the model after
        ``fit_lgbm`` has run.
    """
    data_loader = DataLoader(CONTEXT.data_path())
    train, raw_questions, raw_lectures = data_loader.load_first_users(n_users)
    questions = preprocess_questions(raw_questions)
    lectures = preprocess_lectures(raw_lectures)

    # The test set is only needed for the merge, so it is never bound to a
    # name and can be released as soon as merge_test returns.
    train = merge_test(train, data_loader.load_tests('tests_0.pkl'))

    model = RiiidModel(questions, lectures, params)
    X, y, train_idx, valid_idx = model.fit_transform(train)
    model.fit_lgbm(X[train_idx], y[train_idx], X[valid_idx], y[valid_idx])

    return model.best_score, model.best_iteration
import time
import logging

from riiid.core.data import DataLoader
from riiid.saint.model import SaintModel
from riiid.utils import configure_console_logging
from riiid.config import INPUT_PATH, MODELS_PATH

configure_console_logging()

# Restore a previously saved SAINT model.
logging.info('Loading model')
MODEL_ID = 'saint_20210104_024103'
model: SaintModel = SaintModel.load(MODELS_PATH, MODEL_ID)

# Replay the example test batches through update() + predict().
tests = DataLoader(INPUT_PATH).load_tests_examples()
for i, batch in enumerate(tests):
    # The timer is (re)started while the model reports test_batch == 1.
    # NOTE(review): presumably this excludes the first batch as warm-up --
    # confirm how SaintModel maintains test_batch.
    if model.test_batch == 1:
        start = time.perf_counter()
    batch = model.update(batch)
    _, predictions = model.predict(batch)

# Report total elapsed time and the average per batch counted by the model.
total = time.perf_counter() - start
logging.info('Time spent: {:.1f}s ({:.3f}s by batch)'.format(total, total / model.test_batch))
import os

from riiid.config import INPUT_PATH
from riiid.core.data import DataLoader, save_pkl, load_pkl
from riiid.validation import generate_test

# Only the first element returned by DataLoader.load() is used here.
loader = DataLoader(INPUT_PATH)
train, _, _ = loader.load()

# One entry per test set to generate: (output file name, size, N).
# Both sets are generated from the same training data with seed=0, in order.
TEST_SETS = (
    ('tests_0.pkl', 2_500_000, 10_000),
    ('tests_1.pkl', 5_000_000, 20_000),
)

for file_name, size, n in TEST_SETS:
    test = generate_test(train, size=size, N=n, seed=0)
    save_pkl(test, os.path.join(INPUT_PATH, file_name))
import os

from riiid.core.data import DataLoader, save_pkl
from riiid.saint.model import SaintModel
from riiid.utils import configure_console_logging
from riiid.config import INPUT_PATH, MODELS_PATH

configure_console_logging()

# --- Data ---------------------------------------------------------------
loader = DataLoader(INPUT_PATH)
train, questions, lectures = loader.load_first_users(30000)

# --- Model-side transformation of the raw training data -------------------
model = SaintModel(questions, lectures)
transformed = model.fit_transform(train)

# --- Train / validation split, features, then datasets (order preserved) --
train_part, test_part = model.split_train_test(transformed)
train_features = model.create_features(train_part)
test_features = model.create_features(test_part)
X_train, y_train = model.create_dataset(train_features)
X_test, y_test = model.create_dataset(test_features)

# Persist the datasets next to the models before fitting.
data_path = os.path.join(MODELS_PATH, model.get_name('data.pkl'))
save_pkl((X_train, y_train, X_test, y_test), data_path)

# --- Fit, evaluate on the held-out part, and save --------------------------
model.fit(X_train, y_train, X_test, y_test)
model.score(X_test, y_test)
model.save(MODELS_PATH)
import os

from riiid.core.data import DataLoader, preprocess_questions, preprocess_lectures, save_pkl
from riiid.core.model import RiiidModel
from riiid.validation import merge_test
from riiid.utils import configure_console_logging
from riiid.config import INPUT_PATH, MODELS_PATH, PARAMS

configure_console_logging()

# --- Data: load a user subset and preprocess the metadata tables ----------
loader = DataLoader(INPUT_PATH)
train, raw_questions, raw_lectures = loader.load_first_users(30000)
questions = preprocess_questions(raw_questions)
lectures = preprocess_lectures(raw_lectures)

# --- Validation: merge the pre-generated test set into the training data --
test = loader.load_tests('tests_0.pkl')
train = merge_test(train, test)

# --- Features: fit_transform returns the data plus two selectors that are
# used below to index X and y into training and validation parts.
model = RiiidModel(questions, lectures, params=PARAMS)
X, y, train_idx, valid_idx = model.fit_transform(train)

# Persist the feature data before fitting.
data_path = os.path.join(MODELS_PATH, model.get_name('data.pkl'))
save_pkl((X, y, train_idx, valid_idx), path=data_path)

# --- Models: three base learners, then the blender on the validation part -
model.fit_lgbm(X[train_idx], y[train_idx], X[valid_idx], y[valid_idx])
model.fit_catboost(X[train_idx], y[train_idx], X[valid_idx], y[valid_idx])
model.fit_neural(X[train_idx], y[train_idx], X[valid_idx], y[valid_idx])
model.fit_blender(X[valid_idx], y[valid_idx])
from doppel.aws.s3 import S3Bucket from riiid.core.data import DataLoader, preprocess_questions, preprocess_lectures from riiid.core.model import RiiidModel from riiid.validation import merge_test from riiid.config import PARAMS from riiid import cache from riiid.aws.cache import S3CacheManager from riiid.aws.config import CONTEXT CONTEXT.get_logger() try: cache.CACHE_MANAGER = S3CacheManager('kaggle-riiid-cache') loader = DataLoader(CONTEXT.data_path()) train, questions, lectures = loader.load() questions = preprocess_questions(questions) lectures = preprocess_lectures(lectures) test = loader.load_tests('tests_0.pkl') train = merge_test(train, test) del test PARAMS['question_embedding']['workers'] = 32 PARAMS['answers_embedding']['workers'] = 32 model = RiiidModel(questions, lectures, params=PARAMS) X, y, train, valid = model.fit_transform(train) bucket = S3Bucket(model.get_normalized_name())
import logging from doppel import terminate from doppel.aws.s3 import S3Bucket from riiid.core.data import DataLoader from riiid.saint.model import SaintModel from riiid.aws.config import CONTEXT CONTEXT.get_logger() try: loader = DataLoader(CONTEXT.data_path()) train, questions, lectures = loader.load() model = SaintModel(questions, lectures) train = model.fit_transform(train) train, test = model.split_train_test(train) train = model.create_features(train) test = model.create_features(test) X_train, y_train = model.create_dataset(train) X_test, y_test = model.create_dataset(test) bucket = S3Bucket(model.get_normalized_name()) logging.info('Saving model') bucket.save_pickle(model, model.get_name(ext='pkl')) logging.info('Saving data') bucket.save_pickle_multiparts((X_train, y_train, X_test, y_test), model.get_name('data.pkl'))