def score_params(params, n_users=30000):
    """Fit the LightGBM model with ``params`` and report how well it did.

    The data of the first ``n_users`` users is loaded, the questions and
    lectures tables are preprocessed, and the ``tests_0.pkl`` test set is
    merged into the training frame before the features are computed.

    Args:
        params: Parameters forwarded to ``RiiidModel``.
        n_users: Number of users passed to ``load_first_users``.

    Returns:
        Tuple ``(best_score, best_iteration)`` read from the fitted model.
    """
    loader = DataLoader(CONTEXT.data_path())
    frame, questions, lectures = loader.load_first_users(n_users)
    questions = preprocess_questions(questions)
    lectures = preprocess_lectures(lectures)

    # The loaded test set is only needed for the merge, so it is never bound
    # to a name and can be reclaimed as soon as merge_test returns.
    frame = merge_test(frame, loader.load_tests('tests_0.pkl'))

    model = RiiidModel(questions, lectures, params)
    X, y, train_idx, valid_idx = model.fit_transform(frame)
    # Drop our reference to the merged frame before fitting the booster.
    del frame

    model.fit_lgbm(X[train_idx], y[train_idx], X[valid_idx], y[valid_idx])
    return model.best_score, model.best_iteration
def predict(self, X):
    """Predict ``answered_correctly`` for one test batch.

    The batch goes through the lectures pipeline, lecture rows are removed,
    and, when rows remain, the feature pipeline and the context updates
    run before the model is queried.

    Args:
        X: Test batch; must contain a ``row_id`` column when any row is left
            after lecture removal.

    Returns:
        Tuple ``(X, predictions)``. ``X`` is the transformed batch (or the
        empty lecture-free batch), ``predictions`` is a DataFrame with the
        columns ``row_id`` and ``answered_correctly``.
    """
    # Logging period in batches; with a period of 1 every batch is logged.
    log_every = 1
    if self.test_batch % log_every == 0:
        logging.info('Running test batch {}'.format(self.test_batch))

    X = RiiidModel.remove_lectures(self.lectures_pipeline.transform(X))

    if len(X) == 0:
        # Nothing left to score: hand back an empty, correctly shaped frame.
        predictions = pd.DataFrame(columns=['row_id', 'answered_correctly'])
    else:
        # Keep the row ids before the feature pipeline transforms the frame.
        predictions = X[['row_id']].copy()
        X = self.pipeline.transform(X)

        # NOTE: the order of the next four calls is kept exactly as written;
        # the prediction inputs are built between the two prior-related
        # context steps and before the independent features are pushed.
        self._update_context_with_priors(X)
        inputs = self._create_prediction_data(X)
        self._roll_context_on_priors(X)
        self._update_context(X, self.independent_features)

        # Keep the last entry along axes 1 and 2 of the model output.
        raw_output = self.model.predict(inputs)
        predictions['answered_correctly'] = raw_output[:, -1, -1]

    self.test_batch += 1
    return X, predictions
def fit_transform(self, X):
    """Fit both pipelines on ``X`` and return the transformed frame.

    The lectures pipeline is fitted first, lecture rows are removed, then
    the feature pipeline (score encoding of ``content_id`` followed by the
    questions transformer) is fitted. The context is created from the result.

    Args:
        X: Training frame.

    Returns:
        The frame produced by the feature pipeline.
    """
    logging.info('- Fit')
    self._init_fit(X)

    # Stage 1: lecture features, then drop the lecture rows themselves.
    lectures_step = LecturesTransformer(self.lectures)
    self.lectures_pipeline = make_pipeline(lectures_step)
    X = RiiidModel.remove_lectures(self.lectures_pipeline.fit_transform(X))

    # Stage 2: question features; the CV is built on the lecture-free frame.
    cv = self._build_cv(X)
    content_encoder = ScoreEncoder(
        'content_id',
        cv=cv,
        smoothing_min=5,
        smoothing_value=1,
        noise=0.005,
    )
    questions_step = QuestionsTransformer(
        self.questions,
        time_bins=self.time_bins,
        lag_bins=self.lag_bins,
    )
    self.pipeline = make_pipeline(content_encoder, questions_step)
    X = self.pipeline.fit_transform(X)

    self._create_context(X)
    return X
def update(self, test):
    """Feed the answers of the previous batch back into the model.

    The first row of ``test`` carries, as text, the responses and the
    correctness flags of the previous batch. They are parsed, attached to
    the stored previous batch, and used to update the feature pipeline and
    the context. The current batch is then stored for the next call.

    Args:
        test: Current test batch; must contain the columns
            ``prior_group_responses`` and ``prior_group_answers_correct``.

    Returns:
        ``test`` without the two ``prior_group_*`` columns.

    Raises:
        ValueError, SyntaxError: If a ``prior_group_*`` cell is not a plain
            Python literal.
    """
    # Function-scope import so this method does not rely on the module's
    # import block.
    import ast

    # SECURITY: these cells come from the incoming test frame. They were
    # previously passed to eval(), which executes arbitrary expressions;
    # literal_eval only accepts Python literals (e.g. "[0, 1, 3]").
    prior_user_answer = ast.literal_eval(test['prior_group_responses'].values[0])
    prior_answered_correctly = ast.literal_eval(test['prior_group_answers_correct'].values[0])
    test = test.drop(columns=['prior_group_answers_correct', 'prior_group_responses'])

    if self.previous_test is not None:
        self.previous_test['user_answer'] = prior_user_answer
        self.previous_test['answered_correctly'] = prior_answered_correctly

        X = self.previous_test
        # The lectures pipeline is deliberately not updated here (the
        # original author marked that step as not required).
        X = RiiidModel.remove_lectures(X)

        if len(X) > 0:
            y = X['answered_correctly']
            X = update_pipeline(self.pipeline, X, y)
            self._update_context(X, self.dependent_features)

    # Keep an independent copy so later changes to the returned frame do not
    # leak into the stored batch.
    self.previous_test = test.copy()
    return test
def generate_reference_and_validation_datasets(n=1000, validation_ratio=0.5):
    """Build the reference features and the replayed validation features.

    A first model is fitted on the fully merged data to produce the reference
    features. A second model is fitted on data merged with
    ``validation_ratio``, saved and reloaded from disk, and then fed the
    validation batches one by one through ``update`` / ``predict``.

    Args:
        n: Size argument forwarded to ``get_data``.
        validation_ratio: Ratio forwarded to ``merge_test`` for the second
            model.

    Returns:
        Tuple ``(train_reference, X_reference, validation, X_validation)``
        where the last two are the concatenated validation batches and the
        concatenated non-empty feature frames returned by ``predict``.
    """
    reference_path = os.path.join(TEST_PATH, 'model_ref.zip')
    compare_path = os.path.join(TEST_PATH, 'model_test.zip')

    # Reference data: everything merged, features computed in one go.
    train, questions, lectures, test = get_data(n)
    train_reference = merge_test(train, test)
    reference_model = RiiidModel(questions, lectures, PARAMS)
    X_reference, *_ = reference_model.fit_transform(train_reference)
    reference_model.save(reference_path)

    # Compare data: part of the test set is held back as validation batches.
    train, questions, lectures, test = get_data(n)
    train_compare, validation = merge_test(train, test, validation_ratio=validation_ratio)
    compare_model = RiiidModel(questions, lectures, PARAMS)
    X_compare, y, train_idx, valid_idx = compare_model.fit_transform(train_compare)
    compare_model.fit_lgbm(X_compare[train_idx], y[train_idx], X_compare[valid_idx], y[valid_idx])

    # Round-trip through disk so the replay runs on a freshly loaded model.
    compare_model.save(compare_path)
    loaded_model: RiiidModel = RiiidModel.load(compare_path)

    # Replay the validation batches, keeping only non-empty feature frames.
    replayed = []
    for batch in validation:
        features, _ = loaded_model.predict(loaded_model.update(batch))
        if len(features) > 0:
            replayed.append(features)

    validation = pd.concat(validation)
    X_validation = pd.concat(replayed)

    return train_reference, X_reference, validation, X_validation
import logging
import os
import sys

import riiideducation

# Create the competition environment and its test-batch iterator.
env = riiideducation.make_env()
iter_test = env.iter_test()

# The riiid package is shipped inside this directory, so it must be put on
# sys.path before anything from riiid can be imported.
PATH = '/kaggle/input/riiid-submission'
sys.path.append(PATH)

from riiid.utils import configure_console_logging, check_versions
from riiid.core.model import RiiidModel

configure_console_logging()
check_versions()

logging.info('Load model')
model = RiiidModel.load(os.path.join(PATH, 'model'))

# For every batch: feed back the previous answers, predict, and submit.
# The second element yielded by the iterator is not used here.
for batch, _ in iter_test:
    _, predictions = model.predict(model.update(batch))
    env.predict(predictions)
from riiid.validation import merge_test
from riiid.utils import configure_console_logging
from riiid.config import INPUT_PATH, MODELS_PATH, PARAMS

configure_console_logging()

# --- Data: first 30000 users, with preprocessed questions and lectures ---
loader = DataLoader(INPUT_PATH)
train, questions, lectures = loader.load_first_users(30000)
questions = preprocess_questions(questions)
lectures = preprocess_lectures(lectures)

# --- Validation: merge the first test set into the training frame ---
test = loader.load_tests('tests_0.pkl')
train = merge_test(train, test)

# --- Features: computed once and persisted next to the models ---
model = RiiidModel(questions, lectures, params=PARAMS)
X, y, train, valid = model.fit_transform(train)
data_path = os.path.join(MODELS_PATH, model.get_name('data.pkl'))
save_pkl((X, y, train, valid), path=data_path)

# --- Models: the three base learners share one call signature; the blender
# is fitted afterwards on the validation split only ---
for fit_base_model in (model.fit_lgbm, model.fit_catboost, model.fit_neural):
    fit_base_model(X[train], y[train], X[valid], y[valid])
model.fit_blender(X[valid], y[valid])

# --- Persist the fitted model ---
model_path = os.path.join(MODELS_PATH, model.get_name())
model.save(model_path)
try: cache.CACHE_MANAGER = S3CacheManager('kaggle-riiid-cache') loader = DataLoader(CONTEXT.data_path()) train, questions, lectures = loader.load() questions = preprocess_questions(questions) lectures = preprocess_lectures(lectures) test = loader.load_tests('tests_0.pkl') train = merge_test(train, test) del test PARAMS['question_embedding']['workers'] = 32 PARAMS['answers_embedding']['workers'] = 32 model = RiiidModel(questions, lectures, params=PARAMS) X, y, train, valid = model.fit_transform(train) bucket = S3Bucket(model.get_normalized_name()) logging.info('Saving data') for data, name in [(X, 'X'), (y, 'y'), (train, 'train'), (valid, 'valid')]: bucket.save_pickle_multiparts(data, name + '.pkl') model.fit_lgbm(X[train], y[train], X[valid], y[valid]) model.fit_catboost(X[train], y[train], X[valid], y[valid]) logging.info('Saving model') bucket.save_multiparts(model.save_with_source(), model.get_name()) except Exception as e:
import os
import time
import logging

from riiid.core.data import DataLoader
from riiid.utils import configure_console_logging
from riiid.config import MODELS_PATH, INPUT_PATH
from riiid.core.model import RiiidModel


configure_console_logging()

logging.info('Loading model')
MODEL_NAME = 'model_20210123_210542.zip'
model: RiiidModel = RiiidModel.load(os.path.join(MODELS_PATH, MODEL_NAME))

tests = DataLoader(INPUT_PATH).load_tests_examples()

# The timer is started on the iteration where the model's batch counter
# equals 1 (presumably so that an initial warm-up batch is excluded -
# confirm against RiiidModel). It stays None if that never happens, e.g.
# when there are no example batches at all.
start = None
for test in tests:
    if model.test_batch == 1:
        start = time.perf_counter()

    test = model.update(test)
    _, predictions = model.predict(test)

end = time.perf_counter()

if start is None:
    # Previously this case crashed with a NameError on `start`.
    logging.warning('No batch was timed: the batch counter never reached 1')
else:
    total = end - start
    logging.info('Time spent: {:.1f}s ({:.3f}s by batch)'.format(total, total / model.test_batch))