# Script: load the training data from INPUT_PATH, build two test sets with
# generate_test, and pickle each one next to the input data.
import os

from riiid.config import INPUT_PATH
from riiid.core.data import DataLoader, save_pkl, load_pkl
from riiid.validation import generate_test

# One entry per test set to produce: (output file name, size, N).
# Both sets are generated with seed=0.
_TEST_SPECS = (
    ('tests_0.pkl', 2_500_000, 10_000),
    ('tests_1.pkl', 5_000_000, 20_000),
)

loader = DataLoader(INPUT_PATH)
# Only the first element returned by load() (the training data) is used here.
train, _, _ = loader.load()

for _file_name, _size, _n in _TEST_SPECS:
    test = generate_test(train, size=_size, N=_n, seed=0)
    save_pkl(test, os.path.join(INPUT_PATH, _file_name))
from riiid.core.data import DataLoader, preprocess_questions, preprocess_lectures from riiid.core.model import RiiidModel from riiid.validation import merge_test from riiid.config import PARAMS from riiid import cache from riiid.aws.cache import S3CacheManager from riiid.aws.config import CONTEXT CONTEXT.get_logger() try: cache.CACHE_MANAGER = S3CacheManager('kaggle-riiid-cache') loader = DataLoader(CONTEXT.data_path()) train, questions, lectures = loader.load() questions = preprocess_questions(questions) lectures = preprocess_lectures(lectures) test = loader.load_tests('tests_0.pkl') train = merge_test(train, test) del test PARAMS['question_embedding']['workers'] = 32 PARAMS['answers_embedding']['workers'] = 32 model = RiiidModel(questions, lectures, params=PARAMS) X, y, train, valid = model.fit_transform(train) bucket = S3Bucket(model.get_normalized_name()) logging.info('Saving data')