def __init__(self, model_dir: str, verbose: bool):
    """Load all inference artifacts from ``model_dir``.

    Reads question/lecture metadata CSVs, transformer hyper-parameters,
    SAKT model weights, and the two pickled feature-factory managers.
    Each step is wrapped in ``trace(...)`` — presumably a timing/profiling
    context manager (TODO confirm against its definition).

    :param model_dir: directory holding transformer_param.json,
        transformers.pth and the two feature_factory_manager*.pickle files.
    :param verbose: when True use the real logger; otherwise a no-op
        EmptyLogger silences the feature-factory managers.
    """
    if verbose:
        logger = get_logger()
    else:
        logger = EmptyLogger()
    with trace("load csv"):
        self.df_question = pd.read_csv(
            "../input/riiid-test-answer-prediction/questions.csv",
            dtype={
                "bundle_id": "int32",
                "question_id": "int32",
                "correct_answer": "int8",
                "part": "int8"
            })
        self.df_lecture = pd.read_csv(
            "../input/riiid-test-answer-prediction/lectures.csv",
            dtype={
                "lecture_id": "int32",
                "tag": "int16",
                "part": "int8"
            })
    # params
    with trace("load model1"):
        with open(f"{model_dir}/transformer_param.json", "r") as f:
            self.params = json.load(f)
    # model loading
    model_path = f"{model_dir}/transformers.pth"
    # 13938: content-id vocabulary size; cont_emb=8: continuous-feature
    # embedding width (model-specific constants — confirm against training).
    self.model = SAKTModel(13938,
                           embed_dim=self.params["embed_dim"],
                           max_seq=self.params["max_seq"],
                           cont_emb=8)
    with trace("model.load_state_dict"):
        self.model.load_state_dict(torch.load(model_path))
    with trace("model.to(cuda)"):
        self.model.to(device)
    with trace("load model4"):
        self.model.eval()
    with trace("load seq"):
        # load feature_factory_manager
        ff_manager_path_for_transformer = f"{model_dir}/feature_factory_manager_for_transformer.pickle"
        with open(ff_manager_path_for_transformer, "rb") as f:
            self.feature_factory_manager_for_transformer = pickle.load(f)
        self.feature_factory_manager_for_transformer.logger = logger
    with trace("load manager"):
        ff_manager_path = f"{model_dir}/feature_factory_manager.pickle"
        with open(ff_manager_path, "rb") as f:
            self.feature_factory_manager = pickle.load(f)
        self.feature_factory_manager.logger = logger
    with trace("gc collect"):
        # Free the transient memory used while unpickling the managers.
        gc.collect()
def __init__(self, model_dir: str, verbose: bool = False):
    """Load metadata, hyper-parameters, model weights and the pickled
    feature-factory managers from ``model_dir``.

    Also precomputes ``self.part_dict`` mapping
    ``(content_id, content_type_id) -> part`` for fast lookups at
    predict time.

    :param model_dir: directory holding transformer_param.json,
        transformers.pth and the two feature_factory_manager*.pickle files.
    :param verbose: when True use the real logger; otherwise a no-op
        EmptyLogger silences the feature-factory managers.
    """
    if verbose:
        self.logger = get_logger()
    else:
        self.logger = EmptyLogger()
    self.df_question = pd.read_csv(
        "../input/riiid-test-answer-prediction/questions.csv",
        dtype={
            "bundle_id": "int32",
            "question_id": "int32",
            "correct_answer": "int8",
            "part": "int8"
        })
    self.df_lecture = pd.read_csv(
        "../input/riiid-test-answer-prediction/lectures.csv",
        dtype={
            "lecture_id": "int32",
            "tag": "int16",
            "part": "int8"
        })
    self.part_dict = {}  # key: (content_id, content_type_id), value: part
    # content_type_id 0 = question, 1 = lecture (per the dataset layout).
    for x in self.df_question[["question_id", "part"]].values:
        question_id = x[0]
        part = x[1]
        self.part_dict[(question_id, 0)] = part
    for x in self.df_lecture[["lecture_id", "part"]].values:
        lecture_id = x[0]
        part = x[1]
        self.part_dict[(lecture_id, 1)] = part
    # params
    with open(f"{model_dir}/transformer_param.json", "r") as f:
        self.params = json.load(f)
    # model loading
    model_path = f"{model_dir}/transformers.pth"
    # 13938: content-id vocabulary size; cont_emb=8: continuous-feature
    # embedding width (model-specific constants — confirm against training).
    self.model = SAKTModel(13938,
                           embed_dim=self.params["embed_dim"],
                           max_seq=self.params["max_seq"],
                           cont_emb=8)
    self.model.load_state_dict(torch.load(model_path))
    self.model.to(device)
    self.model.eval()
    # load feature_factory_manager
    ff_manager_path_for_transformer = f"{model_dir}/feature_factory_manager_for_transformer.pickle"
    with open(ff_manager_path_for_transformer, "rb") as f:
        self.feature_factory_manager_for_transformer = pickle.load(f)
    self.feature_factory_manager_for_transformer.logger = self.logger
    ff_manager_path = f"{model_dir}/feature_factory_manager.pickle"
    with open(ff_manager_path, "rb") as f:
        self.feature_factory_manager = pickle.load(f)
    self.feature_factory_manager.logger = self.logger
class KurupicalModel:
    """Self-contained SAKT inference wrapper (merge-based variant).

    ``__init__`` loads metadata/model/feature managers, ``update``
    back-fills the previous batch's labels into the streaming feature
    state, and ``predict`` scores one test batch.
    """

    def __init__(self, model_dir: str):
        """Load metadata, hyper-parameters, weights and feature managers.

        :param model_dir: directory holding transformer_param.json,
            transformers.pth and the two *.pickle managers.
        """
        # Logging is always disabled in this variant.
        logger = EmptyLogger()
        self.df_question = pd.read_csv(
            "../input/riiid-test-answer-prediction/questions.csv",
            dtype={
                "bundle_id": "int32",
                "question_id": "int32",
                "correct_answer": "int8",
                "part": "int8"
            })
        self.df_lecture = pd.read_csv(
            "../input/riiid-test-answer-prediction/lectures.csv",
            dtype={
                "lecture_id": "int32",
                "tag": "int16",
                "part": "int8"
            })
        # params
        with open(f"{model_dir}/transformer_param.json", "r") as f:
            self.params = json.load(f)
        # model loading
        model_path = f"{model_dir}/transformers.pth"
        # NOTE: cont_emb=16 here, unlike the cont_emb=8 variants elsewhere
        # in this file — presumably a differently-trained checkpoint.
        self.model = SAKTModel(13938,
                               embed_dim=self.params["embed_dim"],
                               max_seq=self.params["max_seq"],
                               cont_emb=16)
        self.model.load_state_dict(torch.load(model_path))
        self.model.to(device)
        self.model.eval()
        # load feature_factory_manager
        ff_manager_path_for_transformer = f"{model_dir}/feature_factory_manager_for_transformer.pickle"
        with open(ff_manager_path_for_transformer, "rb") as f:
            self.feature_factory_manager_for_transformer = pickle.load(f)
        self.feature_factory_manager_for_transformer.logger = logger
        ff_manager_path = f"{model_dir}/feature_factory_manager.pickle"
        with open(ff_manager_path, "rb") as f:
            self.feature_factory_manager = pickle.load(f)
        self.feature_factory_manager.logger = logger

    def update(self, df_test_prev):
        """Update the feature dictionaries.

        Pass the previous ``df_test`` batch as-is; its first row carries
        the previous group's labels/answers as stringified lists
        (e.g. ``"[1, 0, -1]"``).
        """
        df_test_prev = df_test_prev.copy()
        answered_correctly = df_test_prev.iloc[0][
            "prior_group_answers_correct"]
        user_answer = df_test_prev.iloc[0]["prior_group_responses"]
        # Parse the stringified lists into int lists.
        answered_correctly = \
            [int(x) for x in answered_correctly.replace("[", "").replace("'", "").replace("]", "").replace(" ", "").split(",")]
        user_answer = \
            [int(x) for x in user_answer.replace("[", "").replace("'", "").replace("]", "").replace(" ", "").split(",")]
        df_test_prev["answered_correctly"] = answered_correctly
        df_test_prev["user_answer"] = user_answer
        # -1 marks unanswerable rows (lectures): hide them as NaN for the
        # first manager's fit, then restore -1 for the transformer manager.
        df_test_prev["answered_correctly"] = df_test_prev[
            "answered_correctly"].replace(-1, np.nan)
        df_test_prev["prior_question_had_explanation"] = df_test_prev[
            "prior_question_had_explanation"].fillna(-1).astype("int8")
        self.feature_factory_manager.fit(df_test_prev)
        df_test_prev["answered_correctly"] = df_test_prev[
            "answered_correctly"].replace(np.nan, -1)
        self.feature_factory_manager_for_transformer.fit(df_test_prev)

    def predict(self, df_test):
        """Predict correctness probabilities.

        Pass ``df_test`` as-is.
        return: np.array(batch_size)
        """
        # Attach question metadata to question rows and lecture metadata
        # to lecture rows, then sort_index() restores the original order.
        w_df1 = pd.merge(df_test[df_test["content_type_id"] == 0],
                         self.df_question,
                         how="left",
                         left_on="content_id",
                         right_on="question_id")
        w_df2 = pd.merge(df_test[df_test["content_type_id"] == 1],
                         self.df_lecture,
                         how="left",
                         left_on="content_id",
                         right_on="lecture_id")
        df_test = pd.concat([w_df1, w_df2]).sort_values(
            ["user_id", "timestamp"]).sort_index()
        df_test["tag"] = df_test["tag"].fillna(-1)
        df_test["correct_answer"] = df_test["correct_answer"].fillna(-1)
        df_test["bundle_id"] = df_test["bundle_id"].fillna(-1)
        # Cap task_container_id at 300 (values >= 300 all map to 300).
        df_test["task_container_id_bin300"] = [
            x if x < 300 else 300
            for x in df_test["task_container_id"].values
        ]
        df_test["prior_question_had_explanation"] = df_test[
            "prior_question_had_explanation"].astype("float16").fillna(
                -1).astype("int8")
        df_test = self.feature_factory_manager.partial_predict(df_test)
        # Only question rows (content_type_id == 0) are scored.
        group = self.feature_factory_manager_for_transformer.partial_predict(
            df_test[df_test["content_type_id"] == 0])
        dataset_val = SAKTDataset(group,
                                  13939,
                                  predict_mode=True,
                                  max_seq=self.params["max_seq"])
        dataloader_val = DataLoader(dataset_val,
                                    batch_size=1024,
                                    shuffle=False,
                                    num_workers=1)
        predicts = []
        with torch.no_grad():
            for item in dataloader_val:
                x = item["x"].to(device).long()
                target_id = item["target_id"].to(device).long()
                part = item["part"].to(device).long()
                # `label` is fetched but not used below (predict mode).
                label = item["label"].to(device).float()
                elapsed_time = item["elapsed_time"].to(device).long()
                duration_previous_content = item[
                    "duration_previous_content"].to(device).long()
                prior_question_had_explanation = item["prior_q"].to(
                    device).long()
                user_answer = item["user_answer"].to(device).long()
                rate_diff = item["rate_diff"].to(device).float()
                container_id = item["container_id"].to(device).long()
                prev_ans_idx = item["previous_answer_index_content_id"].to(
                    device).long()
                prev_answer_content_id = item["previous_answer_content_id"].to(
                    device).long()
                output = self.model(x, target_id, part, elapsed_time,
                                    duration_previous_content,
                                    prior_question_had_explanation,
                                    user_answer, rate_diff, container_id,
                                    prev_ans_idx, prev_answer_content_id)
                # The last sequence position corresponds to the current
                # question; sigmoid converts logits to probabilities.
                predicts.extend(torch.nn.Sigmoid()(
                    output[:, -1]).view(-1).data.cpu().numpy().tolist())
        return np.array(predicts)
def run(debug, model_dir, update_record, kaggle=False):
    """Streaming inference loop against the Riiid test-time API.

    Iterates ``env.iter_test()``; for each incoming batch it back-fills
    the previous group's labels, periodically re-fits the feature-factory
    managers once more than ``update_record`` rows have accumulated
    (every batch when ``debug``), scores question rows with the SAKT
    model and submits via ``env.predict``.

    :param debug: when truthy, forces a re-fit on every iteration.
    :param model_dir: directory with params/weights/pickled managers.
    :param update_record: row-count threshold that triggers a re-fit.
    :param kaggle: unused in this body — TODO confirm it can be dropped.
    """
    # environment
    env = riiideducation.make_env()
    df_question = pd.read_csv("../input/riiid-test-answer-prediction/questions.csv",
                              dtype={"bundle_id": "int32",
                                     "question_id": "int32",
                                     "correct_answer": "int8",
                                     "part": "int8"})
    df_lecture = pd.read_csv("../input/riiid-test-answer-prediction/lectures.csv",
                             dtype={"lecture_id": "int32",
                                    "tag": "int16",
                                    "part": "int8"})
    # params
    with open(f"{model_dir}/transformer_param.json", "r") as f:
        params = json.load(f)

    # model loading
    model_path = f"{model_dir}/transformers.pth"
    model = SAKTModel(13938, embed_dim=params["embed_dim"],
                      max_seq=params["max_seq"], cont_emb=8)
    model.load_state_dict(torch.load(model_path))
    model.to(device)
    model.eval()

    # load feature_factory_manager
    logger = get_logger()
    ff_manager_path_for_transformer = f"{model_dir}/feature_factory_manager_for_transformer.pickle"
    with open(ff_manager_path_for_transformer, "rb") as f:
        feature_factory_manager_for_transformer = pickle.load(f)
    feature_factory_manager_for_transformer.logger = logger
    ff_manager_path = f"{model_dir}/feature_factory_manager.pickle"
    with open(ff_manager_path, "rb") as f:
        feature_factory_manager = pickle.load(f)
    feature_factory_manager.logger = logger

    iter_test = env.iter_test()
    df_test_prev = []          # processed batches awaiting a fit() pass
    df_test_prev_rows = 0      # row count accumulated in df_test_prev
    answered_correctlies = []  # labels revealed one batch later by the API
    user_answers = []
    i = 0
    for (df_test, df_sample_prediction) in iter_test:
        i += 1
        # NOTE(review): message has a stray "[" — kept byte-identical.
        logger.info(f"[iteration {i}: data_length: {len(df_test)}")

        # Update state with the previous batch's now-revealed labels.
        if df_test_prev_rows > 0:  # skip on the very first iteration
            # Labels arrive as stringified lists in the first row.
            answered_correctly = df_test.iloc[0]["prior_group_answers_correct"]
            user_answer = df_test.iloc[0]["prior_group_responses"]
            answered_correctlies.extend([int(x) for x in answered_correctly.replace("[", "").replace("'", "").replace("]", "").replace(" ", "").split(",")])
            user_answers.extend([int(x) for x in user_answer.replace("[", "").replace("'", "").replace("]", "").replace(" ", "").split(",")])
        if debug:
            update_record = 1
        if df_test_prev_rows > update_record:
            logger.info("------ fitting ------")
            logger.info("concat df")
            df_test_prev = pd.concat(df_test_prev)
            df_test_prev["answered_correctly"] = answered_correctlies
            df_test_prev["user_answer"] = user_answers
            # df_test_prev = df_test_prev.drop(prior_columns, axis=1)
            # df_test_prev = df_test_prev[df_test_prev["answered_correctly"] != -1]
            # -1 marks lecture rows: hide as NaN for the first manager's
            # fit, then restore -1 for the transformer manager below.
            df_test_prev["answered_correctly"] = df_test_prev["answered_correctly"].replace(-1, np.nan)
            df_test_prev["prior_question_had_explanation"] = df_test_prev["prior_question_had_explanation"].fillna(-1).astype("int8")
            logger.info("fit data")
            feature_factory_manager.fit(df_test_prev)
            df_test_prev["answered_correctly"] = df_test_prev["answered_correctly"].replace(np.nan, -1)
            feature_factory_manager_for_transformer.fit(df_test_prev)
            # Reset the accumulators after fitting.
            df_test_prev = []
            df_test_prev_rows = 0
            answered_correctlies = []
            user_answers = []

        # Fetch & score the current batch.
        # logger.info(f"[time: {int(time.time() - t)}dataload")
        logger.info(f"------ question&lecture merge ------")
        w_df1 = pd.merge(df_test[df_test["content_type_id"] == 0], df_question,
                         how="left", left_on="content_id", right_on="question_id")
        w_df2 = pd.merge(df_test[df_test["content_type_id"] == 1], df_lecture,
                         how="left", left_on="content_id", right_on="lecture_id")
        # sort_index() restores the API's original row order.
        df_test = pd.concat([w_df1, w_df2]).sort_values(["user_id", "timestamp"]).sort_index()
        df_test["tag"] = df_test["tag"].fillna(-1)
        df_test["correct_answer"] = df_test["correct_answer"].fillna(-1)
        df_test["bundle_id"] = df_test["bundle_id"].fillna(-1)
        # Cap task_container_id at 300 (values >= 300 all map to 300).
        df_test["task_container_id_bin300"] = [x if x < 300 else 300 for x in df_test["task_container_id"].values]

        logger.info(f"------ transform ------ ")
        df_test["prior_question_had_explanation"] = df_test["prior_question_had_explanation"].astype("float16").fillna(-1).astype("int8")
        df_test = feature_factory_manager.partial_predict(df_test)
        # Only question rows (content_type_id == 0) are scored.
        group = feature_factory_manager_for_transformer.partial_predict(df_test[df_test["content_type_id"] == 0])

        logger.info(f"------ predict ------")
        dataset_val = SAKTDataset(group, 13939,
                                  predict_mode=True, max_seq=params["max_seq"])
        dataloader_val = DataLoader(dataset_val, batch_size=1024, shuffle=False, num_workers=1)
        predicts = []
        with torch.no_grad():
            for item in dataloader_val:
                x = item["x"].to(device).long()
                target_id = item["target_id"].to(device).long()
                part = item["part"].to(device).long()
                # `label` is fetched but not used below (predict mode).
                label = item["label"].to(device).float()
                elapsed_time = item["elapsed_time"].to(device).long()
                duration_previous_content = item["duration_previous_content"].to(device).long()
                prior_question_had_explanation = item["prior_q"].to(device).long()
                user_answer = item["user_answer"].to(device).long()
                rate_diff = item["rate_diff"].to(device).float()
                container_id = item["container_id"].to(device).long()
                prev_ans_idx = item["previous_answer_index_content_id"].to(device).long()
                prev_answer_content_id = item["previous_answer_content_id"].to(device).long()
                output = model(x, target_id, part, elapsed_time, duration_previous_content,
                               prior_question_had_explanation, user_answer, rate_diff,
                               container_id, prev_ans_idx, prev_answer_content_id)
                # Last sequence position = current question; sigmoid
                # converts logits to probabilities.
                predicts.extend(torch.nn.Sigmoid()(output[:, -1]).view(-1).data.cpu().numpy().tolist())

        logger.info("------ other ------")
        df_sample_prediction = df_test[df_test["content_type_id"] == 0][["row_id"]]
        df_sample_prediction["answered_correctly"] = predicts
        env.predict(df_sample_prediction)
        df_test_prev.append(df_test)
        df_test_prev_rows += len(df_test)
        if i < 5:
            # Dump the first few processed batches for offline inspection.
            df_test.to_csv(f"{i}.csv")
        if i == 3:
            # After 3 iterations, swap in a no-op logger so the rest of
            # the (long) loop runs without logging overhead.
            class EmptyLogger:
                def __init__(self):
                    pass

                def info(self, s):
                    pass

            logger = EmptyLogger()
class KurupicalModel:
    """SAKT inference wrapper (part_dict variant).

    Unlike the merge-based variant, ``predict`` looks parts up in a
    precomputed ``(content_id, content_type_id) -> part`` dict instead of
    merging the metadata frames, and returns the preprocessed frame
    alongside the predictions.
    """

    def __init__(self, model_dir: str, verbose: bool = False):
        """Load metadata, hyper-parameters, weights and feature managers.

        :param model_dir: directory holding transformer_param.json,
            transformers.pth and the two *.pickle managers.
        :param verbose: use the real logger when True, else a no-op one.
        """
        if verbose:
            self.logger = get_logger()
        else:
            self.logger = EmptyLogger()
        self.df_question = pd.read_csv(
            "../input/riiid-test-answer-prediction/questions.csv",
            dtype={
                "bundle_id": "int32",
                "question_id": "int32",
                "correct_answer": "int8",
                "part": "int8"
            })
        self.df_lecture = pd.read_csv(
            "../input/riiid-test-answer-prediction/lectures.csv",
            dtype={
                "lecture_id": "int32",
                "tag": "int16",
                "part": "int8"
            })
        self.part_dict = {}  # key: (content_id, content_type_id), value: part
        # content_type_id 0 = question, 1 = lecture (per the dataset layout).
        for x in self.df_question[["question_id", "part"]].values:
            question_id = x[0]
            part = x[1]
            self.part_dict[(question_id, 0)] = part
        for x in self.df_lecture[["lecture_id", "part"]].values:
            lecture_id = x[0]
            part = x[1]
            self.part_dict[(lecture_id, 1)] = part
        # params
        with open(f"{model_dir}/transformer_param.json", "r") as f:
            self.params = json.load(f)
        # model loading
        model_path = f"{model_dir}/transformers.pth"
        self.model = SAKTModel(13938,
                               embed_dim=self.params["embed_dim"],
                               max_seq=self.params["max_seq"],
                               cont_emb=8)
        self.model.load_state_dict(torch.load(model_path))
        self.model.to(device)
        self.model.eval()
        # load feature_factory_manager
        ff_manager_path_for_transformer = f"{model_dir}/feature_factory_manager_for_transformer.pickle"
        with open(ff_manager_path_for_transformer, "rb") as f:
            self.feature_factory_manager_for_transformer = pickle.load(f)
        self.feature_factory_manager_for_transformer.logger = self.logger
        ff_manager_path = f"{model_dir}/feature_factory_manager.pickle"
        with open(ff_manager_path, "rb") as f:
            self.feature_factory_manager = pickle.load(f)
        self.feature_factory_manager.logger = self.logger

    def update(self, df_test_prev, df_test):
        """Update the feature dictionaries.

        Pass the previous batch (``df_test_prev``) plus the current batch
        (``df_test``), whose first row carries the previous group's labels
        and responses as stringified lists (e.g. ``"[1, 0, -1]"``).
        """
        df_test_prev = df_test_prev.copy()
        answered_correctly = df_test.iloc[0]["prior_group_answers_correct"]
        user_answer = df_test.iloc[0]["prior_group_responses"]
        # Parse the stringified lists into int lists.
        answered_correctly = \
            [int(x) for x in answered_correctly.replace("[", "").replace("'", "").replace("]", "").replace(" ", "").split(",")]
        user_answer = \
            [int(x) for x in user_answer.replace("[", "").replace("'", "").replace("]", "").replace(" ", "").split(",")]
        df_test_prev["answered_correctly"] = answered_correctly
        df_test_prev["user_answer"] = user_answer
        # -1 marks lecture rows: hide as NaN for the first manager's fit,
        # then restore -1 for the transformer manager.
        df_test_prev["answered_correctly"] = df_test_prev[
            "answered_correctly"].replace(-1, np.nan)
        df_test_prev["prior_question_had_explanation"] = df_test_prev[
            "prior_question_had_explanation"].fillna(-1).astype("int8")
        self.feature_factory_manager.fit(df_test_prev)
        df_test_prev["answered_correctly"] = df_test_prev[
            "answered_correctly"].replace(np.nan, -1)
        self.feature_factory_manager_for_transformer.fit(df_test_prev)

    def predict(self, df_test):
        """Predict correctness probabilities.

        Pass ``df_test`` as-is.
        return: (np.array(batch_size), the preprocessed df_test)
        """
        self.logger.info("------------ start! ------------")
        df_test = df_test.copy()
        self.logger.info("preprocess")
        # Look up each row's part from the precomputed dict.
        df_test["part"] = [
            self.part_dict[(x[0], x[1])]
            for x in df_test[["content_id", "content_type_id"]].values
        ]
        # Cap task_container_id at 300 (values >= 300 all map to 300).
        df_test["task_container_id_bin300"] = [
            x if x < 300 else 300
            for x in df_test["task_container_id"].values
        ]
        df_test["prior_question_had_explanation"] = df_test[
            "prior_question_had_explanation"].astype("float16").fillna(
                -1).astype("int8")
        self.logger.info("feature_factory_manager.partial_predict")
        df_test = self.feature_factory_manager.partial_predict(df_test)
        self.logger.info(
            "feature_factory_manager_for_transformer.partial_predict")
        # Only question rows (content_type_id == 0) are scored.
        group = self.feature_factory_manager_for_transformer.partial_predict(
            df_test[df_test["content_type_id"] == 0])
        self.logger.info("make_dataset")
        dataset_val = SAKTDataset(group,
                                  13939,
                                  predict_mode=True,
                                  max_seq=self.params["max_seq"])
        self.logger.info("make_dataloader")
        dataloader_val = DataLoader(dataset_val,
                                    batch_size=1024,
                                    shuffle=False,
                                    num_workers=1)
        predicts = []
        self.logger.info("no_grad_setting")
        with torch.no_grad():
            self.logger.info("item")
            for item in dataloader_val:
                self.logger.info("data_preparing")
                x = item["x"].to(device).long()
                target_id = item["target_id"].to(device).long()
                part = item["part"].to(device).long()
                # `label` is fetched but not used below (predict mode).
                label = item["label"].to(device).float()
                elapsed_time = item["elapsed_time"].to(device).long()
                duration_previous_content = item[
                    "duration_previous_content"].to(device).long()
                prior_question_had_explanation = item["prior_q"].to(
                    device).long()
                user_answer = item["user_answer"].to(device).long()
                rate_diff = item["rate_diff"].to(device).float()
                container_id = item["container_id"].to(device).long()
                prev_ans_idx = item["previous_answer_index_content_id"].to(
                    device).long()
                prev_answer_content_id = item["previous_answer_content_id"].to(
                    device).long()
                self.logger.info("predict")
                output = self.model(x, target_id, part, elapsed_time,
                                    duration_previous_content,
                                    prior_question_had_explanation,
                                    user_answer, rate_diff, container_id,
                                    prev_ans_idx, prev_answer_content_id)
                self.logger.info("postprocessing")
                # Last sequence position = current question; sigmoid
                # converts logits to probabilities.
                predicts.extend(torch.nn.Sigmoid()(
                    output[:, -1]).view(-1).data.cpu().numpy().tolist())
        return np.array(predicts), df_test