Esempio n. 1
0
def inference(TARGET, FEATURES, prior_question_elapsed_time_mean, features_dicts, train_mean_dict, scaler):
    """Stream the Riiid test API and submit Keras-model predictions.

    Each iteration first folds the previous batch's revealed labels into
    the online feature dictionaries (``update_features``), then builds
    the feature matrix for the current batch, scales it and predicts.

    Parameters
    ----------
    TARGET : str
        Name of the label/prediction column.
    FEATURES : list[str]
        Feature column names fed to the network, in training order.
    prior_question_elapsed_time_mean : float
        Fill value for missing 'prior_question_elapsed_time'.
    features_dicts : dict
        Online per-user / per-question counters and sums, mutated in place
        by ``update_features`` / ``add_features``.
    train_mean_dict : dict
        Per-feature training means used to fill remaining NaNs.
    scaler : object
        Fitted scaler exposing ``transform``.
    """
    import ast  # stdlib; safe parsing of the API's stringified label list

    net = keras.models.load_model('model.h5', compile=False)

    # Unpack the online feature-state dictionaries.
    answered_correctly_u_count = features_dicts['answered_correctly_u_count']
    answered_correctly_u_sum = features_dicts['answered_correctly_u_sum']
    elapsed_time_u_sum = features_dicts['elapsed_time_u_sum']
    explanation_u_sum = features_dicts['explanation_u_sum']
    answered_correctly_q_count = features_dicts['answered_correctly_q_count']
    answered_correctly_q_sum = features_dicts['answered_correctly_q_sum']
    elapsed_time_q_sum = features_dicts['elapsed_time_q_sum']
    explanation_q_sum = features_dicts['explanation_q_sum']
    timestamp_u = features_dicts['timestamp_u']
    timestamp_u_incorrect = features_dicts['timestamp_u_incorrect']
    answered_correctly_up_count = features_dicts['answered_correctly_up_count']
    answered_correctly_up_sum = features_dicts['answered_correctly_up_sum']

    # Competition API iterator and submission hook.
    env = riiideducation.make_env()
    iter_test = env.iter_test()
    set_predict = env.predict
    questions_df = pd.read_pickle('questions_df.pkl')
    previous_test_df = None
    for (test_df, sample_prediction_df) in iter_test:
        if previous_test_df is not None:
            # Labels for the previous batch arrive as a stringified list.
            # Parse it with ast.literal_eval (was eval(), which would execute
            # arbitrary expressions from external data), then update state.
            previous_test_df[TARGET] = ast.literal_eval(
                test_df["prior_group_answers_correct"].iloc[0])
            update_features(previous_test_df, answered_correctly_u_sum,
                            answered_correctly_u_count, answered_correctly_q_sum,
                            answered_correctly_q_count, timestamp_u_incorrect,
                            answered_correctly_up_count, answered_correctly_up_sum)
        test_df['prior_question_had_explanation'] = test_df.prior_question_had_explanation.fillna(False).astype('int8')
        test_df['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean, inplace = True)
        # Attach static per-question statistics.
        question_cols = ['question_id','part','tag_1','answered_correctly_q_mean','answered_correctly_q_std','answered_correctly_p_mean','answered_correctly_p_std','answered_correctly_b_mean','answered_correctly_b_std','answered_correctly_tag_1_mean','answered_correctly_tag_1_std']
        test_df = pd.merge(test_df, questions_df[question_cols], left_on = 'content_id', right_on = 'question_id', how = 'left')
        # Keep the full batch (questions AND lectures) for the next update pass.
        previous_test_df = test_df.copy()
        # Only question rows (content_type_id == 0) are scored.
        test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop = True)
        test_df[TARGET] = 0
        test_df = add_features(test_df, answered_correctly_u_count, answered_correctly_u_sum, elapsed_time_u_sum, explanation_u_sum, timestamp_u, timestamp_u_incorrect, answered_correctly_q_count, answered_correctly_q_sum, elapsed_time_q_sum, explanation_q_sum, answered_correctly_up_count, answered_correctly_up_sum, update=False)
        # Fill remaining NaNs with the training means, feature by feature.
        for col in test_df[FEATURES].columns:
            test_df[col].fillna(train_mean_dict[col],inplace=True)
        X_test = test_df[FEATURES].values
        X_test = scaler.transform(X_test)
        test_df[TARGET] = net.predict(X_test)
        set_predict(test_df[['row_id', TARGET]])

    print('Job Done')
Esempio n. 2
0
def run(model_dir,
        verbose=False):
    """Serve predictions with a KurupicalModel over the Riiid test API.

    For every batch, the previous batch (whose labels have just been
    revealed) is fed back into the model before the current batch is
    predicted; only question rows are submitted.
    """
    model = KurupicalModel(model_dir=model_dir, verbose=verbose)
    # competition environment
    env = riiideducation.make_env()

    logger = get_logger()
    prev_batch = None
    for df_test, _ in env.iter_test():
        if verbose:
            logger.info("inference!")

        # Fold the previous batch into the model's online state.
        if prev_batch is not None:
            model.update(prev_batch, df_test)

        predicts, prev_batch = model.predict(df_test)

        # Only question rows (content_type_id == 0) are scored.
        submission = df_test[df_test["content_type_id"] == 0][["row_id"]]
        submission["answered_correctly"] = predicts
        env.predict(submission)
Esempio n. 3
0
def run(debug, model_dir, kaggle=False):
    """Fit feature factories on the training splits, then stream the Riiid
    test API: revealed labels are buffered and periodically folded back
    into the online feature state; predictions come from an ensemble of
    pickled boosters (mean of their scores).

    Parameters
    ----------
    debug : bool
        If True, truncate each training split to 1000 rows and re-fit the
        online features after every test batch.
    model_dir : str
        Directory containing pickled '*model*' files (boosters exposing
        ``feature_name`` and ``predict``).
    kaggle : bool
        Selects the Kaggle vs local path for the training split pickles.
    """

    if kaggle:
        files_dir = "/kaggle/input/riiid-split10/*.pickle"
    else:
        files_dir = "../input/riiid-test-answer-prediction/split10_base/*.pickle"

    logger = get_logger()
    # environment
    env = riiideducation.make_env()

    # Static question/lecture metadata, loaded with compact dtypes.
    df_question = pd.read_csv(
        "../input/riiid-test-answer-prediction/questions.csv",
        dtype={
            "bundle_id": "int32",
            "question_id": "int32",
            "correct_answer": "int8",
            "part": "int8"
        })
    df_lecture = pd.read_csv(
        "../input/riiid-test-answer-prediction/lectures.csv",
        dtype={
            "lecture_id": "int32",
            "tag": "int16",
            "part": "int8"
        })
    # model loading
    models = []
    for model_path in glob.glob(f"{model_dir}/*model*.pickle"):
        with open(model_path, "rb") as f:
            models.append(pickle.load(f))

    # data preprocessing: declare count/target encoders for single columns
    # and column pairs before fitting on the training splits.
    logger = get_logger()
    feature_factory_dict = {}
    feature_factory_dict["tags"] = {"TagsSeparator": TagsSeparator()}
    for column in [
            "content_id", "user_id", "content_type_id",
            "prior_question_had_explanation", "tags1", "tags2", "tags3",
            "tags4", "tags5", "tags6", ("user_id", "content_type_id"),
        ("user_id", "prior_question_had_explanation")
    ]:
        is_partial_fit = column == "content_id"
        # NOTE(review): is_onebyone is computed but never used below —
        # confirm whether this is dead code.
        is_onebyone = "content_id" in column
        if type(column) == str:
            feature_factory_dict[column] = {
                "CountEncoder":
                CountEncoder(column=column),
                "TargetEncoder":
                TargetEncoder(column=column, is_partial_fit=is_partial_fit)
            }
        else:
            # Column pairs: encoders receive the pair as a list.
            feature_factory_dict[column] = {
                "CountEncoder":
                CountEncoder(column=list(column)),
                "TargetEncoder":
                TargetEncoder(column=list(column),
                              is_partial_fit=is_partial_fit)
            }

    # These columns get count encoding only (no target encoding).
    for column in [
            "part", ("user_id", "tag"), ("user_id", "part"),
        ("content_type_id", "part"), ("user_id", "content_id")
    ]:
        if type(column) == str:
            feature_factory_dict[column] = {
                "CountEncoder": CountEncoder(column=column)
            }
        else:
            feature_factory_dict[column] = {
                "CountEncoder": CountEncoder(column=list(column))
            }

    # Per-user / per-content aggregates and time-delta features.
    feature_factory_dict["user_id"][
        "MeanAggregatorTimestamp"] = MeanAggregator(column="user_id",
                                                    agg_column="timestamp",
                                                    remove_now=False)
    feature_factory_dict["user_id"][
        "MeanAggregatorPriorQuestionElapsedTime"] = MeanAggregator(
            column="user_id",
            agg_column="prior_question_elapsed_time",
            remove_now=True)
    feature_factory_dict["user_id"]["ShiftDiffEncoder"] = ShiftDiffEncoder(
        groupby="user_id", column="timestamp")
    feature_factory_dict["content_id"][
        "MeanAggregatorPriorQuestionElapsedTime"] = MeanAggregator(
            column="content_id",
            agg_column="prior_question_elapsed_time",
            remove_now=True)

    feature_factory_manager = FeatureFactoryManager(
        feature_factory_dict=feature_factory_dict, logger=logger)

    # Fit the factories split by split: questions and lectures are merged
    # with their metadata and rows ordered per user by timestamp.
    for model_id, fname in enumerate(glob.glob(files_dir)):
        logger.info(f"loading... {fname}")
        df = pd.read_pickle(fname)
        df["answered_correctly"] = df["answered_correctly"].replace(-1, np.nan)
        df["prior_question_had_explanation"] = df[
            "prior_question_had_explanation"].fillna(-1).astype("int8")
        if debug:
            df = df.head(1000)
        df = pd.concat([
            pd.merge(df[df["content_type_id"] == 0],
                     df_question,
                     how="left",
                     left_on="content_id",
                     right_on="question_id"),
            pd.merge(df[df["content_type_id"] == 1],
                     df_lecture,
                     how="left",
                     left_on="content_id",
                     right_on="lecture_id")
        ]).sort_values(["user_id", "timestamp"])
        feature_factory_manager.fit(df, is_first_fit=True)

    iter_test = env.iter_test()
    df_test_prev = pd.DataFrame()
    df_test_prev1 = pd.DataFrame()  # NOTE(review): never used afterwards
    answered_correctlies = []
    user_answers = []
    i = 0
    t = time.time()
    for (df_test, df_sample_prediction) in iter_test:
        i += 1
        logger.info(
            f"[time: {int(time.time() - t)}iteration {i}: data_length: {len(df_test)}"
        )
        # Accumulate the previous batch's revealed labels/answers.
        if len(df_test_prev) > 0:  # skipped on the very first batch only
            answered_correctly = df_test.iloc[0]["prior_group_answers_correct"]
            user_answer = df_test.iloc[0]["prior_group_responses"]
            # The API delivers these as stringified lists; strip punctuation
            # and parse each element as int.
            answered_correctlies.extend([
                int(x) for x in answered_correctly.replace("[", "").replace(
                    "'", "").replace("]", "").replace(" ", "").split(",")
            ])
            user_answers.extend([
                int(x)
                for x in user_answer.replace("[", "").replace("'", "").replace(
                    "]", "").replace(" ", "").split(",")
            ])

        if debug:
            update_record = 1
        else:
            update_record = 150
        # Once enough labeled rows are buffered, re-fit the online features
        # and clear the buffers.
        if len(df_test_prev) > update_record:
            df_test_prev["answered_correctly"] = answered_correctlies
            df_test_prev["user_answer"] = user_answers
            # df_test_prev = df_test_prev.drop(prior_columns, axis=1)
            df_test_prev = df_test_prev[
                df_test_prev["answered_correctly"] != -1]
            df_test_prev["answered_correctly"] = df_test_prev[
                "answered_correctly"].replace(-1, np.nan)
            df_test_prev["prior_question_had_explanation"] = df_test_prev[
                "prior_question_had_explanation"].fillna(-1).astype("int8")

            feature_factory_manager.fit(df_test_prev)

            df_test_prev = pd.DataFrame()
            answered_correctlies = []
            user_answers = []
        # Fetch & compute features for the current batch.

        # logger.info(f"[time: {int(time.time() - t)}dataload")
        logger.info(f"merge... ")
        w_df1 = pd.merge(df_test[df_test["content_type_id"] == 0],
                         df_question,
                         how="left",
                         left_on="content_id",
                         right_on="question_id")
        w_df2 = pd.merge(df_test[df_test["content_type_id"] == 1],
                         df_lecture,
                         how="left",
                         left_on="content_id",
                         right_on="lecture_id")
        df_test = pd.concat([w_df1, w_df2])
        df_test["tag"] = df_test["tag"].fillna(-1)
        df_test["correct_answer"] = df_test["correct_answer"].fillna(-1)
        df_test["bundle_id"] = df_test["bundle_id"].fillna(-1)

        logger.info(f"transform... ")
        df_test["prior_question_had_explanation"] = df_test[
            "prior_question_had_explanation"].astype("float16").fillna(
                -1).astype("int8")

        df = feature_factory_manager.partial_predict(df_test)
        df.columns = [x.replace(" ", "_") for x in df.columns]
        logger.info(f"other... ")

        # predict: mean of all boosters' scores.
        predicts = []
        cols = models[0].feature_name()
        for model in models:
            predicts.append(model.predict(df[cols]))

        df["answered_correctly"] = np.array(predicts).transpose().mean(axis=1)
        df_sample_prediction = pd.merge(df_sample_prediction[["row_id"]],
                                        df[["row_id", "answered_correctly"]],
                                        how="inner")
        env.predict(df_sample_prediction)
        # Buffer this batch (model features + ids) for the next re-fit.
        df_test_prev = df_test_prev.append(df[cols + ["user_id", "tags"]])
        if debug:
            df_test_prev.to_csv(f"{i}.csv")
Esempio n. 4
0
def run(debug, model_dir, kaggle=False):
    """Stream the Riiid test API with a pre-fitted FeatureFactoryManager
    restored from ``model_dir``: revealed labels are buffered and
    periodically folded back into the online state; predictions come from
    an ensemble of pickled boosters (mean of their scores).

    Parameters
    ----------
    debug : bool
        If True, re-fit the online features after every test batch.
    model_dir : str
        Directory containing pickled '*model*' boosters and the pickled
        feature_factory_manager.
    kaggle : bool
        Selects the Kaggle vs local path for the training split pickles
        (files_dir is set but not otherwise read in this function).
    """

    if kaggle:
        files_dir = "/kaggle/input/riiid-split10/*.pickle"
    else:
        files_dir = "../input/riiid-test-answer-prediction/split10_base/*.pickle"

    logger = get_logger()
    # environment
    env = riiideducation.make_env()

    # Static question/lecture metadata, loaded with compact dtypes.
    df_question = pd.read_csv(
        "../input/riiid-test-answer-prediction/questions.csv",
        dtype={
            "bundle_id": "int32",
            "question_id": "int32",
            "correct_answer": "int8",
            "part": "int8"
        })
    df_lecture = pd.read_csv(
        "../input/riiid-test-answer-prediction/lectures.csv",
        dtype={
            "lecture_id": "int32",
            "tag": "int16",
            "part": "int8"
        })
    # model loading
    models = []
    for model_path in glob.glob(f"{model_dir}/*model*.pickle"):
        with open(model_path, "rb") as f:
            models.append(pickle.load(f))

    # load feature_factory_manager and re-attach a live logger to the
    # manager and every factory inside it.
    logger = get_logger()
    ff_manager_path = f"{model_dir}/feature_factory_manager.pickle"
    with open(ff_manager_path, "rb") as f:
        feature_factory_manager = pickle.load(f)
    for dicts in feature_factory_manager.feature_factory_dict.values():
        for factory in dicts.values():
            factory.logger = logger
    feature_factory_manager.logger = logger

    iter_test = env.iter_test()
    df_test_prev = pd.DataFrame()
    answered_correctlies = []
    user_answers = []
    i = 0
    t = time.time()
    for (df_test, df_sample_prediction) in iter_test:
        i += 1
        logger.info(
            f"[time: {int(time.time() - t)}iteration {i}: data_length: {len(df_test)}"
        )
        # Accumulate the previous batch's revealed labels/answers.
        if len(df_test_prev) > 0:  # skipped on the very first batch only
            answered_correctly = df_test.iloc[0]["prior_group_answers_correct"]
            user_answer = df_test.iloc[0]["prior_group_responses"]
            # The API delivers these as stringified lists; strip punctuation
            # and parse each element as int.
            answered_correctlies.extend([
                int(x) for x in answered_correctly.replace("[", "").replace(
                    "'", "").replace("]", "").replace(" ", "").split(",")
            ])
            user_answers.extend([
                int(x)
                for x in user_answer.replace("[", "").replace("'", "").replace(
                    "]", "").replace(" ", "").split(",")
            ])

        if debug:
            update_record = 1
        else:
            update_record = 50
        # Once enough labeled rows are buffered, re-fit the online features
        # and clear the buffers.
        if len(df_test_prev) > update_record:
            df_test_prev["answered_correctly"] = answered_correctlies
            df_test_prev["user_answer"] = user_answers
            # df_test_prev = df_test_prev.drop(prior_columns, axis=1)
            df_test_prev = df_test_prev[
                df_test_prev["answered_correctly"] != -1]
            df_test_prev["answered_correctly"] = df_test_prev[
                "answered_correctly"].replace(-1, np.nan)
            df_test_prev["prior_question_had_explanation"] = df_test_prev[
                "prior_question_had_explanation"].fillna(-1).astype("int8")

            feature_factory_manager.fit(df_test_prev)

            df_test_prev = pd.DataFrame()
            answered_correctlies = []
            user_answers = []
        # Fetch & compute features for the current batch.

        # logger.info(f"[time: {int(time.time() - t)}dataload")
        logger.info(f"merge... ")
        w_df1 = pd.merge(df_test[df_test["content_type_id"] == 0],
                         df_question,
                         how="left",
                         left_on="content_id",
                         right_on="question_id")
        w_df2 = pd.merge(df_test[df_test["content_type_id"] == 1],
                         df_lecture,
                         how="left",
                         left_on="content_id",
                         right_on="lecture_id")
        # sort_index after sort_values restores the original row order.
        df_test = pd.concat([w_df1,
                             w_df2]).sort_values(["user_id",
                                                  "timestamp"]).sort_index()
        df_test["tag"] = df_test["tag"].fillna(-1)
        df_test["correct_answer"] = df_test["correct_answer"].fillna(-1)
        df_test["bundle_id"] = df_test["bundle_id"].fillna(-1)

        logger.info(f"transform... ")
        df_test["prior_question_had_explanation"] = df_test[
            "prior_question_had_explanation"].astype("float16").fillna(
                -1).astype("int8")

        df = feature_factory_manager.partial_predict(df_test)
        df.columns = [x.replace(" ", "_") for x in df.columns]
        logger.info(f"other... ")

        # predict: mean of all boosters' scores.
        predicts = []
        cols = models[0].feature_name()
        for model in models:
            predicts.append(model.predict(df[cols]))

        df["answered_correctly"] = np.array(predicts).transpose().mean(axis=1)
        df_sample_prediction = pd.merge(df_sample_prediction[["row_id"]],
                                        df[["row_id", "answered_correctly"]],
                                        how="inner")
        env.predict(df_sample_prediction)
        # Buffer this batch (model features + ids) for the next re-fit.
        df_test_prev = df_test_prev.append(df[cols + ["user_id", "tags"]])
        if i < 5:
            # Dump early batches to CSV (presumably for debugging — confirm).
            df_test_prev.to_csv(f"{i}.csv")
Esempio n. 5
0
# Riiid competition entry script for a SAINT model. The competition
# environment is created before anything else so the test iterator exists.
import riiideducation
env = riiideducation.make_env()
iter_test = env.iter_test()


import sys
import logging
# Make the bundled model package importable from the Kaggle dataset path.
PATH = '/kaggle/input/riiid-saint-model'
sys.path.append(PATH)


from riiid.utils import configure_console_logging, check_versions
from riiid.saint.model import SaintModel


configure_console_logging()
check_versions()

logging.info('Load model')
MODEL_ID = 'saint_20210101_132425'
# Restore model metadata from the dataset, then pull the weights from GCS.
model: SaintModel = SaintModel.load(PATH, MODEL_ID)
model.load_model_from_path('gs://riiid-models/{}_model'.format(MODEL_ID))

# Stream test batches: fold each batch into the model's state, predict,
# and submit. The sample-prediction frame from the iterator is unused.
for test, _ in iter_test:
    test = model.update(test)
    _, predictions = model.predict(test)
    env.predict(predictions)
Esempio n. 6
0
def run(debug, model_dir, update_record, kaggle=False):
    """Stream the Riiid test API with SAKT transformer model(s).

    Revealed labels are buffered and periodically folded back into a
    transformer-specific feature factory; predictions are the sigmoid of
    the model output at the last sequence position for each question row.

    Parameters
    ----------
    debug : bool
        If True, update_record is forced to 1 (re-fit every batch).
    model_dir : str
        Directory with '*transformer*.pth' weights, transformer_param.json
        and the pickled feature factory manager.
    update_record : int
        Number of buffered rows after which the online state is re-fitted.
    kaggle : bool
        Unused in this function — TODO confirm it can be dropped.
    """

    # environment
    env = riiideducation.make_env()

    # Static question/lecture metadata, loaded with compact dtypes.
    df_question = pd.read_csv(
        "../input/riiid-test-answer-prediction/questions.csv",
        dtype={
            "bundle_id": "int32",
            "question_id": "int32",
            "correct_answer": "int8",
            "part": "int8"
        })
    df_lecture = pd.read_csv(
        "../input/riiid-test-answer-prediction/lectures.csv",
        dtype={
            "lecture_id": "int32",
            "tag": "int16",
            "part": "int8"
        })
    # params
    with open(f"{model_dir}/transformer_param.json", "r") as f:
        params = json.load(f)
    # model loading
    models = []
    for model_path in glob.glob(f"{model_dir}/*transformer*.pth"):
        model = SAKTModel(13782,
                          embed_dim=params["embed_dim"],
                          max_seq=params["max_seq"])
        # NOTE(review): this bare torch.load discards its result — the
        # load_state_dict call below already reads the file. Dead load?
        torch.load(model_path)
        model.load_state_dict(torch.load(model_path))
        model.to(device)
        models.append(model)

    # load feature_factory_manager and re-attach a live logger.
    logger = get_logger()
    ff_manager_path_for_transformer = f"{model_dir}/feature_factory_manager_for_transformer.pickle"
    with open(ff_manager_path_for_transformer, "rb") as f:
        feature_factory_manager_for_transformer = pickle.load(f)
    feature_factory_manager_for_transformer.logger = logger

    iter_test = env.iter_test()
    df_test_prev = []
    df_test_prev_rows = 0
    answered_correctlies = []
    user_answers = []
    i = 0
    for (df_test, df_sample_prediction) in iter_test:
        i += 1
        logger.info(f"[iteration {i}: data_length: {len(df_test)}")
        # Accumulate the previous batch's revealed labels/answers.
        if df_test_prev_rows > 0:  # skipped on the very first batch only
            answered_correctly = df_test.iloc[0]["prior_group_answers_correct"]
            user_answer = df_test.iloc[0]["prior_group_responses"]
            # The API delivers these as stringified lists; strip punctuation
            # and parse each element as int.
            answered_correctlies.extend([
                int(x) for x in answered_correctly.replace("[", "").replace(
                    "'", "").replace("]", "").replace(" ", "").split(",")
            ])
            user_answers.extend([
                int(x)
                for x in user_answer.replace("[", "").replace("'", "").replace(
                    "]", "").replace(" ", "").split(",")
            ])

        if debug:
            update_record = 1
        # Once enough labeled rows are buffered, re-fit the online state.
        if df_test_prev_rows > update_record:
            logger.info("------ fitting ------")
            logger.info("concat df")
            df_test_prev = pd.concat(df_test_prev)
            df_test_prev["answered_correctly"] = answered_correctlies
            df_test_prev["user_answer"] = user_answers
            # df_test_prev = df_test_prev.drop(prior_columns, axis=1)
            df_test_prev = df_test_prev[
                df_test_prev["answered_correctly"] != -1]
            df_test_prev["answered_correctly"] = df_test_prev[
                "answered_correctly"].replace(-1, np.nan)
            df_test_prev["prior_question_had_explanation"] = df_test_prev[
                "prior_question_had_explanation"].fillna(-1).astype("int8")

            logger.info("fit data")
            feature_factory_manager_for_transformer.fit(df_test_prev)

            df_test_prev = []
            df_test_prev_rows = 0
            answered_correctlies = []
            user_answers = []
        # Fetch & compute features for the current batch.

        # logger.info(f"[time: {int(time.time() - t)}dataload")
        logger.info(f"------ question&lecture merge ------")
        w_df1 = pd.merge(df_test[df_test["content_type_id"] == 0],
                         df_question,
                         how="left",
                         left_on="content_id",
                         right_on="question_id")
        w_df2 = pd.merge(df_test[df_test["content_type_id"] == 1],
                         df_lecture,
                         how="left",
                         left_on="content_id",
                         right_on="lecture_id")
        # sort_index after sort_values restores the original row order.
        df_test = pd.concat([w_df1,
                             w_df2]).sort_values(["user_id",
                                                  "timestamp"]).sort_index()
        df_test["tag"] = df_test["tag"].fillna(-1)
        df_test["correct_answer"] = df_test["correct_answer"].fillna(-1)
        df_test["bundle_id"] = df_test["bundle_id"].fillna(-1)

        logger.info(f"------ transform ------ ")
        df_test["prior_question_had_explanation"] = df_test[
            "prior_question_had_explanation"].astype("float16").fillna(
                -1).astype("int8")

        df = df_test
        # Only question rows feed the transformer's sequence features.
        group = feature_factory_manager_for_transformer.partial_predict(
            df_test[df_test["content_type_id"] == 0])
        logger.info(f"------ predict ------")

        dataset_val = SAKTDataset(group,
                                  13782,
                                  predict_mode=True,
                                  max_seq=params["max_seq"])
        dataloader_val = DataLoader(dataset_val,
                                    batch_size=1024,
                                    shuffle=False,
                                    num_workers=1)

        predicts = []
        for d in dataloader_val:
            x = d[0].to(device).long()
            target_id = d[1].to(device).long()
            part = d[2].to(device).long()
            label = d[3].to(device).long()

            # NOTE(review): 'model' here is the loop variable left over from
            # model loading (i.e. the LAST model) — the 'models' list is
            # never ensembled. Confirm whether averaging was intended.
            output, atten_weight = model(x, target_id, part)

            # Sigmoid of the last sequence position = P(answered correctly).
            predicts.extend(torch.nn.Sigmoid()(
                output[:, -1]).view(-1).data.cpu().numpy().tolist())

        logger.info("------ other ------")
        df_sample_prediction = df[df["content_type_id"] == 0][["row_id"]]
        df_sample_prediction["answered_correctly"] = predicts
        env.predict(df_sample_prediction)
        # Buffer this batch for the next re-fit.
        df_test_prev.append(df)
        df_test_prev_rows += len(df)
        if i < 5:
            # Dump early batches to CSV (presumably for debugging — confirm).
            df.to_csv(f"{i}.csv")
Esempio n. 7
0
def run(debug, model_dir, kaggle=False, rewrite=False):
    """Riiid inference loop backed by per-user on-disk history.

    Training history is sharded into one directory per user under
    ``data_dir``; at test time each incoming user's history is re-read,
    concatenated with the new rows, re-transformed and scored with an
    ensemble of pickled boosters. Revealed labels from the previous batch
    are written back into the per-user directories.

    Parameters
    ----------
    debug : bool
        If True, only the head of each training split is preprocessed.
    model_dir : str
        Directory containing pickled '*model*' boosters.
    kaggle : bool
        Selects Kaggle vs local paths for the training splits.
    rewrite : bool
        If True, rebuild the per-user history directories from scratch.
    """
    if kaggle:
        files_dir = "/kaggle/input/riiid-split10/*.feather"
    else:
        files_dir = "../input/riiid-test-answer-prediction/split10/*.feather"

    logger = get_logger()
    # environment
    env = riiideducation.make_env()

    # model loading
    models = []
    for model_path in glob.glob(f"{model_dir}/*model*.pickle"):
        with open(model_path, "rb") as f:
            models.append(pickle.load(f))

    # Per-user history shards live here, one sub-directory per user_id.
    data_dir = "../work_csv"
    if rewrite:
        if os.path.isdir(data_dir):
            shutil.rmtree(data_dir)
        os.makedirs(data_dir, exist_ok=True)
        for model_id, fname in enumerate(glob.glob(files_dir)):
            logger.info(f"loading... {fname}")
            df = pd.read_pickle(fname)
            if debug:
                df = df.head(1000)
            df = transform(df)

            for user_id, w_df in tqdm.tqdm(df.groupby("user_id")):
                os.makedirs(f"{data_dir}/{user_id}/", exist_ok=True)
                # NOTE(review): written with to_pickle despite the .feather
                # suffix; the read side below also uses read_pickle, so the
                # pair is consistent — confirm the extension is intentional.
                w_df.to_pickle(f"{data_dir}/{user_id}/original.feather")

    iter_test = env.iter_test()
    df_test_prev = pd.DataFrame()
    i = 0
    t = time.time()
    for (df_test, df_sample_prediction) in iter_test:
        i += 1
        logger.info(
            f"[time: {int(time.time() - t)}iteration {i}: data_length: {len(df_test)}"
        )
        # Write the previous batch back to disk now that labels are revealed.
        if len(df_test_prev) > 0:
            answered_correctly = df_test.iloc[0]["prior_group_answers_correct"]
            user_answer = df_test.iloc[0]["prior_group_responses"]

            # The API delivers these as stringified lists; strip punctuation
            # and parse each element as int.
            df_test_prev["answered_correctly"] = [
                int(x) for x in answered_correctly.replace("[", "").replace(
                    "'", "").replace("]", "").replace(" ", "").split(",")
            ]
            df_test_prev["user_answer"] = [
                int(x)
                for x in user_answer.replace("[", "").replace("'", "").replace(
                    "]", "").replace(" ", "").split(",")
            ]

            for user_id, df_prev in df_test_prev.groupby("user_id"):
                os.makedirs(f"{data_dir}/{user_id}/", exist_ok=True)
                # BUG FIX: the original computed len(f"{data_dir}/{user_id}")
                # — the length of the path STRING — so every batch for a
                # given user produced the same filename and overwrote the
                # previous dump. Count existing files instead so each batch
                # gets a fresh sequential id.
                data_id = len(glob.glob(f"{data_dir}/{user_id}/*"))
                # NOTE(review): these .pickle dumps are never matched by the
                # '*.feather' glob below, so revealed labels are not read
                # back into the history — confirm whether that is intended.
                df_prev.to_pickle(f"{data_dir}/{user_id}/{data_id}.pickle")

        # Score the current batch, user by user.
        dfs = []

        df_nows = []
        for user_id, df_now in df_test.groupby("user_id"):
            fnames = glob.glob(f"{data_dir}/{user_id}/*.feather")
            if len(fnames) > 0:
                # Known user: prepend stored history before transforming.
                read_dfs = [pd.read_pickle(x) for x in fnames]
                df = pd.concat(read_dfs + [df_now]).reset_index(drop=True)
            else:
                # Unknown user: fabricate the label columns so transform()
                # sees a uniform schema.
                df = df_now[:]
                df["user_answer"] = -1
                df["answered_correctly"] = -1
                df = df.astype(data_types_dict)
            df = transform(df)
            cols = models[0].feature_name()
            for col in cols:
                if col not in df.columns:
                    df[col] = -99999  # sentinel for features absent here
            # Keep only the current batch's rows, in model-feature order.
            dfs.append(df[cols].iloc[-len(df_now):].drop("row_id",
                                                         axis=1,
                                                         errors="ignore"))
            df_nows.append(df_now)

        # predict: mean of all boosters' scores.
        df_test_prev = pd.concat(dfs)
        logger.info(f"[time: {int(time.time() - t)}model")
        predicts = []
        for model in models:
            predicts.append(model.predict(df_test_prev))

        df_nows = pd.concat(df_nows)
        df_nows["answered_correctly"] = np.array(predicts).transpose().mean(
            axis=1)
        df_sample_prediction = pd.merge(
            df_sample_prediction[["row_id"]],
            df_nows[["row_id", "answered_correctly"]],
            how="inner")
        env.predict(df_sample_prediction)
Esempio n. 8
0
def run(debug, model_dir, update_record, kaggle=False):
    """Offline rehearsal of the Riiid inference loop.

    Although the competition environment is created, this variant iterates
    over 20-row slices of a local training pickle instead of the API
    iterator, and never calls env.predict — it looks like a local
    simulation/benchmark of the online-fit + predict cycle (TODO confirm).

    Parameters
    ----------
    debug : bool
        If True, update_record is forced to 1 (re-fit every slice).
    model_dir : str
        Directory with pickled '*model*' boosters and the pickled
        feature_factory_manager.
    update_record : int
        Number of buffered rows after which the online state is re-fitted.
    kaggle : bool
        Selects the Kaggle vs local path for the training split pickles.
    """

    if kaggle:
        files_dir = "/kaggle/input/riiid-split10/*.pickle"
    else:
        files_dir = "../input/riiid-test-answer-prediction/split10_base/*.pickle"

    logger = get_logger()
    # environment
    env = riiideducation.make_env()

    # Static question/lecture metadata, loaded with compact dtypes.
    df_question = pd.read_csv(
        "../input/riiid-test-answer-prediction/questions.csv",
        dtype={
            "bundle_id": "int32",
            "question_id": "int32",
            "correct_answer": "int8",
            "part": "int8"
        })
    df_lecture = pd.read_csv(
        "../input/riiid-test-answer-prediction/lectures.csv",
        dtype={
            "lecture_id": "int32",
            "tag": "int16",
            "part": "int8"
        })
    # model loading
    models = []
    for model_path in glob.glob(f"{model_dir}/*model*.pickle"):
        with open(model_path, "rb") as f:
            models.append(pickle.load(f))

    # load feature_factory_manager and re-attach a live logger to the
    # manager and every factory inside it.
    logger = get_logger()
    ff_manager_path = f"{model_dir}/feature_factory_manager.pickle"
    with open(ff_manager_path, "rb") as f:
        feature_factory_manager = pickle.load(f)
    for dicts in feature_factory_manager.feature_factory_dict.values():
        for factory in dicts.values():
            factory.logger = logger
    feature_factory_manager.logger = logger

    # NOTE(review): iter_test is created but never consumed — the loop
    # below iterates local data instead.
    iter_test = env.iter_test()
    df_test_prev = []
    df_test_prev_rows = 0
    answered_correctlies = []
    user_answers = []
    i = 0

    df_all = pd.read_pickle(f"{os.path.dirname(files_dir)}/train_0.pickle").head(1000000)\
        #.drop(

    #["question_id", "bundle_id", "correct_answer", "part", "lecture_id", "tag", "part", "type_of"], axis=1
    #)
    # NOTE(review): idx is unused; the slice below relies on i, which is
    # incremented in lock-step with idx, so the behavior is equivalent.
    for idx in range(len(df_all) // 20):
        df_test = df_all.iloc[i * 20:(i + 1) * 20]
        i += 1
        logger.info(f"[iteration {i}: data_length: {len(df_test)}")
        # Previous-batch label handling is disabled in this offline variant
        # (the local slices already carry their labels).
        """
        if df_test_prev_rows > 0: # 初回のみパスするためのif
            answered_correctly = df_test.iloc[0]["prior_group_answers_correct"]
            user_answer = df_test.iloc[0]["prior_group_responses"]
            answered_correctlies.extend([int(x) for x in answered_correctly.replace("[", "").replace("'", "").replace("]", "").replace(" ", "").split(",")])
            user_answers.extend([int(x) for x in user_answer.replace("[", "").replace("'", "").replace("]", "").replace(" ", "").split(",")])
        """
        if debug:
            update_record = 1
        # Once enough rows are buffered, re-fit the online state.
        if df_test_prev_rows > update_record:
            logger.info("------ fitting ------")
            logger.info("concat df")
            df_test_prev = pd.concat(df_test_prev)
            # df_test_prev["answered_correctly"] = answered_correctlies
            # df_test_prev["user_answer"] = user_answers
            # df_test_prev = df_test_prev.drop(prior_columns, axis=1)
            df_test_prev = df_test_prev[
                df_test_prev["answered_correctly"] != -1]
            df_test_prev["answered_correctly"] = df_test_prev[
                "answered_correctly"].replace(-1, np.nan)
            df_test_prev["prior_question_had_explanation"] = df_test_prev[
                "prior_question_had_explanation"].fillna(-1).astype("int8")

            logger.info("fit data")
            feature_factory_manager.fit(df_test_prev)

            df_test_prev = []
            df_test_prev_rows = 0
            answered_correctlies = []
            user_answers = []
        # Fetch & compute features for the current batch.

        # logger.info(f"[time: {int(time.time() - t)}dataload")
        logger.info(f"------ question&lecture merge ------")
        w_df1 = pd.merge(df_test[df_test["content_type_id"] == 0],
                         df_question,
                         how="left",
                         left_on="content_id",
                         right_on="question_id")
        w_df2 = pd.merge(df_test[df_test["content_type_id"] == 1],
                         df_lecture,
                         how="left",
                         left_on="content_id",
                         right_on="lecture_id")
        # sort_index after sort_values restores the original row order.
        df_test = pd.concat([w_df1,
                             w_df2]).sort_values(["user_id",
                                                  "timestamp"]).sort_index()
        df_test["tag"] = df_test["tag"].fillna(-1)
        df_test["correct_answer"] = df_test["correct_answer"].fillna(-1)
        df_test["bundle_id"] = df_test["bundle_id"].fillna(-1)
        logger.info(f"------ transform ------ ")
        df_test["prior_question_had_explanation"] = df_test[
            "prior_question_had_explanation"].astype("float16").fillna(
                -1).astype("int8")

        df = feature_factory_manager.partial_predict(df_test)
        df.columns = [x.replace(" ", "_") for x in df.columns]
        logger.info(f"------ predict ------")

        # predict: mean of all boosters' scores.
        predicts = []
        cols = models[0].feature_name()
        w_df = df[cols]
        for model in models:
            predicts.append(model.predict(w_df))

        logger.info("------ other ------")
        df["answered_correctly"] = np.array(predicts).transpose().mean(axis=1)
        # NOTE(review): df_sample_prediction is built but never submitted
        # (env.predict is not called) — consistent with a local simulation.
        df_sample_prediction = df[df["content_type_id"] == 0][[
            "answered_correctly"
        ]]
        # Buffer this batch for the next re-fit.
        df_test_prev.append(df[cols +
                               ["user_id", "tags", "answered_correctly"]])
        df_test_prev_rows += len(df)
        if i < 5:
            # Dump early batches to CSV (presumably for debugging — confirm).
            df.to_csv(f"{i}.csv")
def inference(TARGET, FEATURES, model, prior_question_elapsed_time_mean,
              features_dicts, lectures_df, q_taglist_df):
    """Serve predictions over the Riiid competition test API.

    For each test batch: fold the answers revealed for the *previous* batch
    into the running user/question aggregate dictionaries, merge question
    metadata, build lecture and aggregate features for the current batch,
    predict with ``model`` and submit via ``env.predict``.

    Args:
        TARGET: Name of the target column (e.g. 'answered_correctly').
        FEATURES: Feature column names passed to ``model.predict``.
        model: Trained estimator exposing ``predict(DataFrame)``.
        prior_question_elapsed_time_mean: Fill value for missing
            ``prior_question_elapsed_time``.
        features_dicts: Dict of running aggregate dictionaries; mutated in
            place by ``update_features``/``add_features`` across batches.
        lectures_df: Lecture metadata for ``add_lectures_feats``.
        q_taglist_df: Question tag-list metadata for ``add_lectures_feats``.
    """
    import ast  # local import: safe parsing of the API's stringified answer lists

    # Unpack the running aggregates once; the dicts are shared and updated
    # in place between iterations.
    answered_correctly_u_count = features_dicts['answered_correctly_u_count']
    answered_correctly_u_sum = features_dicts['answered_correctly_u_sum']
    elapsed_time_u_sum = features_dicts['elapsed_time_u_sum']
    explanation_u_sum = features_dicts['explanation_u_sum']
    answered_correctly_q_count = features_dicts['answered_correctly_q_count']

    elapsed_time_q_sum = features_dicts['elapsed_time_q_sum']
    explanation_q_sum = features_dicts['explanation_q_sum']
    timestamp_u = features_dicts['timestamp_u']
    timestamp_u_incorrect = features_dicts['timestamp_u_incorrect']
    answered_correctly_up_count = features_dicts['answered_correctly_up_count']
    answered_correctly_up_sum = features_dicts['answered_correctly_up_sum']
    answered_correctly_tag1_count = features_dicts[
        'answered_correctly_tag1_count']
    answered_correctly_tag1_sum = features_dicts['answered_correctly_tag1_sum']

    lect_dict = features_dicts['lect_dict']
    tag_list_dict = features_dicts['tag_list_dict']

    # Get api iterator and predictor
    env = riiideducation.make_env()
    iter_test = env.iter_test()
    set_predict = env.predict
    questions_df = pd.read_pickle('questions_df.pkl')

    # Loop-invariant: question metadata columns merged into every batch
    # (hoisted out of the loop; the original rebuilt it per iteration).
    question_cols = [
        'question_id', 'part', 'tag_1', 'answered_correctly_q_mean',
        'answered_correctly_q_std', 'answered_correctly_p_mean',
        'answered_correctly_p_std', 'answered_correctly_b_mean',
        'answered_correctly_b_std', 'answered_correctly_tag_1_mean',
        'answered_correctly_tag_1_std'
    ]

    previous_test_df = None
    for (test_df, sample_prediction_df) in iter_test:
        if previous_test_df is not None:
            # The ground truth for the previous batch arrives with the
            # current one as a stringified list; literal_eval (not eval)
            # parses it without executing arbitrary code.
            previous_test_df[TARGET] = ast.literal_eval(
                test_df["prior_group_answers_correct"].iloc[0])
            update_features(previous_test_df, answered_correctly_u_sum,
                            answered_correctly_u_count,
                            answered_correctly_q_count, timestamp_u_incorrect,
                            answered_correctly_up_count,
                            answered_correctly_up_sum,
                            answered_correctly_tag1_count,
                            answered_correctly_tag1_sum)
        test_df[
            'prior_question_had_explanation'] = test_df.prior_question_had_explanation.fillna(
                False).astype('int8')
        test_df['prior_question_elapsed_time'].fillna(
            prior_question_elapsed_time_mean, inplace=True)
        test_df = pd.merge(test_df,
                           questions_df[question_cols],
                           left_on='content_id',
                           right_on='question_id',
                           how='left')
        # Snapshot *before* filtering lectures out: the next iteration
        # updates the aggregates from this full frame.
        previous_test_df = test_df.copy()
        test_df = add_lectures_feats(test_df, lect_dict, tag_list_dict,
                                     lectures_df, q_taglist_df)
        # content_type_id == 0 -> question rows only (lectures have no target)
        test_df = test_df[test_df['content_type_id'] == 0].reset_index(
            drop=True)
        test_df[TARGET] = 0
        test_df = add_features(test_df,
                               answered_correctly_u_count,
                               answered_correctly_u_sum,
                               elapsed_time_u_sum,
                               explanation_u_sum,
                               timestamp_u,
                               timestamp_u_incorrect,
                               answered_correctly_q_count,
                               elapsed_time_q_sum,
                               explanation_q_sum,
                               answered_correctly_up_count,
                               answered_correctly_up_sum,
                               answered_correctly_tag1_count,
                               answered_correctly_tag1_sum,
                               update=False)
        test_df[TARGET] = model.predict(test_df[FEATURES])
        set_predict(test_df[['row_id', TARGET]])

    print('Job Done')
Esempio n. 10
0
def run(debug, model_dir, kaggle=False):
    """Fit the feature factories on the training splits, then serve test
    predictions with an ensemble of pickled LightGBM-style models.

    Builds count/target encoders and mean aggregators, warms them on every
    training split, then streams the competition API's test batches:
    answers revealed for the previous batch are fitted back into the
    factories before the current batch is transformed and scored, and the
    mean prediction over all loaded models is submitted via ``env.predict``.

    Args:
        debug: When truthy, only the first 1000 rows of each training
            split are fitted.
        model_dir: Directory whose ``*model*.pickle`` files form the
            ensemble.
        kaggle: Selects the Kaggle input path over the local path.
    """

    def _parse_prior_list(s):
        # The API delivers prior answers as a stringified list such as
        # "[0, '1']"; strip the wrapping characters and convert to ints.
        return [
            int(x) for x in s.replace("[", "").replace("'", "").replace(
                "]", "").replace(" ", "").split(",")
        ]

    if kaggle:
        files_dir = "/kaggle/input/riiid-split10/*.pickle"
    else:
        files_dir = "../input/riiid-test-answer-prediction/split10/*.pickle"

    logger = get_logger()  # single initialisation (original called it twice)
    # environment
    env = riiideducation.make_env()

    # model loading
    models = []
    for model_path in glob.glob(f"{model_dir}/*model*.pickle"):
        with open(model_path, "rb") as f:
            models.append(pickle.load(f))

    # data preprocessing: count/target encoders per column, plus per-user
    # and per-content mean aggregators
    feature_factory_dict = {}
    for column in [
            "user_id", "content_id", "content_type_id",
            "prior_question_had_explanation"
    ]:
        feature_factory_dict[column] = {
            "CountEncoder": CountEncoder(column=column),
            "TargetEncoder": TargetEncoder(column=column)
        }
    feature_factory_dict["user_id"][
        "MeanAggregatorTimestamp"] = MeanAggregator(column="user_id",
                                                    agg_column="timestamp",
                                                    remove_now=False)
    feature_factory_dict["user_id"][
        "MeanAggregatorPriorQuestionElapsedTime"] = MeanAggregator(
            column="user_id",
            agg_column="prior_question_elapsed_time",
            remove_now=True)
    feature_factory_dict["content_id"][
        "MeanAggregatorPriorQuestionElapsedTime"] = MeanAggregator(
            column="content_id",
            agg_column="prior_question_elapsed_time",
            remove_now=True)
    for column in [("user_id", "content_type_id"),
                   ("user_id", "prior_question_had_explanation")]:
        feature_factory_dict[column] = {
            "CountEncoder": CountEncoder(column=list(column)),
            "TargetEncoder": TargetEncoder(column=list(column))
        }
    feature_factory_manager = FeatureFactoryManager(
        feature_factory_dict=feature_factory_dict, logger=logger)

    # warm the factories on every training split
    for fname in glob.glob(files_dir):
        logger.info(f"loading... {fname}")
        df = pd.read_pickle(fname)
        df["prior_question_had_explanation"] = df[
            "prior_question_had_explanation"].astype("float16").fillna(
                -1).astype("int8")

        if debug:
            df = df.head(1000)
        feature_factory_manager.fit(df)

    iter_test = env.iter_test()
    df_test_prev = pd.DataFrame()
    i = 0
    t = time.time()
    for (df_test, df_sample_prediction) in iter_test:
        i += 1
        logger.info(
            f"[time: {int(time.time() - t)}iteration {i}: data_length: {len(df_test)}"
        )
        # fold the answers revealed for the previous batch into the factories
        if len(df_test_prev) > 0:
            answered_correctly = df_test.iloc[0]["prior_group_answers_correct"]
            user_answer = df_test.iloc[0]["prior_group_responses"]

            df_test_prev["answered_correctly"] = _parse_prior_list(
                answered_correctly)
            df_test_prev["user_answer"] = _parse_prior_list(user_answer)

            feature_factory_manager.fit(df_test_prev)

        # transform the current batch
        logger.info("transform... ")
        df_test["prior_question_had_explanation"] = df_test[
            "prior_question_had_explanation"].astype("float16").fillna(
                -1).astype("int8")

        df = feature_factory_manager.partial_predict(df_test)
        logger.info("other... ")
        cols = models[0].feature_name()  # computed once (was duplicated)
        # features the factories did not emit get a sentinel value
        for col in cols:
            if col not in df.columns:
                df[col] = -99999

        # predict: mean over the ensemble
        predicts = []
        for model in models:
            predicts.append(model.predict(df[cols]))

        df["answered_correctly"] = np.array(predicts).transpose().mean(axis=1)
        df_sample_prediction = pd.merge(df_sample_prediction[["row_id"]],
                                        df[["row_id", "answered_correctly"]],
                                        how="inner")
        env.predict(df_sample_prediction)
        df_test_prev = df[cols + ["user_id"]]

        # debug dump of the first few batches only (the original wrote a CSV
        # on every iteration, unlike the sibling `run` variants)
        if i < 5:
            df_test_prev.to_csv(f"{i}.csv")
Esempio n. 11
0
def run(debug,
        model_dir,
        update_record,
        kaggle=False):
    """Riiid test-API inference loop for the SAKT transformer.

    Loads question/lecture metadata, the trained transformer (weights and
    hyper-parameters from ``model_dir``) and two pickled feature-factory
    managers, then streams the test batches: revealed answers are buffered
    and periodically fitted back into the factories, each batch is
    transformed and scored with the transformer, and predictions are
    submitted via ``env.predict``.

    Args:
        debug: When truthy, forces ``update_record = 1`` so the factories
            refit after (almost) every batch.
        model_dir: Directory with ``transformers.pth``,
            ``transformer_param.json`` and the pickled managers.
        update_record: Buffered-row threshold that triggers refitting.
        kaggle: Accepted for interface parity with the other ``run``
            variants; not read in this function.
    """

    # environment
    env = riiideducation.make_env()

    df_question = pd.read_csv("../input/riiid-test-answer-prediction/questions.csv",
                              dtype={"bundle_id": "int32",
                                     "question_id": "int32",
                                     "correct_answer": "int8",
                                     "part": "int8"})
    df_lecture = pd.read_csv("../input/riiid-test-answer-prediction/lectures.csv",
                             dtype={"lecture_id": "int32",
                                    "tag": "int16",
                                    "part": "int8"})
    # params
    with open(f"{model_dir}/transformer_param.json", "r") as f:
        params = json.load(f)
    # model loading
    model_path = f"{model_dir}/transformers.pth"
    # NOTE(review): 13938 here vs 13939 for SAKTDataset below — confirm the
    # off-by-one is intentional (e.g. item-vocabulary size vs size + padding).
    model = SAKTModel(13938, embed_dim=params["embed_dim"], max_seq=params["max_seq"], cont_emb=8)
    model.load_state_dict(torch.load(model_path))
    model.to(device)
    model.eval()

    # load feature_factory_manager
    logger = get_logger()
    ff_manager_path_for_transformer = f"{model_dir}/feature_factory_manager_for_transformer.pickle"
    with open(ff_manager_path_for_transformer, "rb") as f:
        feature_factory_manager_for_transformer = pickle.load(f)
    feature_factory_manager_for_transformer.logger = logger

    ff_manager_path = f"{model_dir}/feature_factory_manager.pickle"
    with open(ff_manager_path, "rb") as f:
        feature_factory_manager = pickle.load(f)
    feature_factory_manager.logger = logger


    iter_test = env.iter_test()
    # Buffers accumulating past batches and their later-revealed labels
    # until `update_record` rows are collected; then the factories refit
    # and the buffers reset.
    df_test_prev = []
    df_test_prev_rows = 0
    answered_correctlies = []
    user_answers = []
    i = 0
    for (df_test, df_sample_prediction) in iter_test:
        i += 1
        logger.info(f"[iteration {i}: data_length: {len(df_test)}")
        # Update state with the previous batch's revealed answers
        if df_test_prev_rows > 0: # guard so the very first iteration is skipped
            answered_correctly = df_test.iloc[0]["prior_group_answers_correct"]
            user_answer = df_test.iloc[0]["prior_group_responses"]
            answered_correctlies.extend([int(x) for x in answered_correctly.replace("[", "").replace("'", "").replace("]", "").replace(" ", "").split(",")])
            user_answers.extend([int(x) for x in user_answer.replace("[", "").replace("'", "").replace("]", "").replace(" ", "").split(",")])

        if debug:
            update_record = 1
        if df_test_prev_rows > update_record:
            logger.info("------ fitting ------")
            logger.info("concat df")
            df_test_prev = pd.concat(df_test_prev)
            df_test_prev["answered_correctly"] = answered_correctlies
            df_test_prev["user_answer"] = user_answers
            # df_test_prev = df_test_prev.drop(prior_columns, axis=1)
            # df_test_prev = df_test_prev[df_test_prev["answered_correctly"] != -1]
            # -1 marks lecture rows: treated as missing for the tabular
            # factories, but restored to -1 for the transformer factories below.
            df_test_prev["answered_correctly"] = df_test_prev["answered_correctly"].replace(-1, np.nan)
            df_test_prev["prior_question_had_explanation"] = df_test_prev["prior_question_had_explanation"].fillna(-1).astype("int8")

            logger.info("fit data")
            feature_factory_manager.fit(df_test_prev)
            df_test_prev["answered_correctly"] = df_test_prev["answered_correctly"].replace(np.nan, -1)
            feature_factory_manager_for_transformer.fit(df_test_prev)

            df_test_prev = []
            df_test_prev_rows = 0
            answered_correctlies = []
            user_answers = []
        # Fetch & compute features for the current batch

        # logger.info(f"[time: {int(time.time() - t)}dataload")
        logger.info(f"------ question&lecture merge ------")
        w_df1 = pd.merge(df_test[df_test["content_type_id"] == 0], df_question, how="left", left_on="content_id",
                         right_on="question_id")
        w_df2 = pd.merge(df_test[df_test["content_type_id"] == 1], df_lecture, how="left", left_on="content_id",
                         right_on="lecture_id")
        # sort_index() restores the API's original row order after the merge
        df_test = pd.concat([w_df1, w_df2]).sort_values(["user_id", "timestamp"]).sort_index()
        df_test["tag"] = df_test["tag"].fillna(-1)
        df_test["correct_answer"] = df_test["correct_answer"].fillna(-1)
        df_test["bundle_id"] = df_test["bundle_id"].fillna(-1)
        # cap task_container_id at 300, presumably to bound an embedding
        # vocabulary — confirm against SAKTDataset
        df_test["task_container_id_bin300"] = [x if x < 300 else 300 for x in df_test["task_container_id"].values]

        logger.info(f"------ transform ------ ")
        df_test["prior_question_had_explanation"] = df_test["prior_question_had_explanation"].astype("float16").fillna(-1).astype("int8")

        df_test = feature_factory_manager.partial_predict(df_test)
        # per-user sequence groups for the transformer (question rows only)
        group = feature_factory_manager_for_transformer.partial_predict(df_test[df_test["content_type_id"] == 0])
        logger.info(f"------ predict ------")

        dataset_val = SAKTDataset(group, 13939, predict_mode=True, max_seq=params["max_seq"])
        dataloader_val = DataLoader(dataset_val, batch_size=1024, shuffle=False, num_workers=1)

        predicts = []

        with torch.no_grad():
            for item in dataloader_val:
                x = item["x"].to(device).long()
                target_id = item["target_id"].to(device).long()
                part = item["part"].to(device).long()
                label = item["label"].to(device).float()
                elapsed_time = item["elapsed_time"].to(device).long()
                duration_previous_content = item["duration_previous_content"].to(device).long()
                prior_question_had_explanation = item["prior_q"].to(device).long()
                user_answer = item["user_answer"].to(device).long()
                rate_diff = item["rate_diff"].to(device).float()
                container_id = item["container_id"].to(device).long()
                prev_ans_idx = item["previous_answer_index_content_id"].to(device).long()
                prev_answer_content_id = item["previous_answer_content_id"].to(device).long()

                output = model(x, target_id, part, elapsed_time,
                               duration_previous_content, prior_question_had_explanation, user_answer,
                               rate_diff, container_id, prev_ans_idx, prev_answer_content_id)

                # sigmoid of the last sequence position = P(correct) per row
                predicts.extend(torch.nn.Sigmoid()(output[:, -1]).view(-1).data.cpu().numpy().tolist())

        logger.info("------ other ------")
        df_sample_prediction = df_test[df_test["content_type_id"] == 0][["row_id"]]
        df_sample_prediction["answered_correctly"] = predicts
        env.predict(df_sample_prediction)
        df_test_prev.append(df_test)
        df_test_prev_rows += len(df_test)
        if i < 5:
            df_test.to_csv(f"{i}.csv")
        if i == 3:
            # After the first few (logged) iterations, swap in a no-op logger
            # to avoid per-batch logging overhead for the rest of the run.
            class EmptyLogger:
                def __init__(self):
                    pass

                def info(self, s):
                    pass

            logger = EmptyLogger()
def run(debug, model_dir, kaggle=False):
    """Build and fit the feature-factory pipeline on the training splits.

    Configures count/target encoders (single columns and column pairs),
    shift-diff and mean aggregators, user-level and user-count-bin
    encoders; fits them over every pickled training split merged with the
    question/lecture metadata; then serialises the fitted
    ``FeatureFactoryManager`` to ``feature_factory_manager.pickle``.

    Args:
        debug: When truthy, only the first 1000 rows of each split are
            fitted.
        model_dir: Directory whose ``*model*.pickle`` files are loaded
            into ``models``.
        kaggle: Selects the Kaggle input path over the local path.
    """

    if kaggle:
        files_dir = "/kaggle/input/riiid-split10/*.pickle"
    else:
        files_dir = "../input/riiid-test-answer-prediction/split10_base/*.pickle"

    logger = get_logger()
    # environment
    # NOTE(review): `env` is never used below — presumably kept for
    # make_env()'s registration side effect; confirm before removing.
    env = riiideducation.make_env()

    df_question = pd.read_csv(
        "../input/riiid-test-answer-prediction/questions.csv",
        dtype={
            "bundle_id": "int32",
            "question_id": "int32",
            "correct_answer": "int8",
            "part": "int8"
        })
    df_lecture = pd.read_csv(
        "../input/riiid-test-answer-prediction/lectures.csv",
        dtype={
            "lecture_id": "int32",
            "tag": "int16",
            "part": "int8"
        })
    # model loading
    # NOTE(review): `models` is never used in this function (it only fits
    # and pickles the factories) — confirm the loading is still wanted.
    models = []
    for model_path in glob.glob(f"{model_dir}/*model*.pickle"):
        with open(model_path, "rb") as f:
            models.append(pickle.load(f))

    # data preprocessing
    # NOTE(review): duplicate of the get_logger() call above.
    logger = get_logger()
    feature_factory_dict = {}
    feature_factory_dict["tags"] = {"TagsSeparator": TagsSeparator()}
    # Count/target encoders for single columns and column pairs; only the
    # high-cardinality id columns use partial (incremental) fitting.
    for column in [
            "content_id", "user_id", "part", "prior_question_had_explanation",
            "tags1", "tags2", ("user_id", "prior_question_had_explanation"),
        ("user_id", "part")
    ]:
        is_partial_fit = (column == "content_id" or column == "user_id")

        if type(column) == str:
            feature_factory_dict[column] = {
                "CountEncoder":
                CountEncoder(column=column, is_partial_fit=is_partial_fit),
                "TargetEncoder":
                TargetEncoder(column=column, is_partial_fit=is_partial_fit)
            }
        else:
            feature_factory_dict[column] = {
                "CountEncoder":
                CountEncoder(column=list(column),
                             is_partial_fit=is_partial_fit),
                "TargetEncoder":
                TargetEncoder(column=list(column),
                              is_partial_fit=is_partial_fit)
            }
    feature_factory_dict["user_id"][
        "ShiftDiffEncoderTimestamp"] = ShiftDiffEncoder(groupby="user_id",
                                                        column="timestamp",
                                                        is_partial_fit=True)
    feature_factory_dict["user_id"][
        "ShiftDiffEncoderContentId"] = ShiftDiffEncoder(groupby="user_id",
                                                        column="content_id")
    for column in ["user_id", "content_id"]:
        feature_factory_dict[column][
            f"MeanAggregatorPriorQuestionElapsedTimeby{column}"] = MeanAggregator(
                column=column,
                agg_column="prior_question_elapsed_time",
                remove_now=True)

    feature_factory_dict["user_id"][
        "UserLevelEncoder2ContentId"] = UserLevelEncoder2(
            vs_column="content_id")
    feature_factory_dict["user_id"][
        "UserCountBinningEncoder"] = UserCountBinningEncoder(
            is_partial_fit=True)
    # user_count_bin is derived by the binning encoder above, then itself
    # encoded alone and in combination with user_id / content_id.
    feature_factory_dict["user_count_bin"] = {}
    feature_factory_dict["user_count_bin"]["CountEncoder"] = CountEncoder(
        column="user_count_bin")
    feature_factory_dict["user_count_bin"]["TargetEncoder"] = TargetEncoder(
        column="user_count_bin")
    feature_factory_dict[("user_id", "user_count_bin")] = {
        "CountEncoder": CountEncoder(column=["user_id", "user_count_bin"]),
        "TargetEncoder": TargetEncoder(column=["user_id", "user_count_bin"])
    }
    feature_factory_dict[("content_id", "user_count_bin")] = {
        "CountEncoder": CountEncoder(column=["content_id", "user_count_bin"]),
        "TargetEncoder": TargetEncoder(column=["content_id", "user_count_bin"])
    }

    feature_factory_dict["user_id"][
        "CategoryLevelEncoderPart"] = CategoryLevelEncoder(
            groupby_column="user_id",
            agg_column="part",
            categories=[1, 2, 3, 4, 5, 6, 7])
    feature_factory_dict["user_count_bin"]["CategoryLevelEncoderUserCountBin"] = \
        CategoryLevelEncoder(groupby_column="user_id",
                             agg_column="user_count_bin",
                             categories=[0, 1, 2, 3, 4, 5])
    feature_factory_manager = FeatureFactoryManager(
        feature_factory_dict=feature_factory_dict, logger=logger)
    # Fit the factories on every training split, questions merged with
    # question metadata and lectures with lecture metadata.
    for model_id, fname in enumerate(glob.glob(files_dir)):
        logger.info(f"loading... {fname}")
        df = pd.read_pickle(fname)
        # drop lecture rows (answered_correctly == -1) for training fits
        df = df[df["answered_correctly"] != -1]
        df["prior_question_had_explanation"] = df[
            "prior_question_had_explanation"].fillna(-1).astype("int8")
        if debug:
            df = df.head(1000)
        df = pd.concat([
            pd.merge(df[df["content_type_id"] == 0],
                     df_question,
                     how="left",
                     left_on="content_id",
                     right_on="question_id"),
            pd.merge(df[df["content_type_id"] == 1],
                     df_lecture,
                     how="left",
                     left_on="content_id",
                     right_on="lecture_id")
        ]).sort_values(["user_id", "timestamp"])
        # df = feature_factory_manager.feature_factory_dict["content_id"]["TargetEncoder"].all_predict(df)
        feature_factory_manager.fit(df, is_first_fit=True)

    # Detach loggers before pickling — presumably they are not picklable
    # or not wanted inside the artifact; confirm against get_logger().
    for dicts in feature_factory_manager.feature_factory_dict.values():
        for factory in dicts.values():
            factory.logger = None
    feature_factory_manager.logger = None
    with open(f"feature_factory_manager.pickle", "wb") as f:
        pickle.dump(feature_factory_manager, f)
    return
Esempio n. 13
0
def run(debug, model_dir, update_record, kaggle=False):
    """Riiid test-API inference with a LightGBM + CatBoost ensemble blend.

    Loads question/lecture metadata, the pickled LightGBM and CatBoost
    models and the fitted feature-factory manager; streams the test
    batches, periodically refitting the factories with the revealed
    answers, and submits a 50/50 blend of the two model families via
    ``env.predict``.

    Args:
        debug: Accepted for interface parity; not read in this function.
        model_dir: Directory with ``*lgbm*.pickle`` and ``*catboost``
            model files and ``feature_factory_manager.pickle``.
        update_record: Buffered-row threshold that triggers refitting.
        kaggle: Selects the Kaggle input path over the local path.
    """

    if kaggle:
        files_dir = "/kaggle/input/riiid-split10/*.pickle"
    else:
        files_dir = "../input/riiid-test-answer-prediction/split10_base/*.pickle"
    # NOTE(review): `files_dir` is never used below — confirm it can go.

    logger = get_logger()
    # environment
    env = riiideducation.make_env()

    df_question = pd.read_csv(
        "../input/riiid-test-answer-prediction/questions.csv",
        dtype={
            "bundle_id": "int32",
            "question_id": "int32",
            "correct_answer": "int8",
            "part": "int8"
        })
    df_lecture = pd.read_csv(
        "../input/riiid-test-answer-prediction/lectures.csv",
        dtype={
            "lecture_id": "int32",
            "tag": "int16",
            "part": "int8"
        })
    # model loading
    models_lgbm = []
    for model_path in glob.glob(f"{model_dir}/*lgbm*.pickle"):
        with open(model_path, "rb") as f:
            models_lgbm.append(pickle.load(f))
    models_cat = []
    # NOTE(review): these training hyper-parameters are defined but never
    # used — the CatBoost models are loaded from file below.
    params = {
        'n_estimators': 12000,
        'learning_rate': 0.3,
        'eval_metric': 'AUC',
        'loss_function': 'Logloss',
        'random_seed': 0,
        'metric_period': 50,
        'od_wait': 400,
        'task_type': 'GPU',
        'max_depth': 8,
        "verbose": 100
    }
    for model_path in glob.glob(f"{model_dir}/*catboost"):
        models_cat.append(CatBoostClassifier().load_model(model_path,
                                                          format="cbm"))

    # sanity check that a trained CatBoost model was actually loaded
    print(models_cat[0].get_best_iteration())
    # load feature_factory_manager
    logger = get_logger()
    ff_manager_path = f"{model_dir}/feature_factory_manager.pickle"
    with open(ff_manager_path, "rb") as f:
        feature_factory_manager = pickle.load(f)
    # re-attach loggers that were detached before pickling
    for dicts in feature_factory_manager.feature_factory_dict.values():
        for factory in dicts.values():
            factory.logger = logger
    feature_factory_manager.logger = logger

    iter_test = env.iter_test()
    # Buffers accumulating past batches and their later-revealed labels
    # until `update_record` rows are collected.
    df_test_prev = []
    df_test_prev_rows = 0
    answered_correctlies = []
    user_answers = []
    i = 0
    for (df_test, df_sample_prediction) in iter_test:
        i += 1
        # logger.info(f"[iteration {i}: data_length: {len(df_test)}")
        # Update state with the previous batch's revealed answers
        if df_test_prev_rows > 0:  # guard so the very first iteration is skipped
            answered_correctly = df_test.iloc[0]["prior_group_answers_correct"]
            user_answer = df_test.iloc[0]["prior_group_responses"]
            answered_correctlies.extend([
                int(x) for x in answered_correctly.replace("[", "").replace(
                    "'", "").replace("]", "").replace(" ", "").split(",")
            ])
            user_answers.extend([
                int(x)
                for x in user_answer.replace("[", "").replace("'", "").replace(
                    "]", "").replace(" ", "").split(",")
            ])

        if df_test_prev_rows > update_record:
            # logger.info("------ fitting ------")
            # logger.info("concat df")
            df_test_prev = pd.concat(df_test_prev)
            df_test_prev["answered_correctly"] = answered_correctlies
            df_test_prev["user_answer"] = user_answers
            # df_test_prev = df_test_prev.drop(prior_columns, axis=1)
            # drop lecture rows (answered_correctly == -1) before refitting
            df_test_prev = df_test_prev[
                df_test_prev["answered_correctly"] != -1]
            # NOTE(review): all -1 rows were filtered out just above, so
            # this replace appears to be a no-op — confirm.
            df_test_prev["answered_correctly"] = df_test_prev[
                "answered_correctly"].replace(-1, np.nan)
            df_test_prev["prior_question_had_explanation"] = df_test_prev[
                "prior_question_had_explanation"].fillna(-1).astype("int8")

            # logger.info("fit data")
            feature_factory_manager.fit(df_test_prev)

            df_test_prev = []
            df_test_prev_rows = 0
            answered_correctlies = []
            user_answers = []
        # Fetch & compute features for the current batch

        # logger.info(f"[time: {int(time.time() - t)}dataload")
        # logger.info(f"merge... ")
        w_df1 = pd.merge(df_test[df_test["content_type_id"] == 0],
                         df_question,
                         how="left",
                         left_on="content_id",
                         right_on="question_id")
        w_df2 = pd.merge(df_test[df_test["content_type_id"] == 1],
                         df_lecture,
                         how="left",
                         left_on="content_id",
                         right_on="lecture_id")
        # sort_index() restores the API's original row order after the merge
        df_test = pd.concat([w_df1,
                             w_df2]).sort_values(["user_id",
                                                  "timestamp"]).sort_index()
        df_test["tag"] = df_test["tag"].fillna(-1)
        df_test["correct_answer"] = df_test["correct_answer"].fillna(-1)
        df_test["bundle_id"] = df_test["bundle_id"].fillna(-1)

        # logger.info(f"transform... ")
        df_test["prior_question_had_explanation"] = df_test[
            "prior_question_had_explanation"].astype("float16").fillna(
                -1).astype("int8")

        df = feature_factory_manager.partial_predict(df_test)
        # sanitise column names to match the names the models were trained
        # with (brackets/quotes/spaces/commas replaced by underscores)
        df.columns = [
            x.replace("[", "_").replace("]", "_").replace("'", "_").replace(
                " ", "_").replace(",", "_") for x in df.columns
        ]

        # predict
        # logger.info(f"predict lgbm...")
        predicts_lgbm = []
        # feature list taken from the CatBoost model and reused for LightGBM
        # — assumes both families were trained on identical feature sets.
        cols = models_cat[0].feature_names_
        w_df = df[cols]
        for model in models_lgbm:
            predicts_lgbm.append(model.predict(w_df))
        pred_lgbm = np.array(predicts_lgbm).mean(axis=0)

        # logger.info(f"predict cat...")
        predicts_cat = []
        for model in models_cat:
            predicts_cat.append(
                model.predict_proba(w_df.values)[:, 1].flatten())
        pred_cat = np.array(predicts_cat).mean(axis=0)

        # logger.info("other...")
        # 50/50 blend of the two model families
        df["answered_correctly"] = pred_lgbm * 0.5 + pred_cat * 0.5
        # submit question rows only
        df_sample_prediction = df[df["content_type_id"] == 0][[
            "row_id", "answered_correctly"
        ]]
        env.predict(df_sample_prediction)
        # buffer only the columns needed for the next refit
        df_test_prev.append(df[cols + ["user_id", "tags"]])
        df_test_prev_rows += len(df)
        if i < 5:
            df.to_csv(f"{i}.csv")
Esempio n. 14
0
def inference(TARGET, FEATURES, sakt_model, lgb_model,
              prior_question_elapsed_time_mean, features_dicts):
    """Blend SAKT transformer and LightGBM predictions over the Riiid test API.

    For each batch: fold the previous batch's revealed answers into the
    running aggregates and into each user's SAKT interaction history, build
    features for the current batch, score it with both models and submit a
    ``0.6 * LightGBM + 0.4 * SAKT`` blend via ``env.predict``.

    Relies on module-level state: ``question_file``, ``question_cols``,
    ``prior_question_elapsed_time_mean_sakt``, ``group``, ``skills``,
    ``MAX_SEQ``, ``device``, ``TestDataset`` and ``DataLoader``.

    Args:
        TARGET: Target column name.
        FEATURES: Feature column names for ``lgb_model.predict``.
        sakt_model: Trained SAKT transformer; callable returning
            ``(output, attention_weights)``.
        lgb_model: Trained LightGBM model.
        prior_question_elapsed_time_mean: Fill value for missing
            ``prior_question_elapsed_time``.
        features_dicts: Dict of running aggregate dictionaries, mutated in
            place across batches.
    """
    import ast  # local import: safe parsing of the API's stringified answer lists

    # Unpack the running aggregates once; they are updated in place.
    answered_correctly_u_count = features_dicts['answered_correctly_u_count']
    answered_correctly_u_sum = features_dicts['answered_correctly_u_sum']
    elapsed_time_u_sum = features_dicts['elapsed_time_u_sum']
    explanation_u_sum = features_dicts['explanation_u_sum']
    answered_correctly_q_count = features_dicts['answered_correctly_q_count']
    answered_correctly_q_sum = features_dicts['answered_correctly_q_sum']
    answered_correctly_uq = features_dicts["answered_correctly_uq"]
    elapsed_time_q_sum = features_dicts['elapsed_time_q_sum']
    explanation_q_sum = features_dicts['explanation_q_sum']
    timestamp_u = features_dicts['timestamp_u']
    timestamp_u_incorrect = features_dicts['timestamp_u_incorrect']
    answered_correctly_up_count = features_dicts['answered_correctly_up_count']
    answered_correctly_up_sum = features_dicts['answered_correctly_up_sum']

    # Get api iterator and predictor
    env = riiideducation.make_env()
    iter_test = env.iter_test()
    set_predict = env.predict
    questions_df = pd.read_pickle(question_file)
    questions_df.part = questions_df.part.astype(np.int8)
    previous_test_df = None
    for (test_df, sample_prediction_df) in iter_test:
        if previous_test_df is not None:
            # Ground truth for the previous batch arrives with the current
            # one as a stringified list; literal_eval (not eval) parses it
            # without executing arbitrary code.
            previous_test_df[TARGET] = ast.literal_eval(
                test_df["prior_group_answers_correct"].iloc[0])
            update_features(previous_test_df, answered_correctly_u_sum,
                            answered_correctly_u_count,
                            answered_correctly_q_sum,
                            answered_correctly_q_count, timestamp_u_incorrect,
                            answered_correctly_uq, answered_correctly_up_count,
                            answered_correctly_up_sum)
            # Drop lecture rows before extending the SAKT histories.
            previous_test_df = previous_test_df[
                previous_test_df.content_type_id == False]

            # Per-user tuples (content_ids, answers, parts, elapsed) for the
            # previous batch.
            prev_group = previous_test_df[[
                'user_id', 'content_id', 'answered_correctly', 'part',
                'prior_question_elapsed_time_sakt'
            ]].groupby('user_id').apply(lambda r: (r['content_id'].values, r[
                'answered_correctly'].values, r['part'].values, r[
                    'prior_question_elapsed_time_sakt'].values))
            # Append to each user's history, keeping only the last MAX_SEQ
            # interactions.
            for prev_user_id in prev_group.index:
                if prev_user_id in group.index:
                    group[prev_user_id] = (
                        np.append(group[prev_user_id][0],
                                  prev_group[prev_user_id][0])[-MAX_SEQ:],
                        np.append(group[prev_user_id][1],
                                  prev_group[prev_user_id][1])[-MAX_SEQ:],
                        np.append(group[prev_user_id][2],
                                  prev_group[prev_user_id][2])[-MAX_SEQ:],
                        np.append(group[prev_user_id][3],
                                  prev_group[prev_user_id][3])[-MAX_SEQ:])

                else:
                    group[prev_user_id] = (prev_group[prev_user_id][0],
                                           prev_group[prev_user_id][1],
                                           prev_group[prev_user_id][2],
                                           prev_group[prev_user_id][3])

        test_df[
            'prior_question_had_explanation'] = test_df.prior_question_had_explanation.fillna(
                False).astype('int8')
        # SAKT uses elapsed time bucketed into hours and clipped to [0, 16].
        test_df['prior_question_elapsed_time_sakt'] = test_df[
            'prior_question_elapsed_time'] / 3600
        test_df.prior_question_elapsed_time_sakt.fillna(
            prior_question_elapsed_time_mean_sakt, inplace=True)
        test_df.prior_question_elapsed_time_sakt.clip(lower=0,
                                                      upper=16,
                                                      inplace=True)
        test_df['prior_question_elapsed_time_sakt'] = test_df[
            'prior_question_elapsed_time_sakt'].astype(np.int16)
        test_df['prior_question_elapsed_time'].fillna(
            prior_question_elapsed_time_mean, inplace=True)

        test_df = pd.merge(test_df,
                           questions_df[question_cols],
                           left_on='content_id',
                           right_on='question_id',
                           how='left')
        # Snapshot before dropping lectures: the next iteration's update
        # step needs the full frame.
        previous_test_df = test_df.copy()
        test_df = test_df[test_df['content_type_id'] == 0].reset_index(
            drop=True)
        test_dataset = TestDataset(group, test_df, skills)
        test_dataloader = DataLoader(test_dataset,
                                     batch_size=51200,
                                     shuffle=False)

        outs = []

        for item in test_dataloader:
            x = item[0].to(device).long()
            p = item[1].to(device).long()
            e_time = item[2].to(device).long()
            target_id = item[3].to(device).long()

            with torch.no_grad():
                # Bug fix: the original called the undefined name `model`;
                # the transformer is passed in as `sakt_model`.
                output, att_weight = sakt_model(x, p, e_time, target_id)
            # sigmoid of the last sequence position = P(correct) per row
            outs.extend(
                torch.sigmoid(output)[:, -1].view(-1).data.cpu().numpy())
        test_df[TARGET] = 0
        test_df = add_features(test_df,
                               answered_correctly_u_count,
                               answered_correctly_u_sum,
                               elapsed_time_u_sum,
                               explanation_u_sum,
                               timestamp_u,
                               timestamp_u_incorrect,
                               answered_correctly_q_count,
                               answered_correctly_q_sum,
                               elapsed_time_q_sum,
                               explanation_q_sum,
                               answered_correctly_uq,
                               answered_correctly_up_count,
                               answered_correctly_up_sum,
                               update=False)
        # Blend: 60% LightGBM, 40% SAKT.
        test_df[TARGET] = lgb_model.predict(
            test_df[FEATURES]) * 0.6 + np.array(outs) * 0.4
        set_predict(test_df[['row_id', TARGET]])

    print('Job Done')
Esempio n. 15
0
def run(debug, model_dir, kaggle=False):
    """Run the riiid test-time prediction loop.

    Loads pickled LightGBM models from *model_dir*, warms up the feature
    ``Pipeline`` on the pre-split training pickles, then iterates the
    competition API: each batch's newly revealed answers update the
    pipeline state before the mean-ensembled predictions for the current
    batch are submitted via ``env.predict``.

    Args:
        debug: if truthy, fit the pipeline on only the first 1000 rows of
            each training pickle (fast smoke-test mode).
        model_dir: directory containing ``*model*.pickle`` LightGBM models.
        kaggle: if True, read inputs from the Kaggle-kernel path; otherwise
            from the local relative path.
    """

    if kaggle:
        files_dir = "/kaggle/input/riiid-split10/*.pickle"
    else:
        files_dir = "../input/riiid-test-answer-prediction/split10/*.pickle"

    logger = get_logger()
    # environment
    env = riiideducation.make_env()

    # model loading: one booster per matching pickle; their predictions
    # are averaged below.
    models = []
    for model_path in glob.glob(f"{model_dir}/*model*.pickle"):
        with open(model_path, "rb") as f:
            models.append(pickle.load(f))

    # data preprocessing: fit the feature pipeline on every training split
    # so its running statistics exist before inference starts.
    pipeline = Pipeline(logger=logger)
    for model_id, fname in enumerate(glob.glob(files_dir)):
        logger.info(f"loading... {fname}")
        df = pd.read_pickle(fname)
        if debug:
            df = df.head(1000)
        pipeline.fit(df)

    iter_test = env.iter_test()
    df_test_prev = pd.DataFrame()
    i = 0
    t = time.time()
    for (df_test, df_sample_prediction) in iter_test:
        i += 1
        # NOTE(review): this f-string is malformed — "[time: N" is never
        # closed with "]" and there is no separator before "iteration".
        logger.info(
            f"[time: {int(time.time() - t)}iteration {i}: data_length: {len(df_test)}"
        )
        # Update the previous batch's state now that its answers are known.
        if len(df_test_prev) > 0:
            # prior_group_answers_correct / prior_group_responses arrive as
            # stringified lists (e.g. "[0, 1, 1]"); parse by stripping
            # brackets/quotes/spaces and splitting on commas.
            # NOTE(review): an empty list "[]" would yield int("") and raise
            # ValueError — confirm the API never sends an empty prior group
            # when df_test_prev is non-empty.
            answered_correctly = df_test.iloc[0]["prior_group_answers_correct"]
            user_answer = df_test.iloc[0]["prior_group_responses"]

            df_test_prev["answered_correctly"] = [
                int(x) for x in answered_correctly.replace("[", "").replace(
                    "'", "").replace("]", "").replace(" ", "").split(",")
            ]
            df_test_prev["user_answer"] = [
                int(x)
                for x in user_answer.replace("[", "").replace("'", "").replace(
                    "]", "").replace(" ", "").split(",")
            ]
            # df_test_prev = df_test_prev.drop(prior_columns, axis=1)

            pipeline.fit(df_test_prev)
        # Fetch & compute features for the current batch.

        # logger.info(f"[time: {int(time.time() - t)}dataload")
        logger.info(f"transform... ")
        df = pipeline.partial_transform(df_test)
        logger.info(f"other... ")
        # Backfill any feature column the models expect but the pipeline
        # did not produce, using a sentinel value.
        cols = models[0].feature_name()
        for col in cols:
            if col not in df.columns:
                df[col] = -99999

        # predict: simple mean ensemble over all loaded models.
        # NOTE(review): `cols` is recomputed here identically to above —
        # redundant but harmless.
        predicts = []
        cols = models[0].feature_name()
        for model in models:
            predicts.append(model.predict(df[cols]))

        df["answered_correctly"] = np.array(predicts).transpose().mean(axis=1)
        df_sample_prediction = pd.merge(df_sample_prediction[["row_id"]],
                                        df[["row_id", "answered_correctly"]],
                                        how="inner")
        env.predict(df_sample_prediction)
        # Hold this batch (restricted to the model feature columns) until
        # its true answers arrive with the next API iteration.
        df_test_prev = df[cols]