# Example no. 1 (score: 0)
def main(params: dict, output_dir: str) -> None:
    """Fit the feature factories on the riiid train data and pickle them.

    Loads the merged training frame, fits a ``FeatureFactoryManager`` and a
    ``FeatureFactoryForTransformer`` — first in bulk on the head of the
    frame, then incrementally per ``(user_id, task_container_id)`` group —
    and dumps both fitted objects (loggers stripped) into ``output_dir``.

    NOTE(review): relies on names not visible in this chunk and presumably
    defined at module level: ``is_debug``, ``is_make_feature_factory``,
    ``model_id``, ``pd``, ``json``, ``pickle``, ``tqdm``, ``get_logger``,
    and the feature-factory classes — confirm they are imported/defined
    above this function.
    """
    import mlflow  # NOTE(review): imported but never used in this function
    print("start params={}".format(params))
    logger = get_logger()
    df = pd.read_pickle(
        "../input/riiid-test-answer-prediction/train_merged.pickle")
    # df = pd.read_pickle("../input/riiid-test-answer-prediction/split10/train_0.pickle").sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        # Keep debug runs fast by working on a small prefix only.
        df = df.head(500000)
    # Replace missing values with a -1 sentinel so the column can be
    # handled as a plain category downstream.
    df["prior_question_had_explanation"] = df[
        "prior_question_had_explanation"].fillna(-1)
    # Column -> feature-type configuration consumed by
    # FeatureFactoryForTransformer. A tuple key means those columns are
    # combined into a single categorical id.
    column_config = {
        ("content_id", "content_type_id"): {
            "type": "category"
        },
        "user_answer": {
            "type": "leakage_feature"
        },
        "answered_correctly": {
            "type": "leakage_feature"
        },
        "part": {
            "type": "category"
        },
        "prior_question_elapsed_time_bin300": {
            "type": "category"
        },
        "duration_previous_content_bin300": {
            "type": "category"
        },
        "prior_question_had_explanation": {
            "type": "category"
        },
        "rating_diff_content_user_id": {
            "type": "numeric"
        }
    }

    # Persist the transformer hyper-parameters next to the pickled factories.
    with open(f"{output_dir}/transformer_param.json", "w") as f:
        json.dump(params, f)
    if is_make_feature_factory:
        # feature factory
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"][
            "DurationPreviousContent"] = DurationPreviousContent(
                is_partial_fit=True)
        feature_factory_dict["user_id"][
            "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_dict["user_id"][
            "UserContentRateEncoder"] = UserContentRateEncoder(
                rate_func="elo", column="user_id")
        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict,
            logger=logger,
            split_num=1,
            model_id=model_id,
            load_feature=not is_debug,
            save_feature=not is_debug)

        ff_for_transformer = FeatureFactoryForTransformer(
            column_config=column_config,
            dict_path="../feature_engineering/",
            sequence_length=params["max_seq"],
            logger=logger)
        ff_for_transformer.make_dict(df=df)
        # Rows must be in chronological order per user before fitting.
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        # Bulk-fit on the first 95k rows, then stream the remainder
        # group-by-group: predict first (leakage columns dropped), then fit
        # on the same rows with labels restored — mimicking inference order.
        feature_factory_manager.fit(df.iloc[:95000].copy())
        w_df = feature_factory_manager.all_predict(df.iloc[:95000].copy())
        ff_for_transformer.fit(w_df)
        for _, w_df in tqdm(df.iloc[95000:].groupby(
            ["user_id", "task_container_id"])):
            ww_df = feature_factory_manager.partial_predict(
                w_df.drop(["answered_correctly", "user_answer"], axis=1))
            # NOTE(review): `group` is assigned but never read; presumably
            # partial_predict is called for its internal state update.
            group = ff_for_transformer.partial_predict(ww_df)

            ww_df["answered_correctly"] = w_df["answered_correctly"]
            ww_df["user_answer"] = w_df["user_answer"]
            feature_factory_manager.fit(ww_df)
            ff_for_transformer.fit(ww_df)
        # Loggers are not picklable / not wanted in the artifacts: strip
        # them from every factory before dumping.
        for dicts in feature_factory_manager.feature_factory_dict.values():
            for factory in dicts.values():
                factory.logger = None
        feature_factory_manager.logger = None
        with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f:
            pickle.dump(feature_factory_manager, f)

        ff_for_transformer.logger = None
        with open(
                f"{output_dir}/feature_factory_manager_for_transformer.pickle",
                "wb") as f:
            pickle.dump(ff_for_transformer, f)
# Example no. 2 (score: 0)
def main(params: dict) -> None:
    """Consistency check between ``all_predict`` and ``partial_predict``.

    Builds a per-user 90/10 train/validation split, encodes the data once
    in bulk (``all_predict``) and once row-by-row (``partial_predict`` +
    incremental ``fit``), dumps both encodings to CSV, and asserts they
    are identical.

    NOTE(review): relies on names not visible in this chunk and presumably
    defined at module level: ``is_debug``, ``device``, ``pd``, ``np``,
    ``tqdm``, ``get_logger``, ``FeatureFactoryForTransformer``,
    ``SAKTDataset``, ``DataLoader`` — confirm they are imported/defined
    above this function.
    """
    import mlflow  # NOTE(review): imported but never used in this function
    logger = get_logger()
    print("start params={}".format(params))
    df = pd.read_pickle(
        "../input/riiid-test-answer-prediction/split10/train_0.pickle"
    ).sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(3000)
        df.to_csv("aaa.csv")
    # Rows must be in chronological order per user for sequence encoding.
    df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)

    df = df[[
        "user_id", "content_id", "content_type_id", "part",
        "answered_correctly"
    ]]

    # Split: ~10% of users go entirely to validation; for the remaining
    # users the last 10% of their history is held out.
    train_idx = []
    val_idx = []
    np.random.seed(0)
    for _, w_df in df.groupby("user_id"):
        if np.random.random() < 0.1:
            # all val
            val_idx.extend(w_df.index.tolist())
        else:
            train_num = int(len(w_df) * 0.9)
            train_idx.extend(w_df[:train_num].index.tolist())
            val_idx.extend(w_df[train_num:].index.tolist())

    df["is_val"] = 0
    # BUG FIX: the original used chained assignment
    # (df["is_val"].loc[val_idx] = 1), which writes through a temporary
    # Series — it raises SettingWithCopyWarning and is not guaranteed to
    # modify df (it silently does nothing under pandas copy-on-write).
    # A single .loc call assigns directly into the frame.
    df.loc[val_idx, "is_val"] = 1

    ff_for_transformer = FeatureFactoryForTransformer(
        column_config={
            ("content_id", "content_type_id"): {
                "type": "category"
            },
            "part": {
                "type": "category"
            }
        },
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    # Vocabulary size for the (content_id, content_type_id) embedding.
    n_skill = len(ff_for_transformer.embbed_dict[("content_id",
                                                  "content_type_id")])

    # ---- bulk path: encode everything in one pass ----
    group = ff_for_transformer.all_predict(df)
    dataset_val = SAKTDataset(group,
                              is_test=True,
                              n_skill=n_skill,
                              max_seq=params["max_seq"])

    dataloader_val = DataLoader(dataset_val,
                                batch_size=1024,
                                shuffle=False,
                                num_workers=1)

    xs = []
    target_ids = []
    parts = []
    labels = []
    for d in tqdm(dataloader_val):
        xs.extend(d[0].to(device).long().data.cpu().numpy().tolist())
        target_ids.extend(d[1].to(device).long().data.cpu().numpy().tolist())
        parts.extend(d[2].to(device).long().data.cpu().numpy().tolist())
        labels.extend(d[3].to(device).long().data.cpu().numpy().tolist())
    print(xs[0])
    print(target_ids[0])
    print(parts[0])
    print(labels[0])

    # ---- incremental path: fresh factory, fit on train, then feed the
    # validation rows one at a time (predict, then fit) as at inference ----
    ff_for_transformer = FeatureFactoryForTransformer(
        column_config={
            ("content_id", "content_type_id"): {
                "type": "category"
            },
            "part": {
                "type": "category"
            }
        },
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)

    ff_for_transformer.fit(df.loc[train_idx])

    xs_test = []
    target_ids_test = []
    parts_test = []
    labels_test = []
    for i in tqdm(range(len(df.loc[val_idx]))):
        w_df = df.loc[val_idx[i:i + 1]]
        group = ff_for_transformer.partial_predict(w_df.copy())

        dataset_val = SAKTDataset(group,
                                  predict_mode=True,
                                  n_skill=n_skill,
                                  max_seq=params["max_seq"])

        dataloader_val = DataLoader(dataset_val,
                                    batch_size=1024,
                                    shuffle=False,
                                    num_workers=1)
        for d in dataloader_val:
            xs_test.extend(d[0].to(device).long().data.cpu().numpy().tolist())
            target_ids_test.extend(
                d[1].to(device).long().data.cpu().numpy().tolist())
            parts_test.extend(
                d[2].to(device).long().data.cpu().numpy().tolist())
            labels_test.extend(
                d[3].to(device).long().data.cpu().numpy().tolist())
        ff_for_transformer.fit(w_df.copy())

    # Dump both encodings for manual diffing, then assert equality.
    pd.DataFrame(xs).to_csv("all_xs.csv")
    pd.DataFrame(xs_test).to_csv("partial_xs.csv")
    pd.DataFrame(labels).to_csv("all_labels.csv")
    pd.DataFrame(labels_test).to_csv("partial_labels.csv")
    df.loc[val_idx].to_csv("raw_data.csv")
    assert xs[0] == xs_test[0]
    assert xs == xs_test
    assert target_ids == target_ids_test
    assert parts == parts_test