Example #1
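
# NOTE: these snippets are excerpts from a larger Kaggle-kernel module; the
# imports below are an assumed reconstruction based on how names are used in
# the code. `config`, `utils`, `nn_training`, `backborn_chains`,
# `global_pooling_chains`, and `classifer_chains` are project-local modules,
# and `convert_parquet_to_images` / `init_model` are helpers defined elsewhere
# in the same source.
import gc
import os
from pathlib import PosixPath

import numpy as np
import pandas as pd
import yaml
from chainer import datasets, functions, serializers
from sklearn.metrics import recall_score
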
def preprocess_parquet_files(process_train: bool, process_test: bool) -> None:
    """Read parquet files and convert them into images."""
    if process_train:
        # # train
        print("preprocess train parquet files")
        if os.path.isdir(config.TRAIN_IMAGES_DIR):
            print("train images dir already exists!")
        else:
            os.mkdir(config.TRAIN_IMAGES_DIR)
            for i in range(config.PARUET_FILE_NUM):
                pqt_file_name = "train_image_data_{}.parquet".format(i)
                parquet_file_path = config.RAW_DATA / pqt_file_name
                with utils.timer(
                        "convert {} to png files.".format(pqt_file_name)):
                    convert_parquet_to_images(parquet_file_path,
                                              config.TRAIN_IMAGES_DIR)

    if process_test:
        # # test
        print("preprocess test parquet files")
        if os.path.isdir(config.TEST_IMAGES_DIR):
            print("test images dir already exists!")
        else:
            os.mkdir(config.TEST_IMAGES_DIR)
            for i in range(config.PARUET_FILE_NUM):
                pqt_file_name = "test_image_data_{}.parquet".format(i)
                parquet_file_path = config.RAW_DATA / pqt_file_name
                with utils.timer(
                        "convert {} to png files.".format(pqt_file_name)):
                    convert_parquet_to_images(parquet_file_path,
                                              config.TEST_IMAGES_DIR)
Example #2
def preprocess_meta_info_files(process_train: bool,
                               process_test: bool) -> None:
    """Preprocess Train and Test Meta Info."""
    if process_train:
        with utils.timer("preprocess train meta file"):
            train = pd.read_csv(config.RAW_DATA / "train.csv")
            # # K-fold split.
            train["character_id"] = train.apply(
                lambda row: "{:0>3}_{:0>2}_{}".format(
                    row["grapheme_root"], row["vowel_diacritic"], row[
                        "consonant_diacritic"]),
                axis=1)

            labels_arr = pd.get_dummies(train[[
                "grapheme_root", "vowel_diacritic", "consonant_diacritic"
            ]],
                                        columns=[
                                            "grapheme_root", "vowel_diacritic",
                                            "consonant_diacritic"
                                        ]).values

            train["fold"] = -1
            for fold_id, (train_idx, valid_idx) in enumerate(
                    utils.multi_label_stratified_group_k_fold(
                        train.character_id.values, labels_arr, config.FOLD_NUM,
                        config.RANDAM_SEED)):
                train.loc[valid_idx, "fold"] = fold_id

            train.to_csv(config.PROC_DATA /
                         "train_add-{}fold-index.csv".format(config.FOLD_NUM),
                         index=False)

    if process_test:
        with utils.timer("preprocess test meta file"):
            test = pd.read_csv(config.RAW_DATA / "test.csv")
            test_proc = pd.DataFrame(
                {"image_id": test.image_id.drop_duplicates().values})
            test_proc["grapheme_root"] = 0
            test_proc["vowel_diacritic"] = 0
            test_proc["consonant_diacritic"] = 0
            test_proc.to_csv(config.PROC_DATA / "test_reshaped.csv",
                             index=False)
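
# A minimal usage sketch (hypothetical invocation, not part of the original
# snippet): both preprocessing steps would typically be run once before
# training or inference, e.g.
#
#     preprocess_parquet_files(process_train=True, process_test=True)
#     preprocess_meta_info_files(process_train=True, process_test=True)
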
def inference(trained_path: PosixPath,
              output_path: PosixPath,
              epoch_of_model: int = -1,
              gpu_device: int = -1,
              batch_size: int = 64,
              inference_valid: bool = False):
    """Inference function for kernel."""
    # # read settings from training outputs directory.
    with open((trained_path / "settings.yml").as_posix(), "r") as fr:
        settings = yaml.safe_load(fr)

    # # make dataset
    # # # read meta info.
    with utils.timer("make val dataset"):
        val_dataset = test_dataset = None
        if inference_valid:
            train_df = pd.read_csv(
                config.PROC_DATA /
                "train_add-{}fold-index.csv".format(settings["n_folds"]))
            # # # # make label arr
            train_labels_arr = train_df[config.COMP_NAMES].values.astype("i")

            # # # # make chainer dataset
            val_mask = (train_df["fold"] == settings["val_fold"]).values
            val_dataset = datasets.LabeledImageDataset(
                pairs=list(
                    zip((train_df.loc[val_mask, "image_id"] + ".png").tolist(),
                        train_labels_arr[val_mask, ...])),
                root=config.TRAIN_IMAGES_DIR.as_posix())
            # # # # set transform
            val_dataset = datasets.TransformDataset(
                val_dataset,
                nn_training.ImageTransformer(settings["inference_transforms"]))

    # # # test set
    with utils.timer("make test dataset"):
        test_df = pd.read_csv(config.PROC_DATA / "test_reshaped.csv")
        sample_sub = pd.read_csv(config.RAW_DATA / "sample_submission.csv")

        # # # # make chainer dataset
        test_dataset = datasets.LabeledImageDataset(
            pairs=list(
                zip((test_df["image_id"] + ".png").tolist(),
                    ([-1] * len(test_df)))),
            root=config.TEST_IMAGES_DIR.as_posix())
        # # # # set transform
        test_dataset = datasets.TransformDataset(
            test_dataset,
            nn_training.ImageTransformer(settings["inference_transforms"]))

    with utils.timer("init and load model"):
        # # initialize model.
        settings["backborn_kwargs"]["pretrained_model_path"] = None
        model = nn_training.ImageClassificationModel(
            extractor=getattr(
                backborn_chains,
                settings["backborn_class"])(**settings["backborn_kwargs"]),
            global_pooling=None if settings["pooling_class"] is None else
            getattr(global_pooling_chains, settings["pooling_class"])(
                **settings["pooling_kwargs"]),
            classifier=getattr(
                classifer_chains,
                settings["head_class"])(**settings["head_kwargs"]))
        # # load model.
        model_path = trained_path / "model_snapshot_{}.npz".format(
            epoch_of_model)
        print(model_path)
        # fall back to the last-epoch snapshot when no specific epoch is
        # requested or its snapshot file does not exist.
        if epoch_of_model == -1 or not os.path.isfile(model_path):
            model_path = trained_path / "model_snapshot_last_epoch.npz"

        print("use model: {}".format(model_path))

        serializers.load_npz(model_path, model)
        if gpu_device != -1:
            model.to_gpu(gpu_device)
        gc.collect()

    settings["batch_size"] = batch_size
    _, val_iter, test_iter = nn_training.create_iterator(
        settings, None, val_dataset, test_dataset)

    if inference_valid:
        with utils.timer("inference validation set"):
            val_pred, val_label = nn_training.inference_test_data(
                model, val_iter, gpu_device=gpu_device)
            np.save(
                output_path /
                "val_pred_arr_fold{}".format(settings["val_fold"]), val_pred)
            # # calc score
            score_list = [[] for _ in range(2)]

            for i in range(len(config.N_CLASSES)):
                start, end = config.COMP_INDEXS[i], config.COMP_INDEXS[i + 1]
                y_pred_subset = val_pred[:, start:end].argmax(axis=1)
                y_true_subset = val_label[:, i]
                score_list[0].append(
                    recall_score(y_true_subset,
                                 y_pred_subset,
                                 average='macro',
                                 zero_division=0))
                score_list[1].append(
                    recall_score(y_true_subset,
                                 y_pred_subset,
                                 average='macro',
                                 zero_division=1))

            del val_dataset
            del val_iter
            del val_pred
            del val_label
            del y_pred_subset
            del y_true_subset

            gc.collect()
            score_list[0].append(np.average(score_list[0], weights=[2, 1, 1]))
            score_list[1].append(np.average(score_list[1], weights=[2, 1, 1]))

            score_df = pd.DataFrame(score_list,
                                    columns=config.COMP_NAMES + ["score"])

            print("[score for validation set]")
            print(score_df)
            score_df.to_csv(output_path / "score.csv", index=False)

    with utils.timer("inference test set"):
        test_pred, test_label = nn_training.inference_test_data(
            model, test_iter, gpu_device=gpu_device)
        del test_label

        np.save(
            output_path / "test_pred_arr_fold{}".format(settings["val_fold"]),
            test_pred)

    with utils.timer("make submission"):
        # # # arg max for each component.
        for i, c_name in enumerate(config.COMP_NAMES):
            start, end = config.COMP_INDEXS[i], config.COMP_INDEXS[i + 1]
            test_pred_subset = test_pred[:, start:end].argmax(axis=1)
            test_df[c_name] = test_pred_subset

        del test_pred
        gc.collect()

        # # # reshape test_df to submission format.
        melt_df = pd.melt(test_df,
                          id_vars="image_id",
                          value_vars=config.COMP_NAMES,
                          value_name="target")
        melt_df["row_id"] = melt_df["image_id"] + "_" + melt_df["variable"]

        submission_df = pd.merge(sample_sub[["row_id"]],
                                 melt_df[["row_id", "target"]],
                                 on="row_id",
                                 how="left")

        submission_df.to_csv(output_path / "submission.csv", index=False)
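
# A usage sketch for single-model inference (paths and device id are
# hypothetical placeholders, not taken from the snippet):
#
#     inference(trained_path=PosixPath("./trained/fold0"),
#               output_path=PosixPath("./output"),
#               epoch_of_model=-1, gpu_device=0, batch_size=64,
#               inference_valid=True)
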
def inference_by_snapshot_ensemble(trained_path: PosixPath,
                                   output_path: PosixPath,
                                   gpu_device: int = -1,
                                   batch_size: int = 64):
    """Inference function for kernel."""
    # # read settings from training outputs directory.
    with open((trained_path / "settings.yml").as_posix(), "r") as fr:
        settings = yaml.safe_load(fr)

    # # make dataset
    # # # test set
    with utils.timer("make test dataset"):
        test_df = pd.read_csv(config.PROC_DATA / "test_reshaped.csv")
        sample_sub = pd.read_csv(config.RAW_DATA / "sample_submission.csv")

        # # # # make chainer dataset
        test_dataset = datasets.LabeledImageDataset(
            pairs=list(
                zip((test_df["image_id"] + ".png").tolist(),
                    ([-1] * len(test_df)))),
            root=config.TEST_IMAGES_DIR.as_posix())
        # # # # set transform
        test_dataset = datasets.TransformDataset(
            test_dataset,
            nn_training.ImageTransformer(settings["inference_transforms"]))

    # # # prepare model paths
    model_path_list = []
    model_weight = []
    for epoch_of_model in range(settings["epoch_per_cycle"],
                                settings["max_epoch"] + 1,
                                settings["epoch_per_cycle"]):
        model_path = trained_path / "model_snapshot_{}.npz".format(
            epoch_of_model)
        if os.path.isfile(model_path):
            model_path_list.append(model_path)
            model_weight.append(1)

    if len(model_path_list) == 0:
        model_path_list.append(trained_path / "model_snapshot_last_epoch.npz")
        model_weight.append(1)
    print("[using models]")
    print(model_path_list)

    # # # prepare preds numpy.ndarray of shape: (n_model, n_test, n_class)
    test_preds_arr = np.zeros(
        (len(model_path_list), len(test_df), sum(config.N_CLASSES)), dtype="f")

    # # inference
    with utils.timer("inference test set"):
        for idx, model_path in enumerate(model_path_list):
            # # # create iterator.
            test_iter = nn_training.create_iterator(settings, None, None,
                                                    test_dataset)[-1]
            # # # init and load model
            model = init_model(settings)
            serializers.load_npz(model_path, model)
            # # # move model to gpu (guarded, as in `inference` above)
            if gpu_device != -1:
                model.to_gpu(gpu_device)
            # # # inference
            test_preds_arr[idx] = nn_training.inference_test_data(
                model, test_iter, gpu_device=gpu_device)[0]
            del test_iter
            del model
            gc.collect()
        del test_dataset

    np.save(
        output_path / "test_all_preds_arr_fold{}".format(settings["val_fold"]),
        test_preds_arr)

    # # ensemble (weighted averaging)
    with utils.timer("snapshot ensemble"):
        # # # convert logits to probs
        for i in range(len(config.N_CLASSES)):
            start, end = config.COMP_INDEXS[i], config.COMP_INDEXS[i + 1]
            # softmax over the class axis (axis=-1); chainer's
            # functions.softmax defaults to axis=1, which for this
            # (n_model, n_test, n_class) array would be the sample axis.
            test_preds_arr[..., start:end] = functions.softmax(
                test_preds_arr[..., start:end], axis=-1).data

        test_pred = np.average(test_preds_arr, axis=0, weights=model_weight)
        np.save(
            output_path / "test_pred_arr_fold{}".format(settings["val_fold"]),
            test_pred)

    with utils.timer("make submission"):
        # # convert prob to pred id
        for i, c_name in enumerate(config.COMP_NAMES):
            start, end = config.COMP_INDEXS[i], config.COMP_INDEXS[i + 1]
            test_pred_subset = test_pred[:, start:end].argmax(axis=1)
            test_df[c_name] = test_pred_subset

        del test_pred_subset
        del test_pred
        gc.collect()

        # # # reshape test_df to submission format.
        melt_df = pd.melt(test_df,
                          id_vars="image_id",
                          value_vars=config.COMP_NAMES,
                          value_name="target")
        melt_df["row_id"] = melt_df["image_id"] + "_" + melt_df["variable"]

        submission_df = pd.merge(sample_sub[["row_id"]],
                                 melt_df[["row_id", "target"]],
                                 on="row_id",
                                 how="left")

        submission_df.to_csv(output_path / "submission.csv", index=False)
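
# Standalone sketch of the per-component "softmax, then weighted average"
# ensembling performed above, in plain NumPy. The component sizes
# (168 / 11 / 7 classes for grapheme_root / vowel_diacritic /
# consonant_diacritic) and the equal snapshot weights are illustrative
# assumptions rather than values read from the snippet.
import numpy as np


def _softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)


def ensemble_logits(logits, n_classes, weights):
    """Average per-model class probabilities.

    logits: (n_model, n_sample, sum(n_classes)) -> (n_sample, sum(n_classes))
    """
    bounds = np.cumsum([0] + list(n_classes))
    probs = logits.copy()
    for lo, hi in zip(bounds[:-1], bounds[1:]):
        # softmax each component block over its own class axis
        probs[..., lo:hi] = _softmax(probs[..., lo:hi], axis=-1)
    # weighted average over the model (snapshot) axis
    return np.average(probs, axis=0, weights=weights)


_demo_pred = ensemble_logits(
    logits=np.random.randn(3, 5, 168 + 11 + 7).astype("f"),
    n_classes=(168, 11, 7),
    weights=(1, 1, 1))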