def get_kiccho_embeddings(
    exp: str,
    test_df: pd.DataFrame,
    num_workers: Optional[int] = None,
    batch_size: Optional[int] = None,
    image_dir: Optional[str] = None,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    input
        exp: 実験名。expxxx.yamlのinference_configに使うモデルの情報を書く
        test_df: test.csvをpd.read_csvで読んだもの

    output
        embeddings: Dict[str, np.ndarray]
            key: 実験名 (exp001とか)
            value: embedding. shapeは ( len(test_df) x linear_out )
    """
    args = ArgsUtil.get_args(EnvEnum.KAGGLE, exp, [])

    print(args)
    config = ConfigFactory.get_config_from_yaml_file(args.exp, args.env, True)
    data = DataFactory.load_data(config)

    data.test = test_df
    data, config = Pp.image_path(data, config)
    data, config = Pp.label_group_le(data, config)
    data, config = Pp.split_folds(data, config)
    data, config = Pp.kurupical_fold(data, config)

    if image_dir is not None:
        data.test["image_path"] = data.test["image"].map(
            lambda i: f"{image_dir}/{i}")

    features = []
    img_features = []
    txt_features = []
    for epoch_config in config.inference_config.epoch_configs:
        _config = ConfigFactory.get_config_from_yaml_file(
            epoch_config.dataloader_exp, args.env, False)
        test_dataloader = DataLoaderFactory.get_test_dataloader(
            data, _config, num_workers=num_workers, batch_size=batch_size)
        _features, _img_features, _txt_features = InferenceFactory.epoch(
            args.env, epoch_config, test_dataloader, data)
        features += _features
        img_features += _img_features
        txt_features += _txt_features
        del _features
        del _img_features
        del _txt_features
        gc.collect()
    for i in range(len(features)):
        features[i] = np.concatenate(features[i])
        img_features[i] = np.concatenate(img_features[i])
        txt_features[i] = np.concatenate(txt_features[i])
        print(f"features[{i}].shape:", features[i].shape)
        print(f"img_features[{i}].shape:", img_features[i].shape)
        print(f"txt_features[{i}].shape:", txt_features[i].shape)

    exps: List[str] = []
    for epoch_config in config.inference_config.epoch_configs:
        for model_checkpoint in epoch_config.model_checkpoints:
            _exp = model_checkpoint.split("_")[0]
            exps.append(_exp)

    TestUtil.assert_any(len(exps), len(features))

    return features[0], img_features[0], txt_features[0]
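A minimal usage sketch for the function above (not part of the original snippet): load test.csv, request embeddings for one experiment, and L2-normalize the combined embedding so dot products become cosine similarities. The CSV path and experiment name are placeholder assumptions.

import numpy as np
import pandas as pd

# Hypothetical driver code; the path and experiment name are assumptions.
test_df = pd.read_csv("../input/shopee-product-matching/test.csv")
features, img_features, txt_features = get_kiccho_embeddings(
    "exp001", test_df, num_workers=2, batch_size=16)

# L2-normalize so that features @ features.T gives cosine similarities.
features = features / np.linalg.norm(features, axis=1, keepdims=True)
print(features.shape)  # (len(test_df), linear_out)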
Example #2
# %%
import sys
from pathlib import Path

sys.path.append(str(Path(__file__).parents[2]))


from kaggle_shopee.factories.config_factory import ConfigFactory, EnvEnum
from kaggle_shopee.factories.data_factory import Data, DataFactory
from kaggle_shopee.factories.preprocessing import Pp
from kaggle_shopee.utils.args_util import ArgsUtil

args = ArgsUtil.get_args(EnvEnum.LOCAL, "exp003", [])
config = ConfigFactory.get_config_from_yaml_file(args.exp, args.env, False)
data = DataFactory.load_data(config)
data, config = Pp.main(data, config)

# %%
import re

import pandas as pd

pd.set_option("display.max_colwidth", None)
unit = [
    "GR",
    "GM",
    "KG",
    "KILO",
    "MG",
    "LITRE",
    "ML",
Example #3
import sys
from pathlib import Path

sys.path.append(str(Path(__file__).parents[2]))

import numpy as np
import torch
import torch.cuda
from kaggle_shopee.factories.config_factory import Config, ConfigFactory, EnvEnum
from kaggle_shopee.factories.data_factory import Data, DataFactory
from kaggle_shopee.factories.dataloader_factory import DataLoaderFactory
from kaggle_shopee.factories.inference_factory import InferenceFactory
from kaggle_shopee.factories.preprocessing import Pp
from kaggle_shopee.utils.args_util import ArgsUtil
from kaggle_shopee.utils.file_util import FileUtil
from kaggle_shopee.utils.global_util import GlobalUtil
from kaggle_shopee.utils.mlflow_util import MlflowUtil

args = ArgsUtil.get_args()
print(args)

config = ConfigFactory.get_config_from_yaml_file(args.exp, args.env, True)
print(config.inference_config)
GlobalUtil.seed_everything(config.seed)
data = DataFactory.load_data(config)
data, config = Pp.main(data, config)

MlflowUtil.start_run(config.mlflow_config, config.exp, config.name, True)
MlflowUtil.log_params_config(config)
for fold in range(config.cv_config.n_splits):
    if fold not in args.folds:
        continue
    print(f"======================= fold {fold} =======================")
    features = []
Example #4
        posting_ids = train_df["posting_id"].values
        batch_idxs = get_batch_idxs(len(posting_ids), 20)
        positives, positive_dict = get_positives(train_df)
        negatives = get_negatives(posting_ids, features, batch_idxs,
                                  positive_dict, num_negatives)
        positive_df = pd.DataFrame(positives,
                                   columns=["posting_id", "p_posting_id"])
        negative_df = pd.DataFrame(negatives,
                                   columns=["posting_id", "n_posting_id"])
        positive_dict = (positive_df.groupby("posting_id")
                         ["p_posting_id"].unique().to_dict())
        negative_dict = (negative_df.groupby("posting_id")
                         ["n_posting_id"].unique().to_dict())
        del positive_df
        del negative_df
        return positive_dict, negative_dict


if __name__ == "__main__":
    offline_mining_exp = "exp373"
    epoch = 9
    fold = 0
    num_negatives = 3

    args = ArgsUtil.get_args(EnvEnum.COLAB, "exp383", [0])
    print(args)
    config = ConfigFactory.get_config_from_yaml_file(args.exp, env=args.env)

    positive_dict, negative_dict = MiningFactory.get_triplets(
        config.dir_config, offline_mining_exp, epoch, fold, num_negatives)
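The mining fragment above relies on helpers (get_batch_idxs, get_positives, get_negatives) that are not shown. Below is a hedged sketch of what the batching helper might look like, splitting N rows into contiguous index ranges so the similarity search can run in chunks; this is an illustrative guess, not the repository's implementation.

from typing import List, Tuple

import numpy as np


def get_batch_idxs(n: int, n_batches: int) -> List[Tuple[int, int]]:
    # Split range(n) into contiguous (start, end) pairs of roughly equal size.
    batch_size = int(np.ceil(n / n_batches))
    return [(start, min(start + batch_size, n))
            for start in range(0, n, batch_size)]


print(get_batch_idxs(10, 3))  # [(0, 4), (4, 8), (8, 10)]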
Example #5
def get_kiccho_embeddings(
    exp: str,
    test_df: pd.DataFrame,
    num_workers: Optional[int] = None,
    batch_size: Optional[int] = None,
    image_dir: Optional[str] = None,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    input
        exp: 実験名。expxxx.yamlのinference_configに使うモデルの情報を書く
        test_df: test.csvをpd.read_csvで読んだもの

    output
        all_embeddings, img_embeddings, text_embeddings
    """
    args = ArgsUtil.get_args(EnvEnum.KAGGLE, exp, [])

    device = "cuda"

    print(args)
    config = ConfigFactory.get_config_from_yaml_file(args.exp,
                                                     args.env,
                                                     verbose=False)
    data = DataFactory.load_data(config)

    data.test = test_df.copy()
    data, config = Pp.image_path(data, config)
    data, config = Pp.label_group_le(data, config)
    data, config = Pp.split_folds(data, config)
    data, config = Pp.kurupical_fold(data, config)

    if image_dir is not None:
        data.test["image_path"] = data.test["image"].map(
            lambda i: f"{image_dir}/{i}")

    model_checkpoint = config.inference_config.epoch_configs[
        0].model_checkpoints[0]
    if args.env == EnvEnum.KAGGLE:
        model_checkpoint = model_checkpoint.replace("=", "")

    print("load model:", model_checkpoint)
    model = lit_models.ShopeeLitModel.load_from_checkpoint(
        str(config.dir_config.checkpoint_dir / model_checkpoint),
        data=data,
        config=config,
        fold=-1,
        with_mlflow=False,
        bert_path=str(config.dir_config.dataset_dir /
                      config.model_config.bert_model_arch),
        is_test=True,
    ).model.to(device)
    model.eval()

    test_dataloader = DataLoaderFactory.get_test_dataloader(
        data, config, num_workers=num_workers, batch_size=batch_size)

    img_features = []
    text_features = []
    all_features = []
    with torch.no_grad():
        for batch in tqdm(test_dataloader):
            img = batch["img"].to(device)
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            all_feature, img_feature, text_feature = model(
                img, input_ids, attention_mask)

            all_features.extend(all_feature.detach().cpu().numpy().astype(
                np.float16))
            img_features.extend(img_feature.detach().cpu().numpy().astype(
                np.float16))
            text_features.extend(text_feature.detach().cpu().numpy().astype(
                np.float16))

    img_features = np.array(img_features, dtype=np.float16)
    text_features = np.array(text_features, dtype=np.float16)
    all_features = np.array(all_features, dtype=np.float16)

    del data
    del model
    del test_dataloader.dataset
    del test_dataloader
    gc.collect()
    torch.cuda.empty_cache()
    return all_features, img_features, text_features
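A hedged sketch of how the returned embeddings might be consumed downstream for product matching: L2-normalize and threshold pairwise cosine similarities. The threshold value and the helper name are illustrative assumptions.

import numpy as np


def cosine_matches(embeddings: np.ndarray, threshold: float = 0.7):
    # Normalize rows, build the full (N, N) cosine-similarity matrix, and
    # return, for each row, the indices whose similarity exceeds the threshold.
    emb = embeddings.astype(np.float32)
    emb = emb / np.linalg.norm(emb, axis=1, keepdims=True)
    sims = emb @ emb.T
    return [np.where(row > threshold)[0] for row in sims]


# all_features, img_features, text_features = get_kiccho_embeddings("exp001", test_df)
# matches = cosine_matches(all_features)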
Example #6
    @staticmethod
    def image_phash_match_posting_ids(
        data: Data, config: Config
    ) -> Tuple[Data, Config]:
        _map = data.train.groupby("image_phash")["posting_id"].unique()
        data.train["image_phash_match_posting_ids"] = data.train["image_phash"].map(
            _map
        )
        return data, config

    @staticmethod
    @TimeUtil.timer_wrapper
    def main(data: Data, config: Config) -> Tuple[Data, Config]:
        data, config = Fe.image_phash_match_posting_ids(data, config)
        return data, config


args = ArgsUtil.get_args(env=EnvEnum.LOCAL, exp="exp003")
print(args)

config = ConfigFactory.get_config_from_yaml_file(args.exp)
GlobalUtil.seed_everything(config.seed)
data = DataFactory.load_data(config)
if len(data.test) > 3:
    config.is_submitting = True
data, config = Pp.main(data, config)
# data, config = Fe.main(data, config)

# %%
data.sample_submission


# %%
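For reference, a small self-contained illustration of the groupby/map logic inside Fe.image_phash_match_posting_ids above; the toy DataFrame is made up for the example.

import pandas as pd

toy = pd.DataFrame({
    "posting_id": ["p1", "p2", "p3"],
    "image_phash": ["aaa", "aaa", "bbb"],
})
_map = toy.groupby("image_phash")["posting_id"].unique()
toy["image_phash_match_posting_ids"] = toy["image_phash"].map(_map)
print(toy)
# p1 and p2 share phash "aaa", so both get ["p1", "p2"]; p3 gets ["p3"].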