Example #1
# %%
import sys
from pathlib import Path

sys.path.append(str(Path(__file__).parents[2]))


from kaggle_shopee.factories.config_factory import ConfigFactory, EnvEnum
from kaggle_shopee.factories.data_factory import Data, DataFactory
from kaggle_shopee.factories.preprocessing import Pp
from kaggle_shopee.utils.args_util import ArgsUtil

args = ArgsUtil.get_args(EnvEnum.LOCAL, "exp003", [])
config = ConfigFactory.get_config_from_yaml_file(args.exp, args.env, False)
data = DataFactory.load_data(config)
data, config = Pp.main(data, config)

# %%
import re

import pandas as pd

pd.set_option("display.max_colwidth", None)
unit = [
    "GR",
    "GM",
    "KG",
    "KILO",
    "MG",
    "LITRE",
    "ML",
# %%
import gc
from typing import List, Optional, Tuple

import numpy as np

# DataLoaderFactory, InferenceFactory and TestUtil are also required below.
# They come from the kaggle_shopee package, but their exact module paths are
# not shown in the original snippet.


def get_kiccho_embeddings(
    exp: str,
    test_df: pd.DataFrame,
    num_workers: Optional[int] = None,
    batch_size: Optional[int] = None,
    image_dir: Optional[str] = None,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    input
        exp: experiment name (e.g. exp001). The models to use are listed under
            inference_config in expxxx.yaml.
        test_df: test.csv read with pd.read_csv

    output
        features, img_features, txt_features: combined, image-only and
            text-only embeddings for the first checkpoint, each of shape
            (len(test_df), linear_out)
    """
    args = ArgsUtil.get_args(EnvEnum.KAGGLE, exp, [])

    print(args)
    config = ConfigFactory.get_config_from_yaml_file(args.exp, args.env, True)
    data = DataFactory.load_data(config)

    data.test = test_df
    data, config = Pp.image_path(data, config)
    data, config = Pp.label_group_le(data, config)
    data, config = Pp.split_folds(data, config)
    data, config = Pp.kurupical_fold(data, config)

    if image_dir is not None:
        data.test["image_path"] = data.test["image"].map(
            lambda i: f"{image_dir}/{i}")

    features = []
    img_features = []
    txt_features = []
    for epoch_config in config.inference_config.epoch_configs:
        _config = ConfigFactory.get_config_from_yaml_file(
            epoch_config.dataloader_exp, args.env, False)
        test_dataloader = DataLoaderFactory.get_test_dataloader(
            data, _config, num_workers=num_workers, batch_size=batch_size)
        _features, _img_features, _txt_features = InferenceFactory.epoch(
            args.env, epoch_config, test_dataloader, data)
        features += _features
        img_features += _img_features
        txt_features += _txt_features
        del _features
        del _img_features
        del _txt_features
        gc.collect()
    for i in range(len(features)):
        features[i] = np.concatenate(features[i])
        img_features[i] = np.concatenate(img_features[i])
        txt_features[i] = np.concatenate(txt_features[i])
        print(f"features[{i}].shape:", features[i].shape)
        print(f"img_features[{i}].shape:", img_features[i].shape)
        print(f"txt_features[{i}].shape:", txt_features[i].shape)

    exps: List[str] = []
    for epoch_config in config.inference_config.epoch_configs:
        for model_checkpoint in epoch_config.model_checkpoints:
            _exp = model_checkpoint.split("_")[0]
            exps.append(_exp)

    TestUtil.assert_any(len(exps), len(features))

    return features[0], img_features[0], txt_features[0]
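
# %%
# Minimal usage sketch for get_kiccho_embeddings above. The CSV path is an
# assumption based on the Kaggle Shopee competition data layout and is not
# part of the original snippet; "exp003" is the experiment name used at the
# top of this file.
test_df = pd.read_csv("../input/shopee-product-matching/test.csv")
feats, img_feats, txt_feats = get_kiccho_embeddings(
    "exp003", test_df, num_workers=2, batch_size=16)
print(feats.shape, img_feats.shape, txt_feats.shape)
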
Example #3
import gc
from typing import Optional, Tuple

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

# ArgsUtil, EnvEnum, ConfigFactory, DataFactory, Pp, DataLoaderFactory and
# lit_models come from the kaggle_shopee package (Example #1 shows several of
# these imports; the remaining module paths are not shown in the original
# snippet).


def get_kiccho_embeddings(
    exp: str,
    test_df: pd.DataFrame,
    num_workers: Optional[int] = None,
    batch_size: Optional[int] = None,
    image_dir: Optional[str] = None,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    input
        exp: 実験名。expxxx.yamlのinference_configに使うモデルの情報を書く
        test_df: test.csvをpd.read_csvで読んだもの

    output
        all_embeddings, img_embeddings, text_embeddings
    """
    args = ArgsUtil.get_args(EnvEnum.KAGGLE, exp, [])

    device = "cuda"

    print(args)
    config = ConfigFactory.get_config_from_yaml_file(args.exp,
                                                     args.env,
                                                     verbose=False)
    data = DataFactory.load_data(config)

    data.test = test_df.copy()
    data, config = Pp.image_path(data, config)
    data, config = Pp.label_group_le(data, config)
    data, config = Pp.split_folds(data, config)
    data, config = Pp.kurupical_fold(data, config)

    if image_dir is not None:
        data.test["image_path"] = data.test["image"].map(
            lambda i: f"{image_dir}/{i}")

    model_checkpoint = config.inference_config.epoch_configs[
        0].model_checkpoints[0]
    if args.env == EnvEnum.KAGGLE:
        model_checkpoint = model_checkpoint.replace("=", "")

    print("load model:", model_checkpoint)
    model = lit_models.ShopeeLitModel.load_from_checkpoint(
        str(config.dir_config.checkpoint_dir / model_checkpoint),
        data=data,
        config=config,
        fold=-1,
        with_mlflow=False,
        bert_path=str(config.dir_config.dataset_dir /
                      config.model_config.bert_model_arch),
        is_test=True,
    ).model.to(device)
    model.eval()

    test_dataloader = DataLoaderFactory.get_test_dataloader(
        data, config, num_workers=num_workers, batch_size=batch_size)

    img_features = []
    text_features = []
    all_features = []
    with torch.no_grad():
        for batch in tqdm(test_dataloader):
            img = batch["img"].to(device)
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            all_feature, img_feature, text_feature = model(
                img, input_ids, attention_mask)

            all_features.extend(all_feature.detach().cpu().numpy().astype(
                np.float16))
            img_features.extend(img_feature.detach().cpu().numpy().astype(
                np.float16))
            text_features.extend(text_feature.detach().cpu().numpy().astype(
                np.float16))

    img_features = np.array(img_features, dtype=np.float16)
    text_features = np.array(text_features, dtype=np.float16)
    all_features = np.array(all_features, dtype=np.float16)

    del data
    del model
    del test_dataloader.dataset
    del test_dataloader
    gc.collect()
    torch.cuda.empty_cache()
    return all_features, img_features, text_features
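
# Minimal usage sketch for get_kiccho_embeddings above. The CSV/image paths
# are assumptions based on the Kaggle Shopee competition data layout, the
# experiment name is illustrative, and the cosine-similarity step is an
# assumed downstream use; none of this is part of the original snippet.
test_df = pd.read_csv("../input/shopee-product-matching/test.csv")
all_emb, img_emb, txt_emb = get_kiccho_embeddings(
    "exp003",
    test_df,
    batch_size=16,
    image_dir="../input/shopee-product-matching/test_images",
)

# Pairwise cosine similarity on the combined embeddings (cast the float16
# arrays back to float32 before the matmul).
emb = torch.nn.functional.normalize(
    torch.from_numpy(all_emb.astype(np.float32)), dim=1)
sim = emb @ emb.T  # shape: (len(test_df), len(test_df))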