def get_kiccho_embeddings(
    exp: str,
    test_df: pd.DataFrame,
    num_workers: Optional[int] = None,
    batch_size: Optional[int] = None,
    image_dir: Optional[str] = None,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    input
        exp: experiment name. The model information used for inference is
            written in the inference_config section of expxxx.yaml
        test_df: test.csv loaded with pd.read_csv
    output
        features, img_features, txt_features: embeddings from the first
            checkpoint, each of shape (len(test_df) x linear_out)
    """
    args = ArgsUtil.get_args(EnvEnum.KAGGLE, exp, [])
    print(args)
    config = ConfigFactory.get_config_from_yaml_file(args.exp, args.env, True)
    data = DataFactory.load_data(config)
    data.test = test_df
    data, config = Pp.image_path(data, config)
    data, config = Pp.label_group_le(data, config)
    data, config = Pp.split_folds(data, config)
    data, config = Pp.kurupical_fold(data, config)
    if image_dir is not None:
        data.test["image_path"] = data.test["image"].map(
            lambda i: f"{image_dir}/{i}")

    # Accumulate per-batch features for every checkpoint in the ensemble.
    features = []
    img_features = []
    txt_features = []
    for epoch_config in config.inference_config.epoch_configs:
        _config = ConfigFactory.get_config_from_yaml_file(
            epoch_config.dataloader_exp, args.env, False)
        test_dataloader = DataLoaderFactory.get_test_dataloader(
            data, _config, num_workers=num_workers, batch_size=batch_size)
        _features, _img_features, _txt_features = InferenceFactory.epoch(
            args.env, epoch_config, test_dataloader, data)
        features += _features
        img_features += _img_features
        txt_features += _txt_features
        del _features
        del _img_features
        del _txt_features
        gc.collect()

    # Stack each checkpoint's batch outputs into one (len(test_df), dim) array.
    for i in range(len(features)):
        features[i] = np.concatenate(features[i])
        img_features[i] = np.concatenate(img_features[i])
        txt_features[i] = np.concatenate(txt_features[i])
        print(f"features[{i}].shape:", features[i].shape)
        print(f"img_features[{i}].shape:", img_features[i].shape)
        print(f"txt_features[{i}].shape:", txt_features[i].shape)

    exps: List[str] = []
    for epoch_config in config.inference_config.epoch_configs:
        for model_checkpoint in epoch_config.model_checkpoints:
            _exp = model_checkpoint.split("_")[0]
            exps.append(_exp)
    TestUtil.assert_any(len(exps), len(features))

    return features[0], img_features[0], txt_features[0]
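# Minimal usage sketch (the experiment name and Kaggle paths below are
# assumptions for illustration, not values from this repo). Guarded so
# importing this module stays side-effect free.
if __name__ == "__main__":
    test_df = pd.read_csv("../input/shopee-product-matching/test.csv")
    features, img_features, txt_features = get_kiccho_embeddings(
        "exp001",  # hypothetical experiment name
        test_df,
        num_workers=2,
        batch_size=16,
        image_dir="../input/shopee-product-matching/test_images",
    )
    print(features.shape)  # -> (len(test_df), linear_out)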
# %%
import sys
from pathlib import Path

sys.path.append(str(Path(__file__).parents[2]))
from kaggle_shopee.factories.config_factory import ConfigFactory, EnvEnum
from kaggle_shopee.factories.data_factory import Data, DataFactory
from kaggle_shopee.factories.preprocessing import Pp
from kaggle_shopee.utils.args_util import ArgsUtil

args = ArgsUtil.get_args(EnvEnum.LOCAL, "exp003", [])
config = ConfigFactory.get_config_from_yaml_file(args.exp, args.env, False)
data = DataFactory.load_data(config)
data, config = Pp.main(data, config)

# %%
import re

import pandas as pd

pd.set_option("display.max_colwidth", None)

unit = [
    "GR",
    "GM",
    "KG",
    "KILO",
    "MG",
    "LITRE",
    "ML",
import sys
from pathlib import Path

sys.path.append(str(Path(__file__).parents[2]))
import numpy as np
import torch
import torch.cuda
from kaggle_shopee.factories.config_factory import Config, ConfigFactory, EnvEnum
from kaggle_shopee.factories.data_factory import Data, DataFactory
from kaggle_shopee.factories.dataloader_factory import DataLoaderFactory
from kaggle_shopee.factories.inference_factory import InferenceFactory
from kaggle_shopee.factories.preprocessing import Pp
from kaggle_shopee.utils.args_util import ArgsUtil
from kaggle_shopee.utils.file_util import FileUtil
from kaggle_shopee.utils.global_util import GlobalUtil
from kaggle_shopee.utils.mlflow_util import MlflowUtil

args = ArgsUtil.get_args()
print(args)
config = ConfigFactory.get_config_from_yaml_file(args.exp, args.env, True)
print(config.inference_config)
GlobalUtil.seed_everything(config.seed)
data = DataFactory.load_data(config)
data, config = Pp.main(data, config)

MlflowUtil.start_run(config.mlflow_config, config.exp, config.name, True)
MlflowUtil.log_params_config(config)

for fold in range(config.cv_config.n_splits):
    if fold not in args.folds:
        continue
    print(f"======================= fold {fold} =======================")
    features = []
    posting_ids = train_df["posting_id"].values
    batch_idxs = get_batch_idxs(len(posting_ids), 20)
    positives, positive_dict = get_positives(train_df)
    negatives = get_negatives(
        posting_ids, features, batch_idxs, positive_dict, num_negatives)
    positive_df = pd.DataFrame(positives, columns=["posting_id", "p_posting_id"])
    negative_df = pd.DataFrame(negatives, columns=["posting_id", "n_posting_id"])
    positive_dict = (
        positive_df.groupby("posting_id")["p_posting_id"].unique().to_dict())
    negative_dict = (
        negative_df.groupby("posting_id")["n_posting_id"].unique().to_dict())
    del positive_df
    del negative_df
    return positive_dict, negative_dict


if __name__ == "__main__":
    offline_mining_exp = "exp373"
    epoch = 9
    fold = 0
    num_negatives = 3
    args = ArgsUtil.get_args(EnvEnum.COLAB, "exp383", [0])
    print(args)
    config = ConfigFactory.get_config_from_yaml_file(args.exp, env=args.env)
    positive_dict, negative_dict = MiningFactory.get_triplets(
        config.dir_config, offline_mining_exp, epoch, fold, num_negatives)
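    # Shape of the mined dictionaries, sketched with made-up posting ids:
    # each posting_id maps to an array of posting_ids; positives share its
    # label_group, negatives are the hardest non-matches in embedding space.
    #   positive_dict["train_129225211"] -> array(["train_2278313361", ...])
    #   negative_dict["train_129225211"] -> array of num_negatives ids
    print("positives:", len(positive_dict), "negatives:", len(negative_dict))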
def get_kiccho_embeddings(
    exp: str,
    test_df: pd.DataFrame,
    num_workers: Optional[int] = None,
    batch_size: Optional[int] = None,
    image_dir: Optional[str] = None,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    input
        exp: experiment name. The model information used for inference is
            written in the inference_config section of expxxx.yaml
        test_df: test.csv loaded with pd.read_csv
    output
        all_embeddings, img_embeddings, text_embeddings
    """
    args = ArgsUtil.get_args(EnvEnum.KAGGLE, exp, [])
    device = "cuda"
    print(args)
    config = ConfigFactory.get_config_from_yaml_file(args.exp, args.env, verbose=False)
    data = DataFactory.load_data(config)
    data.test = test_df.copy()
    data, config = Pp.image_path(data, config)
    data, config = Pp.label_group_le(data, config)
    data, config = Pp.split_folds(data, config)
    data, config = Pp.kurupical_fold(data, config)
    if image_dir is not None:
        data.test["image_path"] = data.test["image"].map(
            lambda i: f"{image_dir}/{i}")

    model_checkpoint = config.inference_config.epoch_configs[0].model_checkpoints[0]
    if args.env == EnvEnum.KAGGLE:
        # Checkpoints uploaded to Kaggle have "=" stripped from their file names.
        model_checkpoint = model_checkpoint.replace("=", "")
    print("load model:", model_checkpoint)
    model = lit_models.ShopeeLitModel.load_from_checkpoint(
        str(config.dir_config.checkpoint_dir / model_checkpoint),
        data=data,
        config=config,
        fold=-1,
        with_mlflow=False,
        bert_path=str(config.dir_config.dataset_dir /
                      config.model_config.bert_model_arch),
        is_test=True,
    ).model.to(device)
    model.eval()

    test_dataloader = DataLoaderFactory.get_test_dataloader(
        data, config, num_workers=num_workers, batch_size=batch_size)

    # Run inference batch by batch, storing embeddings as float16 to save RAM.
    img_features = []
    text_features = []
    all_features = []
    with torch.no_grad():
        for batch in tqdm(test_dataloader):
            img = batch["img"].to(device)
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            all_feature, img_feature, text_feature = model(
                img, input_ids, attention_mask)
            all_features.extend(
                all_feature.detach().cpu().numpy().astype(np.float16))
            img_features.extend(
                img_feature.detach().cpu().numpy().astype(np.float16))
            text_features.extend(
                text_feature.detach().cpu().numpy().astype(np.float16))

    img_features = np.array(img_features, dtype=np.float16)
    text_features = np.array(text_features, dtype=np.float16)
    all_features = np.array(all_features, dtype=np.float16)

    # Free GPU and host memory before returning (important inside Kaggle kernels).
    del data
    del model
    del test_dataloader.dataset
    del test_dataloader
    gc.collect()
    torch.cuda.empty_cache()
    return all_features, img_features, text_features
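# Downstream sketch: the returned embeddings are typically L2-normalized and
# fed to a cosine kNN search to propose candidate matches. sklearn is used
# here purely as an assumption for illustration; it is not necessarily what
# this repo uses, and the experiment name/path are hypothetical.
if __name__ == "__main__":
    from sklearn.neighbors import NearestNeighbors

    test_df = pd.read_csv("../input/shopee-product-matching/test.csv")
    all_features, img_features, text_features = get_kiccho_embeddings(
        "exp001", test_df)  # hypothetical experiment name
    emb = all_features.astype(np.float32)              # float16 -> float32 for stable math
    emb /= np.linalg.norm(emb, axis=1, keepdims=True)  # L2-normalize rows
    nn = NearestNeighbors(n_neighbors=50, metric="cosine").fit(emb)
    distances, indices = nn.kneighbors(emb)            # candidate matches per posting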
    @staticmethod
    def image_phash_match_posting_ids(
        data: Data, config: Config
    ) -> Tuple[Data, Config]:
        # Map each row to all posting_ids that share its image_phash.
        _map = data.train.groupby("image_phash")["posting_id"].unique()
        data.train["image_phash_match_posting_ids"] = data.train["image_phash"].map(
            _map
        )
        return data, config

    @staticmethod
    @TimeUtil.timer_wrapper
    def main(data: Data, config: Config) -> Tuple[Data, Config]:
        data, config = Fe.image_phash_match_posting_ids(data, config)
        return data, config


args = ArgsUtil.get_args(env=EnvEnum.LOCAL, exp="exp003")
print(args)
config = ConfigFactory.get_config_from_yaml_file(args.exp)
GlobalUtil.seed_everything(config.seed)
data = DataFactory.load_data(config)
if len(data.test) > 3:
    config.is_submitting = True
data, config = Pp.main(data, config)
# data, config = Fe.main(data, config)

# %%
data.sample_submission

# %%
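# %%
# Toy illustration of the image_phash_match_posting_ids feature; the rows
# below are invented for this example, not real competition data.
import pandas as pd

_toy = pd.DataFrame({
    "posting_id": ["train_1", "train_2", "train_3"],
    "image_phash": ["aaaa", "aaaa", "bbbb"],
})
_toy_map = _toy.groupby("image_phash")["posting_id"].unique()
_toy["image_phash_match_posting_ids"] = _toy["image_phash"].map(_toy_map)
_toy
# Postings sharing a phash receive each other's ids: train_1 <-> train_2,
# while train_3 maps only to itself.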