def title_tfidf_reduced(data: Data, config: Config) -> Tuple[Data, Config]:
    """Add TF-IDF features of the ``title`` column to train and test.

    The vectorizer is fitted on the concatenation of train and test titles
    so both splits share one vocabulary.  Depending on
    ``config.title_tfidf_reducer`` the raw TF-IDF columns are either kept
    as-is (``ReducerEnum.NOTHING``) or projected down to
    ``config.title_tfidf_n_components`` dimensions.

    Returns the mutated ``(data, config)`` pair.
    """
    tfidf = TfidfVectorizer(min_df=10)
    # Fit on train + test together so both splits get identical columns.
    _df = pd.concat([data.train, data.test], axis=0).reset_index(drop=True)
    tfidf.fit(_df["title"])
    _train_encoded = tfidf.transform(data.train["title"]).todense()
    _test_encoded = tfidf.transform(data.test["title"]).todense()
    _encoded = np.concatenate([_train_encoded, _test_encoded], axis=0)
    TestUtil.assert_any(_train_encoded.shape[1], _test_encoded.shape[1])
    TestUtil.assert_any(_train_encoded.shape[1], _encoded.shape[1])

    if config.title_tfidf_reducer == ReducerEnum.NOTHING:
        # No reduction: keep every TF-IDF dimension and record the actual
        # width back into the config so downstream code sees the real count.
        config.title_tfidf_n_components = _encoded.shape[1]
        for i in tqdm(range(config.title_tfidf_n_components)):
            data.train[f"title_tfidf_{i}"] = _train_encoded[:, i]
            data.test[f"title_tfidf_{i}"] = _test_encoded[:, i]
        return data, config

    # BUG(review): the original code read
    #   `if reducer == ReducerEnum.PCA: PCA(...) else: PCA(...)`
    # i.e. both branches built the same model, so the else-branch was dead
    # duplicate code.  Collapsed here without changing behavior: every
    # reducer other than NOTHING currently falls back to PCA.  If another
    # reducer (e.g. TruncatedSVD/UMAP) was intended, implement it here.
    reduce_model = PCA(n_components=config.title_tfidf_n_components)

    # Fit the reducer on the combined matrix, then transform each split
    # separately so row counts stay aligned with the dataframes.
    reduce_model.fit(_encoded)
    _train_reduced = reduce_model.transform(_train_encoded)
    _test_reduced = reduce_model.transform(_test_encoded)
    TestUtil.assert_any(_train_reduced.shape[1], _test_reduced.shape[1])
    TestUtil.assert_any(config.title_tfidf_n_components,
                        _train_reduced.shape[1])
    for i in tqdm(range(config.title_tfidf_n_components)):
        data.train[f"title_tfidf_{i}"] = _train_reduced[:, i]
        data.test[f"title_tfidf_{i}"] = _test_reduced[:, i]
    return data, config
# Ejemplo n.º 2
# 0
 def image_path(data: Data, config: Config) -> Tuple[Data, Config]:
     data.train["image_path"] = data.train["image"].map(
         lambda i: str(config.dir_config.train_images_dir / i)
     )
     data.test["image_path"] = data.test["image"].map(
         lambda i: str(config.dir_config.test_images_dir / i)
     )
     return data, config
 def kurupical_fold(data: Data, config: Config) -> Tuple[Data, Config]:
     if config.env != EnvEnum.KAGGLE:
         data.train = data.train.merge(
             data.train_fold[["posting_id",
                              "fold"]].rename({"fold": "kurupical_fold"},
                                              axis=1),
             on="posting_id",
         )
     else:
         data.train["kurupical_fold"] = data.train["fold"]
     return data, config
    def title_preprocessed(data: Data, config: Config) -> Tuple[Data, Config]:
        def string_escape(s, encoding="utf-8"):
            return (s.encode(
                "latin1")  # To bytes, required by 'unicode-escape'
                    .decode("unicode-escape"
                            )  # Perform the actual octal-escaping decode
                    .encode("latin1")  # 1:1 mapping back to bytes
                    .decode(encoding))  # Decode original encoding

        data.train["title_preprocessed"] = data.train["title"].map(
            string_escape)
        data.test["title_preprocessed"] = data.test["title"].map(string_escape)
        return data, config
 def split_folds(data: Data, config: Config) -> Tuple[Data, Config]:
     folds = GroupKFold(n_splits=config.cv_config.n_splits)
     data.train["fold"] = -1
     for fold, (_, valid_idx) in enumerate(
             folds.split(data.train, None, data.train["label_group"])):
         data.train.loc[valid_idx, "fold"] = fold
     return data, config
# Ejemplo n.º 6
# 0
 def image_phash_match_posting_ids(
     data: Data, config: Config
 ) -> Tuple[Data, Config]:
     _map = data.train.groupby("image_phash")["posting_id"].unique()
     data.train["image_phash_match_posting_ids"] = data.train["image_phash"].map(
         _map
     )
     return data, config
 def shuffle(data: Data, config: Config) -> Tuple[Data, Config]:
     data.train = data.train.sample(
         frac=1, random_state=config.seed).reset_index(drop=True)
     return data, config
 def title_num_str(data: Data, config: Config) -> Tuple[Data, Config]:
     data.train = title_number_to_str(data.train)
     data.test = title_number_to_str(data.test)
     return data, config
 def target(data: Data, config: Config) -> Tuple[Data, Config]:
     tmp = data.train.groupby("label_group")["posting_id"].unique()
     data.train["target"] = data.train["label_group"].map(tmp)
     return data, config
 def label_group_le(data: Data, config: Config) -> Tuple[Data, Config]:
     le = LabelEncoder()
     data.train["label_group_le"] = le.fit_transform(
         data.train["label_group"])
     return data, config