Esempio n. 1
0
    def generate_outer_feature(self):
        """Build a fastText embedding feature (64 dims) for each article id.

        Trains (or loads from ``self.fast_model_path``) a fastText model over
        per-user sequences of viewed article ids, then maps every
        ``kiji_id_raw`` in ``self.df_outer`` to its embedding vector.

        Returns:
            pd.DataFrame with columns ``kiji_wv_0`` .. ``kiji_wv_63`` plus
            ``self.merge_key`` (the raw article id) for joining.
        """
        train_df, _ = read_data()
        test_df = read_data(test=True)
        all_df = pd.concat([train_df, test_df], ignore_index=True)

        # One "document" per user: the sequence of article ids that user saw.
        # groupby is a single O(N) pass; the previous per-user boolean mask
        # was O(users * rows).
        docs = [ids.values for _, ids in all_df.groupby('user_id')['kiji_id']]

        # Collapse rare ids (seen < 5 times) into the shared token 'None' so
        # the embedding does not memorize near-unique ids. Use a set for O(1)
        # membership tests inside to_word.
        vc = all_df['kiji_id'].value_counts()
        to_none_ids = set(vc[vc < 5].index)

        def to_word(d):
            # Map a raw article id to its vocabulary token.
            return 'None' if d in to_none_ids else d

        if os.path.exists(self.fast_model_path):
            model = FastText.load(self.fast_model_path)
        else:
            docs = [[to_word(w) for w in doc] for doc in docs]
            with timer(logger,
                       format_str='create kiji_id fast_model' + ' {:.3f}[s]'):
                model = FastText(docs, workers=6, size=64)
            model.save(self.fast_model_path)

        # Look up the embedding for every outer row's article id.
        z = self.df_outer['kiji_id_raw'].map(to_word).map(
            lambda x: model.wv[x])
        df = pd.DataFrame(np.array(z.values.tolist())).add_prefix('kiji_wv_')
        df[self.merge_key] = self.df_outer['kiji_id_raw']
        return df
Esempio n. 2
0
    def load_parsed_docs(self):
        """Parse ``self.text_column`` into SWEM vectors, with a disk cache.

        Returns:
            tuple ``(x, idx_none)`` where ``x`` is the SWEM feature matrix for
            the non-null documents and ``idx_none`` is a boolean mask (over
            the original rows) marking documents that normalized to None.
        """
        if os.path.exists(self.cache_path):
            # Old caches were dumped as a list; unpack so both the cached and
            # the fresh path return the same (x, idx_none) tuple.
            x, idx_none = joblib.load(self.cache_path)
            return x, idx_none

        df = self.df_outer
        text_data = df[self.text_column]

        # 'keywords' is stored as a serialized list; flatten to one string.
        if self.text_column == 'keywords':
            text_data = text_data.map(keyword_text_to_list)
            text_data = [' '.join(d) for d in text_data]

        with timer(logger=logger,
                   format_str=self.text_column + ' parse context {:.3f}[s]'):
            title_docs = [safe_normalize(d) for d in text_data]
            # Explicit identity test instead of elementwise `== None` (E711).
            idx_none = np.array([d is None for d in title_docs])
            title_docs = np.array(title_docs)[~idx_none]
            parser = DocumentParser()
            parsed = [parser.call(s) for s in title_docs]

            swem = SWEM(NikkeiFastText.load_model(), aggregation=self.agg)
            x = swem.transform(parsed)

        joblib.dump((x, idx_none), self.cache_path)
        return x, idx_none
Esempio n. 3
0
    def run_oof_train(self, X, y, default_params,
                      n_fold: Union[int, None] = None,
                      silent=False) -> ([List[PrePostProcessModel], np.ndarray]):
        """
        Main out-of-fold training loop.

        Args:
            X:
                training array.
            y:
                target array.
            default_params:
                default model parameters, passed to the model constructor
                (not to fit). To change fit parameters such as `eval_metric`,
                override get_fit_params_on_each_fold.
            n_fold:
                number of folds to fit. None means fit every fold.
                    * n_fold = None: run all folds
                    * n_fold = 1: stop after one fold
                    * n_fold > num_cv: run all folds
                    * n_fold <= 0: fit nothing; return an empty model list
                      and an all-zero out-of-fold vector
        Returns:
            list of fitted models and the out-of-fold prediction array.
        """
        oof = np.zeros_like(y, dtype=np.float32)
        models = []

        for fold_idx, (trn_idx, val_idx) in enumerate(self.get_fold_splitting(X, y)):
            # Early stop once the requested number of folds has been fitted.
            if n_fold is not None and fold_idx >= max(0, n_fold):
                self.logger.info(f'Stop K-Fold at {fold_idx}')
                break

            self.logger.info(f'start k-fold: {fold_idx + 1}/{self.num_cv}')

            x_trn, y_trn = X[trn_idx], y[trn_idx]
            x_val, y_val = X[val_idx], y[val_idx]

            with timer(self.logger,
                       format_str=f'Fold: {fold_idx + 1}/{self.num_cv}' + ' {:.1f}[s]'):
                # Per-fold artifacts are written only when recording is on
                # and the caller did not ask for a silent run.
                if self.is_recording and not silent:
                    fold_dir = os.path.join(self.output_dir, f'fold_{fold_idx:02d}')
                else:
                    fold_dir = None
                clf = self._fit_model(x_trn, y_trn,
                                      default_params=default_params,
                                      validation_set=(x_val, y_val),
                                      indexes_set=(trn_idx, val_idx),
                                      output_dir=fold_dir)

            # Regression models yield a flat vector; classifiers yield the
            # positive-class probability.
            if self.is_regression_model:
                fold_pred = clf.predict(x_val).reshape(-1)
            else:
                fold_pred = clf.predict(x_val, prob=True)[:, 1]

            oof[val_idx] = fold_pred
            models.append(clf)

        return models, oof
Esempio n. 4
0
    def call(self, df_source, y=None, test=False):
        """Run every atom in the molecule and concatenate their features.

        Args:
            df_source: source dataframe passed to each atom's generate().
            y: target passed through to each atom (may be None at test time).
            test: when True, load the persisted molecule first and skip
                re-saving it afterwards.

        Returns:
            pd.DataFrame of all atom outputs concatenated column-wise.
        """
        if test:
            self.load_molecule()

        # Collect every atom's output and concatenate once at the end:
        # calling pd.concat inside the loop copies the accumulated frame on
        # every iteration (quadratic in the number of atoms).
        feature_frames = []
        for atom in self.molecule.atoms:
            with timer(self.logger, format_str=f'{str(atom)} ' + '{:.3f}[s]'):
                feature_frames.append(atom.generate(df_source, y))

        # No atoms -> empty frame, matching the original accumulator start.
        out_df = pd.concat(feature_frames, axis=1) if feature_frames else pd.DataFrame()

        if not test and self.is_recording:
            joblib.dump(self.molecule, self.molecule_path)

        return out_df
Esempio n. 5
0
    def load_model(cls):
        """Return the shared fastText model, memoizing it on the class.

        Loads from disk when a saved model exists; otherwise trains a fresh
        one and persists it. Either way the result is cached in
        ``cls.cache_model`` so later calls are free.
        """
        if cls.cache_model is not None:
            cls.logger.info('return from cache')
            return cls.cache_model

        if not os.path.exists(cls.path_to_model):
            # First run: train from scratch, then persist for next time.
            with timer(cls.logger, format_str='create fasttext: ' + '{:.3f}'):
                model = cls.create_fast_model()
            model.save(cls.path_to_model)
        else:
            cls.logger.info('model already created. load from disk.')
            model = FastText.load(cls.path_to_model)

        cls.cache_model = model
        return model
    def call(self, df_source, y=None, test=False):
        """K-fold fit on the stacked features (train) or predict (test).

        Args:
            df_source: feature dataframe; a 'user_id' column, if present, is
                stashed (train only) and dropped before fitting.
            y: target array (required when test is False).
            test: when True, delegate to the already-fitted models.

        Returns:
            pd.DataFrame of out-of-fold predictions named ``str(self)``.
        """
        if not test:
            self.user_id_group = df_source['user_id'].values

        if 'user_id' in df_source.columns:
            del df_source['user_id']
        if test:
            return self._predict_trained_models(df_source)

        x_train, y_train = df_source.values, y

        # NOTE: the out-of-fold buffer must be a float dtype; with an int
        # dtype the assigned prediction probabilities would truncate to 0.
        pred_train = None
        params = self.get_best_model_parameters(x_train, y_train)

        for i, ((x_i, y_i), (x_valid, y_valid), (_, idx_valid)) in enumerate(
                self.get_folds(
                    x_train, y_train,
                    groups=None)):  # [NOTE] Group KFold is not Supported yet
            self.logger.info('start k-fold: {}/{}'.format(i + 1, self.num_cv))

            with timer(self.logger,
                       format_str='Fold: {}/{}'.format(i + 1, self.num_cv) +
                       ' {:.1f}[s]'):
                clf = self.fit_model(x_i,
                                     y_i,
                                     params,
                                     x_valid=x_valid,
                                     y_valid=y_valid,
                                     cv=i)

                if self.is_regression_model:
                    # Keep predictions 2-D (n, 1): the allocation below reads
                    # pred_i.shape[1], which raised IndexError on the old
                    # 1-D reshape(-1) output.
                    pred_i = clf.predict(x_valid).reshape(-1, 1)
                else:
                    pred_i = clf.predict(x_valid, prob=True)

            if pred_train is None:
                # np.float was removed in NumPy 1.24; float32 matches the
                # precision note above and the run_oof_train buffer.
                pred_train = np.zeros(shape=(len(y), pred_i.shape[1]),
                                      dtype=np.float32)
            pred_train[idx_valid] = pred_i
            self.fitted_models.append(clf)

        self.finish_fit = True
        df_train = pd.DataFrame(pred_train, columns=[str(self)])
        return df_train
Esempio n. 7
0
    def generate_outer_feature(self):
        with timer(logger, format_str=self.text_column + ' load {:.3f}[s]'):
            x, idx_none = self.load_parsed_docs()

        if self.train:
            clf_pca = PCA(n_components=self.n_components)
            clf_pca.fit(x)
            self.clf_pca_ = clf_pca

        transformed = self.clf_pca_.transform(x)
        retval = np.zeros(shape=(len(self.df_outer), self.n_components))
        retval[~idx_none] = transformed
        out_df = pd.DataFrame(retval,
                              columns=[
                                  f'swem_{self.agg}_{self.text_column}_' +
                                  str(i) for i in range(self.n_components)
                              ])
        out_df[self.merge_key] = self.df_outer[[
            'kiji_id_raw'
        ]].rename(columns=dict(kiji_id_raw=self.merge_key))
        return out_df