def generate_outer_feature(self):
    """Create FastText embedding features for each article (kiji) id.

    Builds one "document" per user (the sequence of article ids they read,
    over train + test), trains or loads a FastText model on those
    sequences, and returns a DataFrame with one 64-dim ``kiji_wv_*``
    vector per row of ``self.df_outer``, keyed by ``self.merge_key``.

    Returns:
        pd.DataFrame: columns ``kiji_wv_0..kiji_wv_63`` plus the merge key.
    """
    train_df, _ = read_data()
    test_df = read_data(test=True)
    all_df = pd.concat([train_df, test_df], ignore_index=True)

    # One document per user. groupby is a single O(n) pass; filtering the
    # whole frame once per user (the previous approach) is O(users * n).
    # sort=False keeps users in order of first appearance, matching the
    # original unique()-based iteration order.
    docs = [
        g.values
        for _, g in all_df.groupby('user_id', sort=False)['kiji_id']
    ]

    vc = all_df['kiji_id'].value_counts()
    # Rare ids (< 5 occurrences) collapse to the sentinel token 'None'.
    # A set gives O(1) membership tests inside the per-token loop below.
    to_none_ids = set(vc[vc < 5].index)

    def to_word(d):
        # Map a raw article id to its vocabulary token.
        return 'None' if d in to_none_ids else d

    if os.path.exists(self.fast_model_path):
        model = FastText.load(self.fast_model_path)
    else:
        docs = [[to_word(w) for w in doc] for doc in docs]
        with timer(logger,
                   format_str='create kiji_id fast_model' + ' {:.3f}[s]'):
            model = FastText(docs, workers=6, size=64)
        model.save(self.fast_model_path)

    # Look up the embedding for every outer row and expand it to columns.
    z = self.df_outer['kiji_id_raw'].map(to_word).map(
        lambda x: model.wv[x])
    df = pd.DataFrame(np.array(z.values.tolist())).add_prefix('kiji_wv_')
    df[self.merge_key] = self.df_outer['kiji_id_raw']
    return df
def load_parsed_docs(self):
    """Parse and embed the text column, caching the result on disk.

    Returns:
        (x, idx_none): SWEM embedding matrix for the non-None rows, and a
        boolean mask over ``self.df_outer`` marking rows whose normalized
        text was None (those rows are absent from ``x``).
        NOTE(review): the cache hit returns the joblib-loaded *list*
        ``[x, idx_none]`` rather than a tuple — callers that unpack are
        unaffected, but the types differ between paths.
    """
    if os.path.exists(self.cache_path):
        return joblib.load(self.cache_path)
    df = self.df_outer
    text_data = df[self.text_column]
    if self.text_column == 'keywords':
        # Keywords arrive as an encoded string; convert each entry to a
        # list, then join to a single space-separated string.
        text_data = text_data.map(keyword_text_to_list)
        text_data = [' '.join(d) for d in text_data]
    with timer(logger=logger,
               format_str=self.text_column + ' parse context {:.3f}[s]'):
        title_docs = [safe_normalize(d) for d in text_data]
        title_docs = np.array(title_docs)
        # Intentional elementwise numpy comparison: yields a boolean mask
        # of rows whose normalized text is None. Do NOT change to `is None`.
        idx_none = title_docs == None
        title_docs = title_docs[~idx_none]
        parser = DocumentParser()
        parsed = [parser.call(s) for s in title_docs]
        swem = SWEM(NikkeiFastText.load_model(), aggregation=self.agg)
        x = swem.transform(parsed)
    joblib.dump([x, idx_none], self.cache_path)
    return x, idx_none
def run_oof_train(self, X, y, default_params,
                  n_fold: Union[int, None] = None,
                  silent=False) -> ([List[PrePostProcessModel], np.ndarray]):
    """
    main training loop.

    Args:
        X: training array.
        y: target array
        default_params: model parameter using by default. pass to model constructor (not fit)
            If you change fit parameter like `eval_metric`, override get_fit_params_on_each_fold.
        n_fold: Number of fold to fit. If set None, learn for all folds.
            If set number, stop fit model reach to the value.
            * if n_fold = None, run all folds
            * if n_fold = 1, stop one fold.
            * if n_fold > num_cv, run all folds
            * if n_fold <= 0, no fold run, return empty list and zero vector out-of-fold
        silent: if True, per-fold artifacts are not written to disk.

    Returns:
        list of fitted models and out-of-fold numpy array.
    """
    # float32 so that assigning predicted probabilities is not truncated
    # (zeros_like on an int target would yield an int array).
    oof = np.zeros_like(y, dtype=np.float32)
    splits = self.get_fold_splitting(X, y)
    models = []
    for i, (idx_train, idx_valid) in enumerate(splits):
        # Early-stop the loop once the requested number of folds is done.
        if n_fold is not None and i >= max(0, n_fold):
            self.logger.info(f'Stop K-Fold at {i}')
            break
        self.logger.info('start k-fold: {}/{}'.format(i + 1, self.num_cv))
        X_i, y_i = X[idx_train], y[idx_train]
        X_valid, y_valid = X[idx_valid], y[idx_valid]
        with timer(self.logger,
                   format_str='Fold: {}/{}'.format(i + 1, self.num_cv) + ' {:.1f}[s]'):
            # No output dir when recording is off or silent mode is on;
            # parses as `None if ((not self.is_recording) or silent) else ...`.
            output_i = None if not self.is_recording or silent else os.path.join(
                self.output_dir, f'fold_{i:02d}')
            clf = self._fit_model(X_i, y_i,
                                  default_params=default_params,
                                  validation_set=(X_valid, y_valid),
                                  indexes_set=(idx_train, idx_valid),
                                  output_dir=output_i)
            if self.is_regression_model:
                pred_i = clf.predict(X_valid).reshape(-1)
            else:
                # Binary classification: keep the positive-class probability.
                pred_i = clf.predict(X_valid, prob=True)[:, 1]
        oof[idx_valid] = pred_i
        models.append(clf)
    return models, oof
def call(self, df_source, y=None, test=False):
    """Run every atom's feature generation and concatenate the results.

    Args:
        df_source: input DataFrame handed to each atom.
        y: target values, forwarded to the atoms (training only).
        test: if True, load a previously persisted molecule first and
            skip persisting it afterwards.

    Returns:
        pd.DataFrame: all atoms' generated features side by side.
    """
    if test:
        self.load_molecule()
    # Collect per-atom frames and concatenate once at the end: calling
    # pd.concat inside the loop copies the accumulated frame on every
    # iteration (quadratic in total width).
    feature_frames = []
    for atom in self.molecule.atoms:
        with timer(self.logger, format_str=f'{str(atom)} ' + '{:.3f}[s]'):
            feature_frames.append(atom.generate(df_source, y))
    # Empty molecule -> empty frame, matching the original behavior.
    out_df = pd.concat(feature_frames, axis=1) if feature_frames else pd.DataFrame()
    if not test and self.is_recording:
        joblib.dump(self.molecule, self.molecule_path)
    return out_df
def load_model(cls):
    """Return the FastText model: in-memory cache first, then disk, else train anew.

    The freshly obtained model is memoized on the class before returning.
    """
    cached = cls.cache_model
    if cached is not None:
        cls.logger.info('return from cache')
        return cached

    model_on_disk = os.path.exists(cls.path_to_model)
    if model_on_disk:
        cls.logger.info('model already created. load from disk.')
        loaded = FastText.load(cls.path_to_model)
    else:
        # First run: train from scratch and persist for next time.
        with timer(cls.logger, format_str='create fasttext: ' + '{:.3f}'):
            loaded = cls.create_fast_model()
        loaded.save(cls.path_to_model)

    cls.cache_model = loaded
    return loaded
def call(self, df_source, y=None, test=False):
    """Fit K-fold models on ``df_source`` (train) or predict with them (test).

    Args:
        df_source: feature DataFrame; a 'user_id' column, if present, is
            stored for grouping and dropped before fitting.
        y: target array (required when test is False).
        test: if True, delegate to the already-fitted models.

    Returns:
        pd.DataFrame: out-of-fold predictions, column named ``str(self)``.
    """
    if not test:
        self.user_id_group = df_source['user_id'].values
    if 'user_id' in df_source.columns:
        del df_source['user_id']
    if test:
        return self._predict_trained_models(df_source)
    x_train, y_train = df_source.values, y
    # Note: the OOF buffer must have a float dtype -- with an int dtype,
    # assigning predicted probabilities would silently truncate them to 0.
    pred_train = None
    params = self.get_best_model_parameters(x_train, y_train)
    for i, ((x_i, y_i), (x_valid, y_valid), (_, idx_valid)) in enumerate(
            self.get_folds(
                x_train, y_train,
                groups=None)):  # [NOTE] Group KFold is not Supported yet
        self.logger.info('start k-fold: {}/{}'.format(i + 1, self.num_cv))
        with timer(self.logger,
                   format_str='Fold: {}/{}'.format(i + 1, self.num_cv) + ' {:.1f}[s]'):
            clf = self.fit_model(x_i, y_i, params,
                                 x_valid=x_valid, y_valid=y_valid, cv=i)
        if self.is_regression_model:
            # Reshape to a column vector: a flat (-1) reshape would make
            # pred_i.shape[1] below raise IndexError on the first fold.
            pred_i = clf.predict(x_valid).reshape(-1, 1)
        else:
            # NOTE(review): keeps the full probability matrix (unlike the
            # OOF loop elsewhere which takes [:, 1]) -- presumably predict
            # returns one column here; verify against the model wrapper.
            pred_i = clf.predict(x_valid, prob=True)
        if pred_train is None:
            # np.float was removed from NumPy (1.24+); it aliased the
            # builtin float, i.e. float64.
            pred_train = np.zeros(shape=(len(y), pred_i.shape[1]),
                                  dtype=np.float64)
        pred_train[idx_valid] = pred_i
        self.fitted_models.append(clf)
    self.finish_fit = True
    # self.after_kfold_fitting(df_source, y, pred_train)
    df_train = pd.DataFrame(pred_train, columns=[str(self)])
    return df_train
def generate_outer_feature(self):
    """Compute PCA-reduced SWEM text embeddings for ``self.df_outer``.

    Rows whose normalized text was None (per ``idx_none``) receive
    all-zero vectors. The PCA transformer is fitted only when
    ``self.train`` is True and reused (``self.clf_pca_``) at test time.

    Returns:
        pd.DataFrame: ``n_components`` embedding columns plus the merge key.
    """
    with timer(logger, format_str=self.text_column + ' load {:.3f}[s]'):
        x, idx_none = self.load_parsed_docs()
    if self.train:
        clf_pca = PCA(n_components=self.n_components)
        clf_pca.fit(x)
        self.clf_pca_ = clf_pca
    transformed = self.clf_pca_.transform(x)
    # Scatter the transformed rows back to full length; masked (None)
    # rows keep their zero vectors.
    retval = np.zeros(shape=(len(self.df_outer), self.n_components))
    retval[~idx_none] = transformed
    out_df = pd.DataFrame(retval, columns=[
        f'swem_{self.agg}_{self.text_column}_' + str(i)
        for i in range(self.n_components)
    ])
    # Positional assignment via .to_numpy(): assigning a (renamed)
    # one-column DataFrame to a column relies on fragile pandas behavior
    # and index alignment, which can inject NaN when df_outer has a
    # non-default index.
    out_df[self.merge_key] = self.df_outer['kiji_id_raw'].to_numpy()
    return out_df