def generate_outer_feature(self):
    """Build a FastText embedding feature table for ``kiji_id``.

    Train and test data are concatenated and every user's ``kiji_id``
    values (in row order) form one "sentence". Ids that occur fewer than
    5 times in the combined data are replaced by the token ``'None'``.
    A FastText model is loaded from ``self.fast_model_path`` when that
    path exists; otherwise it is trained on the sentences and saved there.

    Returns:
        pandas.DataFrame: one row per row of ``self.df_outer`` (default
        ``RangeIndex``), with embedding columns ``kiji_wv_0``,
        ``kiji_wv_1``, ... and a ``self.merge_key`` column holding the
        corresponding ``kiji_id_raw`` value.
    """
    train_df, _ = read_data()
    test_df = read_data(test=True)
    all_df = pd.concat([train_df, test_df], ignore_index=True)

    # Rare ids share a single token; the same mapping is applied both when
    # training and when looking vectors up below.
    vc = all_df['kiji_id'].value_counts()
    to_none_ids = vc[vc < 5].index

    def to_word(d):
        if d in to_none_ids:
            return 'None'
        return d

    if os.path.exists(self.fast_model_path):
        model = FastText.load(self.fast_model_path)
    else:
        # Sentences are only needed for training, so build them here rather
        # than unconditionally. A single groupby pass replaces one full
        # boolean-mask scan per user; sort=False keeps users in order of
        # first appearance and rows keep their original order.
        # NOTE(review): groupby drops rows whose user_id is NaN; the old
        # per-user mask produced an empty sentence for that case.
        docs = [
            [to_word(w) for w in kiji_ids.values]
            for _, kiji_ids in all_df.groupby('user_id', sort=False)['kiji_id']
        ]
        with timer(logger,
                   format_str='create kiji_id fast_model' + ' {:.3f}[s]'):
            model = FastText(docs, workers=6, size=64)
        model.save(self.fast_model_path)

    z = self.df_outer['kiji_id_raw'].map(to_word).map(
        lambda x: model.wv[x])
    df = pd.DataFrame(np.array(z.values.tolist())).add_prefix('kiji_wv_')
    # Assign positionally: ``df`` has a fresh RangeIndex, so assigning the
    # Series itself would align on ``df_outer``'s index labels and silently
    # yield NaN / shifted keys whenever that index is not 0..n-1.
    df[self.merge_key] = self.df_outer['kiji_id_raw'].values
    return df
def call(self, df_input, y=None):
    """Aggregate ``df_input`` per user and attach extra atom features.

    Every column of ``df_input`` is grouped by ``user_id`` and summarised
    with mean, sum, max, min, std and nunique; the resulting columns are
    named ``<column>_<stat>``. Features from each additional atom are then
    left-joined on ``user_id``.

    Args:
        df_input: frame containing a ``user_id`` column plus the columns
            to aggregate.
        y: only used to choose which raw dataset is read for the atoms:
            the test set when ``y`` is None, the training set otherwise.
            The atoms themselves are always called with ``y=None``.

    Returns:
        pandas.DataFrame with one row per user and a fresh integer index.
    """
    stat_names = ['mean', 'sum', 'max', 'min', 'std', 'nunique']
    per_user = df_input.groupby('user_id').agg(stat_names)
    per_user = per_user.sort_values('user_id')
    # Flatten the (column, stat) MultiIndex into "column_stat" names.
    per_user.columns = [
        '_'.join(pair) for pair in per_user.columns.to_flat_index()
    ]
    merged = per_user.reset_index()

    for atom in [UedaAtom()]:
        if y is not None:
            input_df, _ = read_data()
        else:
            input_df = read_data(test=True)
        atom_features = atom.generate(input_df, y=None)
        merged = pd.merge(merged, atom_features, on='user_id', how='left')

    return merged.reset_index(drop=True)
import os

import numpy as np
import pandas as pd
from vivid.featureset.molecules import MoleculeFeature, find_molecule

from kaggle_days.dataset import read_data
from kaggle_days.dataset import read_sample_submit
from kaggle_days.models.classifiers import LGBMCls
from kaggle_days.molecules import user_merge_molecule

# Script: treat the target as a multi-class problem, train an LGBM
# classifier, and write the probability-weighted expected target value as
# the submission's 'age' column.
if __name__ == '__main__':
    m = find_molecule('benchmark')[0]
    raw_feature = MoleculeFeature(
        m, root_dir='/analysis/data/checkpoint/classification')
    entry_feature = MoleculeFeature(user_merge_molecule, parent=raw_feature)

    train_df, y = read_data()
    test_df = read_data(test=True)

    # Map each distinct target value to a class index 0..K-1.
    # np.unique already returns its values sorted.
    origin = np.unique(y)
    k_labels = np.arange(len(origin))
    y2k = dict(zip(origin, k_labels))
    y_labels = pd.Series(y).map(y2k).values

    clf = LGBMCls(name='lgbm_cls', parent=entry_feature)
    oof_df = clf.fit(train_df, y_labels)

    # Expected value over classes: sum_k P(class k) * original value k.
    # NOTE(review): assumes the prediction columns are ordered by class
    # index 0..K-1, i.e. the same order as ``origin`` - confirm.
    prob_predict = clf.predict(test_df).values
    predict = np.sum(prob_predict * origin, axis=1)

    sub_df = read_sample_submit()
    sub_df['age'] = predict
    # ``os`` was previously not imported, so this line raised NameError
    # after the whole training run had completed.
    sub_df.to_csv(os.path.join(clf.output_dir, 'predict.csv'), index=False)
formatter_class=ArgumentDefaultsHelpFormatter)
    # NOTE(review): the start of this function (presumably
    # ``parse_argument``, which the script below calls) is outside the
    # visible chunk; only its tail is shown.
    # --molecule accepts only the names of the molecules listed on
    # MoleculeFactory.molecules.
    parser.add_argument(
        '--molecule',
        default='benchmark',
        choices=[str(m.name) for m in MoleculeFactory.molecules],
        help='molecule name (see kaggle_days.molecules.py file)')
    # --simple is a boolean flag: False unless given on the command line.
    parser.add_argument(
        '--simple',
        action='store_true',
        help=
        'If True, run on small models (LightGBMx3 different objective function)'
    )
    return parser.parse_args()


# Script: five rounds of retraining, where each round's training set is
# generated from the best model's predictions of the previous round.
if __name__ == '__main__':
    args = parse_argument()
    test_df = read_data(test=True)
    m = find_molecule(args.molecule)[0]

    # No predictions exist before the first round.
    pred_in_best = None
    for i in range(5):
        # NOTE(review): presumably builds a pseudo-labelled training set from
        # the previous predictions (see the 'psudo_' suffix) - confirm in
        # generate_next_step_dataset.
        train_df, y = generate_next_step_dataset(y_pred=pred_in_best)
        composer = TrainComposer(molecule=m,
                                 simple=args.simple,
                                 suffix=f'psudo_{i}')
        score_df, pred_dict = composer.fit(train_df, y, test_df)
        # sort_values is ascending by default, so the first index label is
        # the entry with the lowest 'rmse'.
        best_model = score_df.sort_values('rmse').index[0]
        # Feed that entry's predictions into the next round.
        pred_in_best = pred_dict.get(best_model)