def main(self):
    """Run the cross-validated training and prediction pipeline.

    Steps, in order:

    1. Log a banner and the hyperparameters; return immediately if
       ``<output_dir>/test.csv`` already exists.
    2. Create ``self.random_state`` from ``self.random_seed`` and seed the
       global NumPy generator with a 32-bit integer drawn from it.
    3. Load the train and test data and pass ``comment_text`` through
       ``unidecode``.
    4. For every fold yielded by ``common.stratified_kfold``: train, then
       write ``fold<N>_validation.csv`` and ``fold<N>_test.csv``.
    5. Concatenate the per-fold validation files into ``train.csv`` and
       average the per-fold test files (grouped by ``id``) into
       ``test.csv``.

    The aggregation steps read back exactly the folds that were processed
    in step 4 instead of assuming folds numbered 1 through 10.

    Returns:
        None. All results are written as CSV files under
        ``self.output_dir``.
    """
    t_start = datetime.now()
    banner = ' {} / {} '.format(self.name, self.random_seed)
    logger.info(banner.center(62, '='))
    logger.info('Hyperparameters:\n{}'.format(pprint.pformat(self.params)))

    # test.csv is the last file written, so its presence marks a
    # completed run.
    if os.path.isfile(os.path.join(self.output_dir, 'test.csv')):
        logger.info('Output already exists - skipping')
        return

    # Initialize the random number generator
    self.random_state = RandomState(self.random_seed)
    np.random.seed(
        int.from_bytes(self.random_state.bytes(4), byteorder=sys.byteorder))

    train_df = common.load_data('train')
    train_df['comment_text'] = train_df['comment_text'].apply(unidecode)
    test_df = common.load_data('test')
    test_df['comment_text'] = test_df['comment_text'].apply(unidecode)

    vectorizer = self.build_vectorizer(train_df, test_df)

    # Remember which folds were actually produced so the aggregation
    # below reads the same set of files that this run wrote.
    fold_nums = []
    folds = common.stratified_kfold(train_df, random_seed=self.random_seed)
    for fold_num, train_ids, val_ids in folds:
        logger.info(f'Fold #{fold_num}')

        fold_train_df = train_df[train_df['id'].isin(train_ids)]
        fold_val_df = train_df[train_df['id'].isin(val_ids)]
        models = self.train(fold_num, vectorizer, fold_train_df, fold_val_df)

        logger.info('Generating the out-of-fold predictions')
        path = os.path.join(
            self.output_dir, f'fold{fold_num}_validation.csv')
        self.predict(models, vectorizer, fold_val_df, path)

        logger.info('Generating the test predictions')
        path = os.path.join(self.output_dir, f'fold{fold_num}_test.csv')
        self.predict(models, vectorizer, test_df, path)

        fold_nums.append(fold_num)

    columns = ['id'] + common.LABELS

    logger.info('Combining the out-of-fold predictions')
    df_parts = [
        pd.read_csv(
            os.path.join(self.output_dir, f'fold{fold_num}_validation.csv'),
            usecols=columns)
        for fold_num in fold_nums
    ]
    train_pred = pd.concat(df_parts)
    path = os.path.join(self.output_dir, 'train.csv')
    train_pred.to_csv(path, index=False)

    logger.info('Averaging the test predictions')
    df_parts = [
        pd.read_csv(
            os.path.join(self.output_dir, f'fold{fold_num}_test.csv'),
            usecols=columns)
        for fold_num in fold_nums
    ]
    # Each test id appears once per fold; the mean over folds is the
    # final prediction.
    test_pred = pd.concat(df_parts).groupby('id', as_index=False).mean()
    path = os.path.join(self.output_dir, 'test.csv')
    test_pred.to_csv(path, index=False)

    logger.info('Total elapsed time - {}'.format(datetime.now() - t_start))
def main(self):
    """Run the cross-validated label-stacking pipeline for ``self.label``.

    Steps, in order:

    1. Log a banner and the hyperparameters; return immediately if
       ``<output_dir>/test.csv`` already exists.
    2. Create ``self.random_state`` from ``self.random_seed`` and seed the
       global NumPy generator with a 32-bit integer drawn from it.
    3. For every fold yielded by ``common.stratified_kfold``: load the
       inputs via ``self.load_inputs``, train a model for ``self.label``,
       and write ``fold<N>_validation.csv`` and ``fold<N>_test.csv``.
    4. Average the per-fold test files (grouped by ``id``) into
       ``test.csv``.

    The averaging step reads back exactly the folds that were processed
    in step 3 instead of assuming folds numbered 1 through 10.

    Returns:
        None. All results are written as CSV files under
        ``self.output_dir``.
    """
    t_start = datetime.now()
    banner = ' label_stacking / {} '.format(self.random_seed)
    logger.info(banner.center(62, '='))
    logger.info('Hyperparameters:\n{}'.format(pprint.pformat(self.params)))

    # test.csv is the last file written, so its presence marks a
    # completed run.
    if os.path.isfile(os.path.join(self.output_dir, 'test.csv')):
        logger.info('Output already exists - skipping')
        return

    self.random_state = RandomState(self.random_seed)
    np.random.seed(
        int.from_bytes(self.random_state.bytes(4), byteorder=sys.byteorder))

    test_df = common.load_data('test')
    train_df = common.load_data('train')

    # Remember which folds were actually produced so the averaging below
    # reads the same set of files that this run wrote.
    fold_nums = []
    folds = common.stratified_kfold(train_df, random_seed=self.random_seed)
    for fold_num, train_ids, val_ids in folds:
        logger.info(f'Fold #{fold_num}')

        logger.info(
            'Loading the training and validation data for the %s model',
            self.label)
        X_train = self.load_inputs(train_ids, 'train')
        X_val = self.load_inputs(val_ids, 'train')

        # NOTE(review): labels (and the validation ids below) are sorted
        # by id, which presumably matches the row order returned by
        # load_inputs - confirm against its implementation.
        train_rows = train_df.loc[train_df['id'].isin(train_ids)]
        y_train = train_rows.sort_values('id')[self.label].values
        val_rows = train_df[train_df['id'].isin(val_ids)]
        y_val = val_rows.sort_values('id')[self.label].values

        logger.info('Training the %s model', self.label)
        model = self.train(
            fold_num, self.label, X_train, y_train, X_val, y_val)

        logger.info('Generating the out-of-fold predictions')
        y_model = self.predict(model, X_val)
        val_pred = pd.DataFrame({
            'id': sorted(val_ids),
            self.label: y_model,
        })
        path = os.path.join(
            self.output_dir, f'fold{fold_num}_validation.csv')
        val_pred.to_csv(path, index=False)

        logger.info('Generating the test predictions')
        X_test = self.load_inputs(test_df['id'].values, 'test')
        y_model = self.predict(model, X_test)
        test_pred = pd.DataFrame({
            'id': test_df['id'],
            self.label: y_model,
        })
        path = os.path.join(self.output_dir, f'fold{fold_num}_test.csv')
        test_pred.to_csv(path, index=False)

        fold_nums.append(fold_num)

    logger.info('Averaging the test predictions')
    df_parts = [
        pd.read_csv(
            os.path.join(self.output_dir, f'fold{fold_num}_test.csv'),
            usecols=['id', self.label])
        for fold_num in fold_nums
    ]
    # Each test id appears once per fold; the mean over folds is the
    # final prediction.
    test_pred = pd.concat(df_parts).groupby('id', as_index=False).mean()
    path = os.path.join(self.output_dir, 'test.csv')
    test_pred.to_csv(path, index=False)

    logger.info('Elapsed time - {}'.format(datetime.now() - t_start))