Exemple #1
0
    def main(self):
        """Run the cross-validated training and prediction pipeline.

        Does nothing when ``test.csv`` already exists in ``self.output_dir``.
        Otherwise the method seeds the random number generators from
        ``self.random_seed``, loads the train and test data, and for every
        fold yielded by ``common.stratified_kfold`` trains models and asks
        ``self.predict`` for out-of-fold and test predictions.

        ``self.predict`` is expected to write its predictions to the path it
        is given, because the per-fold CSV files are read back here to build:

        * ``train.csv`` - the concatenated out-of-fold predictions,
        * ``test.csv`` - the per-``id`` mean of the per-fold test predictions.
        """
        t_start = datetime.now()
        logger.info(' {} / {} '.format(self.name, self.random_seed).center(62, '='))
        logger.info('Hyperparameters:\n{}'.format(pprint.pformat(self.params)))
        # test.csv is the last file this method writes, so its presence is
        # used as the marker of a completed run.
        if os.path.isfile(os.path.join(self.output_dir, 'test.csv')):
            logger.info('Output already exists - skipping')
            return

        # Initialize the random number generator; the global NumPy generator
        # is seeded with 4 bytes drawn from the instance-level RandomState.
        self.random_state = RandomState(self.random_seed)
        np.random.seed(int.from_bytes(self.random_state.bytes(4), byteorder=sys.byteorder))

        train_df = common.load_data('train')
        train_df['comment_text'] = train_df['comment_text'].apply(unidecode)
        test_df = common.load_data('test')
        test_df['comment_text'] = test_df['comment_text'].apply(unidecode)

        vectorizer = self.build_vectorizer(train_df, test_df)

        folds = common.stratified_kfold(train_df, random_seed=self.random_seed)
        # Remember which folds were actually produced so that the aggregation
        # below does not have to assume a fixed number of folds.
        fold_nums = []
        for fold_num, train_ids, val_ids in folds:
            logger.info(f'Fold #{fold_num}')
            fold_nums.append(fold_num)

            fold_train_df = train_df[train_df['id'].isin(train_ids)]
            fold_val_df = train_df[train_df['id'].isin(val_ids)]
            models = self.train(fold_num, vectorizer, fold_train_df, fold_val_df)

            logger.info('Generating the out-of-fold predictions')
            path = os.path.join(self.output_dir, f'fold{fold_num}_validation.csv')
            self.predict(models, vectorizer, fold_val_df, path)

            logger.info('Generating the test predictions')
            path = os.path.join(self.output_dir, f'fold{fold_num}_test.csv')
            self.predict(models, vectorizer, test_df, path)

        columns = ['id'] + common.LABELS

        logger.info('Combining the out-of-fold predictions')
        df_parts = [
            pd.read_csv(
                os.path.join(self.output_dir, f'fold{fold_num}_validation.csv'),
                usecols=columns)
            for fold_num in fold_nums
        ]
        train_pred = pd.concat(df_parts)
        path = os.path.join(self.output_dir, 'train.csv')
        train_pred.to_csv(path, index=False)

        logger.info('Averaging the test predictions')
        df_parts = [
            pd.read_csv(
                os.path.join(self.output_dir, f'fold{fold_num}_test.csv'),
                usecols=columns)
            for fold_num in fold_nums
        ]
        # Every fold predicts the same test rows; average them per id.
        test_pred = pd.concat(df_parts).groupby('id', as_index=False).mean()
        path = os.path.join(self.output_dir, 'test.csv')
        test_pred.to_csv(path, index=False)

        logger.info('Total elapsed time - {}'.format(datetime.now() - t_start))
Exemple #2
0
    def main(self):
        """Run the cross-validated label-stacking pipeline for ``self.label``.

        Does nothing when ``test.csv`` already exists in ``self.output_dir``.
        Otherwise the method seeds the random number generators from
        ``self.random_seed`` and, for every fold yielded by
        ``common.stratified_kfold``:

        * loads the fold inputs through ``self.load_inputs``,
        * trains a model with ``self.train``,
        * writes ``fold{N}_validation.csv`` (out-of-fold predictions) and
          ``fold{N}_test.csv`` (test predictions) to ``self.output_dir``.

        Finally the per-fold test predictions are averaged per ``id`` and
        written to ``test.csv``.
        """
        t_start = datetime.now()
        logger.info(' label_stacking / {} '.format(self.random_seed).center(
            62, '='))
        logger.info('Hyperparameters:\n{}'.format(pprint.pformat(self.params)))
        # test.csv is the last file this method writes, so its presence is
        # used as the marker of a completed run.
        if os.path.isfile(os.path.join(self.output_dir, 'test.csv')):
            logger.info('Output already exists - skipping')
            return

        # The global NumPy generator is seeded with 4 bytes drawn from the
        # instance-level RandomState.
        self.random_state = RandomState(self.random_seed)
        np.random.seed(
            int.from_bytes(self.random_state.bytes(4),
                           byteorder=sys.byteorder))

        test_df = common.load_data('test')
        train_df = common.load_data('train')

        folds = common.stratified_kfold(train_df, random_seed=self.random_seed)
        # Remember which folds were actually produced so that the averaging
        # below does not have to assume a fixed number of folds.
        fold_nums = []
        for fold_num, train_ids, val_ids in folds:
            logger.info(f'Fold #{fold_num}')
            fold_nums.append(fold_num)

            logger.info(
                'Loading the training and validation data for the %s model',
                self.label)
            X_train = self.load_inputs(train_ids, 'train')
            X_val = self.load_inputs(val_ids, 'train')
            # Targets are ordered by id.
            # NOTE(review): this presumably mirrors the row order returned by
            # load_inputs - confirm against its implementation.
            y_train = train_df.loc[train_df['id'].isin(train_ids)].sort_values(
                'id')
            y_train = y_train[self.label].values
            y_val = train_df[train_df['id'].isin(val_ids)].sort_values('id')
            y_val = y_val[self.label].values

            logger.info('Training the %s model', self.label)
            model = self.train(fold_num, self.label, X_train, y_train, X_val,
                               y_val)

            logger.info('Generating the out-of-fold predictions')
            y_model = self.predict(model, X_val)
            val_pred = pd.DataFrame({
                'id': sorted(val_ids),
                self.label: y_model
            })
            path = os.path.join(self.output_dir,
                                f'fold{fold_num}_validation.csv')
            val_pred.to_csv(path, index=False)

            logger.info('Generating the test predictions')
            # NOTE(review): this call has the same arguments on every fold; it
            # could be hoisted out of the loop if load_inputs has no side
            # effects - confirm before changing.
            X_test = self.load_inputs(test_df['id'].values, 'test')
            y_model = self.predict(model, X_test)
            # NOTE(review): predictions are paired with test_df['id'] in its
            # existing order, whereas the validation ids above are sorted -
            # verify that load_inputs keeps the test rows in this order.
            test_pred = pd.DataFrame({
                'id': test_df['id'],
                self.label: y_model
            })
            path = os.path.join(self.output_dir, f'fold{fold_num}_test.csv')
            test_pred.to_csv(path, index=False)

        logger.info('Averaging the test predictions')
        df_parts = [
            pd.read_csv(
                os.path.join(self.output_dir, f'fold{fold_num}_test.csv'),
                usecols=['id', self.label])
            for fold_num in fold_nums
        ]
        # Every fold predicts the same test rows; average them per id.
        test_pred = pd.concat(df_parts).groupby('id', as_index=False).mean()
        path = os.path.join(self.output_dir, 'test.csv')
        test_pred.to_csv(path, index=False)

        logger.info('Elapsed time - {}'.format(datetime.now() - t_start))