Example #1
 def __init__(self, root_dir='data', download=False, split_scheme='official'):
     # set variables
     self._dataset_name = 'yelp'
     self._version = '1.0'
     if split_scheme=='official':
         split_scheme = 'time'
     self._split_scheme = split_scheme
     self._y_type = 'long'
     self._y_size = 1
     self._n_classes = 5
     # path
     self._data_dir = self.initialize_data_dir(root_dir, download)
     # Load data
     data_df = pd.read_csv(os.path.join(self.data_dir, 'reviews.csv'),
             dtype={'review_id': str, 'user_id':str, 'business_id':str, 'stars':int, 'useful':int, 'funny':int,
                    'cool':int, 'text':str, 'date':str, 'year':int, 'city':str, 'state':str, 'categories':str},
             keep_default_na=False, na_values=[], quoting=csv.QUOTE_NONNUMERIC)
     split_df = pd.read_csv(os.path.join(self.data_dir, 'splits', f'{self.split_scheme}.csv'))
     is_in_dataset = split_df['split']!=NOT_IN_DATASET
     split_df = split_df[is_in_dataset]
     data_df = data_df[is_in_dataset]
     # Get arrays
     self._split_array = split_df['split'].values
     self._input_array = list(data_df['text'])
     # Get metadata
     self._metadata_fields, self._metadata_array, self._metadata_map = self.load_metadata(data_df, self.split_array)
     # Get y from metadata
     self._y_array = getattr(self.metadata_array[:,self.metadata_fields.index('y')], self._y_type)()
     # Set split info
     self.initialize_split_dicts()
     # eval
     self.initialize_eval_grouper()
     self._metric = Accuracy()
     super().__init__(root_dir, download, split_scheme)
Example #2
 def __init__(self,
              root_dir='data',
              download=False,
              split_scheme='official'):
     # set variables
     self._dataset_name = 'amazon'
     self._version = '1.0'
     self._download_url = 'https://worksheets.codalab.org/rest/bundles/0x60237058e01749cda7b0701c2bd01420/contents/blob/'
     self._compressed_size = 4_066_541_568
     # the official split is the user split
     if split_scheme == 'official':
         split_scheme = 'user'
     self._split_scheme = split_scheme
     self._y_type = 'long'
     self._y_size = 1
     self._n_classes = 5
     # path
     self._data_dir = self.initialize_data_dir(root_dir, download)
     # Load data
     data_df = pd.read_csv(os.path.join(self.data_dir, 'reviews.csv'),
                           dtype={
                               'reviewerID': str,
                               'asin': str,
                               'reviewTime': str,
                               'unixReviewTime': int,
                               'reviewText': str,
                               'summary': str,
                               'verified': bool,
                               'category': str,
                               'reviewYear': int
                           },
                           keep_default_na=False,
                           na_values=[],
                           quoting=csv.QUOTE_NONNUMERIC)
     split_df = pd.read_csv(
         os.path.join(self.data_dir, 'splits', f'{self.split_scheme}.csv'))
     is_in_dataset = split_df['split'] != NOT_IN_DATASET
     split_df = split_df[is_in_dataset]
     data_df = data_df[is_in_dataset]
     # Get arrays
     self._split_array = split_df['split'].values
     self._input_array = list(data_df['reviewText'])
     # Get metadata
     self._metadata_fields, self._metadata_array, self._metadata_map = self.load_metadata(
         data_df, self.split_array)
     # Get y from metadata
     self._y_array = getattr(
         self.metadata_array[:, self.metadata_fields.index('y')],
         self._y_type)()
     # Set split info
     self.initialize_split_dicts()
     # eval
     self.initialize_eval_grouper()
     self._metric = Accuracy()
     super().__init__(root_dir, download, split_scheme)
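A minimal usage sketch for the constructor above; the module path follows the usual wilds package layout but should be treated as an assumption:

from wilds.datasets.amazon_dataset import AmazonDataset

# download=True fetches ~4 GB on first use (see _compressed_size above).
dataset = AmazonDataset(root_dir='data', download=True, split_scheme='official')
print(dataset.split_scheme)       # 'user' -- 'official' resolves to the user split
print(dataset.get_input(0)[:80])  # raw review text of the first example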
Example #3
    def eval(self, y_pred, y_true, metadata, prediction_fn=None):
        """
        Computes all evaluation metrics.
        Args:
            - y_pred (Tensor): Predictions from a model. By default, they are predicted labels (LongTensor).
                               But they can also be other model outputs such that prediction_fn(y_pred)
                               are predicted labels.
            - y_true (LongTensor): Ground-truth labels
            - metadata (Tensor): Metadata
            - prediction_fn (function): A function that turns y_pred into predicted labels
        Output:
            - results (dictionary): Dictionary of evaluation metrics
            - results_str (str): String summarizing the evaluation metrics
        """
        metrics = [
            Accuracy(prediction_fn=prediction_fn),
            Recall(prediction_fn=prediction_fn, average='macro'),
            F1(prediction_fn=prediction_fn, average='macro'),
        ]

        results = {}

        for metric in metrics:
            results.update(metric.compute(y_pred, y_true))

        results_str = (
            f"Average acc: {results[metrics[0].agg_metric_field]:.3f}\n"
            f"Recall macro: {results[metrics[1].agg_metric_field]:.3f}\n"
            f"F1 macro: {results[metrics[2].agg_metric_field]:.3f}\n")

        return results, results_str
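A hedged toy illustration of the merge pattern above: each metric's compute() returns a small dict keyed by its agg_metric_field, and the loop folds them into one results dict. The names below are ad-hoc stand-ins, not the wilds Metric API:

import torch

def compute_accuracy(y_pred, y_true):
    # stand-in for Accuracy(prediction_fn=...).compute(y_pred, y_true)
    return {'acc_avg': (y_pred == y_true).float().mean().item()}

y_true = torch.tensor([0, 1, 1, 2])
y_pred = torch.tensor([0, 1, 2, 2])
results = {}
for compute in [compute_accuracy]:
    results.update(compute(y_pred, y_true))
print(results)  # {'acc_avg': 0.75}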
Example #4
    def eval(self, y_pred, y_true, metadata, prediction_fn=None):
        """
        Computes all evaluation metrics.
        Args:
            - y_pred (Tensor): Predictions from a model. By default, they are predicted labels (LongTensor).
                               But they can also be other model outputs such that prediction_fn(y_pred)
                               are predicted labels.
            - y_true (LongTensor): Ground-truth labels
            - metadata (Tensor): Metadata
            - prediction_fn (function): A function that turns y_pred into predicted labels 
        Output:
            - results (dictionary): Dictionary of evaluation metrics
            - results_str (str): String summarizing the evaluation metrics
        """
        metric = Accuracy(prediction_fn=prediction_fn)
        results = {
            **metric.compute(y_pred, y_true),
        }
        results_str = f"Average {metric.name}: {results[metric.agg_metric_field]:.3f}\n"
        # Each eval_grouper is over label + a single identity
        # We only want to keep the groups where the identity is positive
        # The groups are:
        #   Group 0: identity = 0, y = 0
        #   Group 1: identity = 1, y = 0
        #   Group 2: identity = 0, y = 1
        #   Group 3: identity = 1, y = 1
        # so this means we want only groups 1 and 3.
        worst_group_metric = None
        for identity_var, eval_grouper in zip(self._identity_vars, self._eval_groupers):
            g = eval_grouper.metadata_to_group(metadata)
            group_results = {
                **metric.compute_group_wise(y_pred, y_true, g, eval_grouper.n_groups)
            }
            results_str += f"  {identity_var:20s}"
            for group_idx in range(eval_grouper.n_groups):
                group_str = eval_grouper.group_field_str(group_idx)
                if f'{identity_var}:1' in group_str:
                    group_metric = group_results[metric.group_metric_field(group_idx)]
                    group_counts = group_results[metric.group_count_field(group_idx)]
                    results[f'{metric.name}_{group_str}'] = group_metric
                    results[f'count_{group_str}'] = group_counts
                    if 'y:0' in group_str:
                        label_str = 'non_toxic'
                    else:
                        label_str = 'toxic'
                    results_str += (
                        f"   {metric.name} on {label_str}: {group_metric:.3f}"
                        f" (n = {results[f'count_{group_str}']:6.0f}) "
                    )
                    if worst_group_metric is None:
                        worst_group_metric = group_metric
                    else:
                        worst_group_metric = metric.worst(
                            [worst_group_metric, group_metric])
            results_str += f"\n"
        results[f'{metric.worst_group_metric_field}'] = worst_group_metric
        results_str += f"Worst-group {metric.name}: {worst_group_metric:.3f}\n"

        return results, results_str
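A small self-contained sketch of the group layout the comments above describe: with a grouper over (identity, y) where the identity field varies fastest, the group index is identity + 2 * y, so the identity-positive groups are exactly 1 and 3:

import torch

identity = torch.tensor([0, 1, 0, 1])
y = torch.tensor([0, 0, 1, 1])
g = identity + 2 * y   # Group 0: id=0,y=0; 1: id=1,y=0; 2: id=0,y=1; 3: id=1,y=1
print(g.tolist())      # [0, 1, 2, 3]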
Example #5
    def eval(self,
             y_pred,
             y_true,
             metadata,
             prediction_fn=multiclass_logits_to_pred,
             score_fn=binary_logits_to_score):
        """
        Computes all evaluation metrics.
        Args:
            - y_pred (Tensor): Predictions from a model. By default, they are multi-class logits (FloatTensor).
                               But they can also be other model outputs such that prediction_fn(y_pred)
                               are predicted labels and score_fn(y_pred) are confidence scores.
            - y_true (LongTensor): Ground-truth labels
            - metadata (Tensor): Metadata
            - prediction_fn (function): A function that turns y_pred into predicted labels
            - score_fn (function): A function that turns y_pred into confidence scores
        Output:
            - results (dictionary): Dictionary of evaluation metrics
            - results_str (str): String summarizing the evaluation metrics
        """
        """Evaluate the precision achieved overall and across groups for a given global recall"""
        g = self._eval_grouper.metadata_to_group(metadata)

        y_scores = score_fn(y_pred)
        threshold_60 = threshold_at_recall(y_scores, y_true, global_recall=60)

        accuracy_metric = Accuracy(prediction_fn=prediction_fn)
        PAR_metric = PrecisionAtRecall(threshold_60, score_fn=score_fn)

        results = accuracy_metric.compute(y_pred, y_true)
        results.update(PAR_metric.compute(y_pred, y_true))
        results.update(
            accuracy_metric.compute_group_wise(y_pred, y_true, g,
                                               self._eval_grouper.n_groups))
        results.update(
            PAR_metric.compute_group_wise(y_pred, y_true, g,
                                          self._eval_grouper.n_groups))

        results_str = (
            f"Average {PAR_metric.name}:  {results[PAR_metric.agg_metric_field]:.3f}\n"
            f"Average {accuracy_metric.name}:  {results[accuracy_metric.agg_metric_field]:.3f}\n"
        )

        return results, results_str
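The helper `threshold_at_recall` is imported from elsewhere in wilds and not shown in this listing; a minimal sketch of what it plausibly computes (an assumption, not the real implementation) is the score threshold at which `global_recall` percent of the positives are recovered:

import torch

def threshold_at_recall(y_scores, y_true, global_recall=60):
    # The (100 - global_recall)-th percentile of the positive-class scores
    # leaves global_recall% of positives at or above the returned threshold.
    # Assumes y_scores is a float tensor and y_true uses 1 for positives.
    positive_scores = y_scores[y_true == 1]
    return torch.quantile(positive_scores, 1 - global_recall / 100).item()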
Example #6
    def eval(self, y_pred, y_true, metadata, prediction_fn=None):
        """
        Computes all evaluation metrics.
        Args:
            - y_pred (Tensor): Predictions from a model. By default, they are predicted labels (LongTensor).
                               But they can also be other model outputs such that prediction_fn(y_pred)
                               are predicted labels.
            - y_true (LongTensor): Ground-truth labels
            - metadata (Tensor): Metadata
            - prediction_fn (function): A function that turns y_pred into predicted labels
        Output:
            - results (dictionary): Dictionary of evaluation metrics
            - results_str (str): String summarizing the evaluation metrics
        """
        metric = Accuracy(prediction_fn=prediction_fn)
        # Overall evaluation + evaluate by year
        all_results, all_results_str = self.standard_group_eval(
            metric, self._eval_groupers['year'], y_pred, y_true, metadata)
        # Evaluate by region and ignore the "Other" region
        region_grouper = self._eval_groupers['region']
        region_results = metric.compute_group_wise(
            y_pred, y_true, region_grouper.metadata_to_group(metadata),
            region_grouper.n_groups)
        all_results[f'{metric.name}_worst_year'] = all_results.pop(
            metric.worst_group_metric_field)
        region_metric_list = []
        for group_idx in range(region_grouper.n_groups):
            group_str = region_grouper.group_field_str(group_idx)
            group_metric = region_results[metric.group_metric_field(group_idx)]
            group_counts = region_results[metric.group_count_field(group_idx)]
            all_results[f'{metric.name}_{group_str}'] = group_metric
            all_results[f'count_{group_str}'] = group_counts
            if region_results[metric.group_count_field(
                    group_idx)] == 0 or "Other" in group_str:
                continue
            all_results_str += (
                f'  {region_grouper.group_str(group_idx)}  '
                f"[n = {region_results[metric.group_count_field(group_idx)]:6.0f}]:\t"
                f"{metric.name} = {region_results[metric.group_metric_field(group_idx)]:5.3f}\n"
            )
            region_metric_list.append(
                region_results[metric.group_metric_field(group_idx)])
        all_results[f'{metric.name}_worst_region'] = metric.worst(
            region_metric_list)
        all_results_str += f"Worst-group {metric.name}: {all_results[f'{metric.name}_worst_region']:.3f}\n"

        return all_results, all_results_str
Example #7
    def __init__(self,
                 root_dir='data',
                 download=False,
                 split_scheme='official'):
        self._dataset_name = 'waterbirds'
        self._version = '1.0'
        self._download_url = 'https://worksheets.codalab.org/rest/bundles/0x505056d5cdea4e4eaa0e242cbfe2daa4/contents/blob/'
        self._data_dir = self.initialize_data_dir(root_dir, download)

        if not os.path.exists(self.data_dir):
            raise ValueError(
                f'{self.data_dir} does not exist yet. Please generate the dataset first.'
            )

        # Read in metadata
        # Note: metadata_df is one-indexed.
        metadata_df = pd.read_csv(os.path.join(self.data_dir, 'metadata.csv'))

        # Get the y values
        self._y_array = torch.LongTensor(metadata_df['y'].values)
        self._y_size = 1
        self._n_classes = 2

        self._metadata_array = torch.stack(
            (torch.LongTensor(metadata_df['place'].values), self._y_array),
            dim=1)
        self._metadata_fields = ['background', 'y']
        self._metadata_map = {
            'background': [' land', 'water'],  # Padding for str formatting
            'y': [' landbird', 'waterbird']
        }

        # Extract filenames
        self._input_array = metadata_df['img_filename'].values
        self._original_resolution = (224, 224)

        # Extract splits
        self._split_scheme = split_scheme
        if self._split_scheme != 'official':
            raise ValueError(
                f'Split scheme {self._split_scheme} not recognized')
        self._split_array = metadata_df['split'].values

        self._eval_grouper = CombinatorialGrouper(
            dataset=self, groupby_fields=(['background', 'y']))
        self._metric = Accuracy()

        super().__init__(root_dir, download, split_scheme)
Example #8
 def eval(self, y_pred, y_true, metadata, prediction_fn=None):
     """
     Computes all evaluation metrics.
     Args:
         - y_pred (Tensor): Predictions from a model. By default, they are predicted labels (LongTensor).
                            But they can also be other model outputs such that prediction_fn(y_pred)
                            are predicted labels.
         - y_true (LongTensor): Ground-truth labels
         - metadata (Tensor): Metadata
         - prediction_fn (function): A function that turns y_pred into predicted labels 
     Output:
         - results (dictionary): Dictionary of evaluation metrics
         - results_str (str): String summarizing the evaluation metrics
     """
     metric = Accuracy(prediction_fn=prediction_fn)
     return self.standard_group_eval(metric, self._eval_grouper, y_pred,
                                     y_true, metadata)
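`standard_group_eval` is inherited from WILDSDataset and not shown in this listing; judging from how its outputs are used across these examples (an overall metric, per-group fields, and a poppable worst_group_metric_field), a plausible reconstruction looks like this:

def standard_group_eval(metric, grouper, y_pred, y_true, metadata):
    # Sketch only -- reconstructed from call sites, not the wilds source.
    g = grouper.metadata_to_group(metadata)
    results = {
        **metric.compute(y_pred, y_true),
        **metric.compute_group_wise(y_pred, y_true, g, grouper.n_groups),
    }
    # Worst group over non-empty groups only.
    non_empty = [results[metric.group_metric_field(i)]
                 for i in range(grouper.n_groups)
                 if results[metric.group_count_field(i)] > 0]
    results[metric.worst_group_metric_field] = metric.worst(non_empty)
    results_str = (
        f"Average {metric.name}: {results[metric.agg_metric_field]:.3f}\n"
        f"Worst-group {metric.name}: {results[metric.worst_group_metric_field]:.3f}\n"
    )
    return results, results_str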
Example #9
 def eval(self, y_pred, y_true, metadata, prediction_fn=None):
     """
     Computes all evaluation metrics.
     Args:
         - y_pred (Tensor): Predictions from a model. By default, they are predicted labels (LongTensor).
                            But they can also be other model outputs such that prediction_fn(y_pred)
                            are predicted labels.
         - y_true (LongTensor): Ground-truth labels
         - metadata (Tensor): Metadata
         - prediction_fn (function): A function that turns y_pred into predicted labels 
     Output:
         - results (dictionary): Dictionary of evaluation metrics
         - results_str (str): String summarizing the evaluation metrics
     """
     metric = Accuracy(prediction_fn=prediction_fn)
     if self.split_scheme=='user':
         # first compute groupwise accuracies
         g = self._eval_grouper.metadata_to_group(metadata)
         results = {
             **metric.compute(y_pred, y_true),
             **metric.compute_group_wise(y_pred, y_true, g, self._eval_grouper.n_groups)
         }
         accs = []
         for group_idx in range(self._eval_grouper.n_groups):
             group_str = self._eval_grouper.group_field_str(group_idx)
             group_metric = results.pop(metric.group_metric_field(group_idx))
             group_counts = results.pop(metric.group_count_field(group_idx))
             results[f'{metric.name}_{group_str}'] = group_metric
             results[f'count_{group_str}'] = group_counts
             if group_counts>0:
                 accs.append(group_metric)
         accs = np.array(accs)
         results['10th_percentile_acc'] = np.percentile(accs, 10)
         results[f'{metric.worst_group_metric_field}'] = metric.worst(accs)
         results_str = (
             f"Average {metric.name}: {results[metric.agg_metric_field]:.3f}\n"
             f"10th percentile {metric.name}: {results['10th_percentile_acc']:.3f}\n"
             f"Worst-group {metric.name}: {results[metric.worst_group_metric_field]:.3f}\n"
         )
         return results, results_str
     else:
         return self.standard_group_eval(
             metric,
             self._eval_grouper,
             y_pred, y_true, metadata)
Example #10
from wilds.common.metrics.all_metrics import Accuracy, MultiTaskAccuracy, MSE, multiclass_logits_to_pred, binary_logits_to_pred

losses = {
    'cross_entropy':
    ElementwiseLoss(loss_fn=nn.CrossEntropyLoss(reduction='none')),
    'lm_cross_entropy':
    MultiTaskLoss(loss_fn=nn.CrossEntropyLoss(reduction='none')),
    'mse':
    MSE(name='loss'),
    'multitask_bce':
    MultiTaskLoss(loss_fn=nn.BCEWithLogitsLoss(reduction='none')),
}

algo_log_metrics = {
    'accuracy':
    Accuracy(prediction_fn=multiclass_logits_to_pred),
    'mse':
    MSE(),
    'multitask_accuracy':
    MultiTaskAccuracy(prediction_fn=multiclass_logits_to_pred),
    'multitask_binary_accuracy':
    MultiTaskAccuracy(prediction_fn=binary_logits_to_pred),
    None:
    None,
}

process_outputs_functions = {
    'binary_logits_to_pred': binary_logits_to_pred,
    'multiclass_logits_to_pred': multiclass_logits_to_pred,
    None: None,
}
Example #11
class YelpDataset(WILDSDataset):
    """
    Yelp dataset.
    This is a modified version of the Yelp Open Dataset.
    This dataset is not part of the official WILDS benchmark.
    We provide it for convenience and to reproduce observations discussed in the WILDS paper.

    Supported `split_scheme`:
        'official': official split, which is equivalent to 'time'
        'time': shifts from reviews written before 2013 to reviews written after 2013
        'user': shifts to unseen reviewers
        'time_baseline': oracle baseline splits for time shifts

    Input (x):
        Review text of maximum token length of 512.

    Label (y):
        y is the star rating (0,1,2,3,4 corresponding to 1-5 stars)

    Metadata:
        user: reviewer ID
        year: year in which the review was written
        business: business ID
        city: city of the business
        state: state of the business

    Website:
        https://www.yelp.com/dataset

    License:
        Because of the Dataset License provided by Yelp, we are unable to redistribute the data.
        Please download the data through the website (https://www.yelp.com/dataset/download) by
        agreeing to the Dataset License.
    """
    def __init__(self,
                 root_dir='data',
                 download=False,
                 split_scheme='official'):
        # set variables
        self._dataset_name = 'yelp'
        self._version = '1.0'
        if split_scheme == 'official':
            split_scheme = 'time'
        self._split_scheme = split_scheme
        self._y_type = 'long'
        self._y_size = 1
        self._n_classes = 5
        # path
        self._data_dir = self.initialize_data_dir(root_dir, download)
        # Load data
        data_df = pd.read_csv(os.path.join(self.data_dir, 'reviews.csv'),
                              dtype={
                                  'review_id': str,
                                  'user_id': str,
                                  'business_id': str,
                                  'stars': int,
                                  'useful': int,
                                  'funny': int,
                                  'cool': int,
                                  'text': str,
                                  'date': str,
                                  'year': int,
                                  'city': str,
                                  'state': str,
                                  'categories': str
                              },
                              keep_default_na=False,
                              na_values=[],
                              quoting=csv.QUOTE_NONNUMERIC)
        split_df = pd.read_csv(
            os.path.join(self.data_dir, 'splits', f'{self.split_scheme}.csv'))
        is_in_dataset = split_df['split'] != NOT_IN_DATASET
        split_df = split_df[is_in_dataset]
        data_df = data_df[is_in_dataset]
        # Get arrays
        self._split_array = split_df['split'].values
        self._input_array = list(data_df['text'])
        # Get metadata
        self._metadata_fields, self._metadata_array, self._metadata_map = self.load_metadata(
            data_df, self.split_array)
        # Get y from metadata
        self._y_array = getattr(
            self.metadata_array[:, self.metadata_fields.index('y')],
            self._y_type)()
        # Set split info
        self.initialize_split_dicts()
        # eval
        self.initialize_eval_grouper()
        self._metric = Accuracy()
        super().__init__(root_dir, download, split_scheme)

    def get_input(self, idx):
        return self._input_array[idx]

    def eval(self, y_pred, y_true, metadata):
        if self.split_scheme == 'user':
            # first compute groupwise accuracies
            g = self._eval_grouper.metadata_to_group(metadata)
            results = {
                **self._metric.compute(y_pred, y_true),
                **self._metric.compute_group_wise(y_pred, y_true, g, self._eval_grouper.n_groups)
            }
            accs = []
            for group_idx in range(self._eval_grouper.n_groups):
                group_str = self._eval_grouper.group_field_str(group_idx)
                group_metric = results.pop(
                    self._metric.group_metric_field(group_idx))
                group_counts = results.pop(
                    self._metric.group_count_field(group_idx))
                results[f'{self._metric.name}_{group_str}'] = group_metric
                results[f'count_{group_str}'] = group_counts
                if group_counts > 0:
                    accs.append(group_metric)
            accs = np.array(accs)
            results['10th_percentile_acc'] = np.percentile(accs, 10)
            results[
                f'{self._metric.worst_group_metric_field}'] = self._metric.worst(
                    accs)
            results_str = (
                f"Average {self._metric.name}: {results[self._metric.agg_metric_field]:.3f}\n"
                f"10th percentile {self._metric.name}: {results['10th_percentile_acc']:.3f}\n"
                f"Worst-group {self._metric.name}: {results[self._metric.worst_group_metric_field]:.3f}\n"
            )
            return results, results_str
        else:
            return self.standard_group_eval(self._metric, self._eval_grouper,
                                            y_pred, y_true, metadata)

    def initialize_split_dicts(self):
        if self.split_scheme in ('user', 'time'):
            self._split_dict = {
                'train': 0,
                'val': 1,
                'id_val': 2,
                'test': 3,
                'id_test': 4
            }
            self._split_names = {
                'train': 'Train',
                'val': 'Validation (OOD)',
                'id_val': 'Validation (ID)',
                'test': 'Test (OOD)',
                'id_test': 'Test (ID)'
            }
        elif self.split_scheme in ('time_baseline', ):
            # use defaults
            pass
        else:
            raise ValueError(
                f'Split scheme {self.split_scheme} not recognized')

    def load_metadata(self, data_df, split_array):
        # Get metadata
        columns = [
            'user_id',
            'business_id',
            'year',
            'city',
            'state',
            'stars',
        ]
        metadata_fields = ['user', 'business', 'year', 'city', 'state', 'y']
        metadata_df = data_df[columns].copy()
        metadata_df.columns = metadata_fields
        sort_idx = np.argsort(split_array)
        ordered_maps = {}
        for field in ['user', 'business', 'city', 'state']:
            # map to IDs in the order of split values
            ordered_maps[field] = pd.unique(metadata_df.iloc[sort_idx][field])
        ordered_maps['y'] = range(1, 6)
        ordered_maps['year'] = range(metadata_df['year'].min(),
                                     metadata_df['year'].max() + 1)
        metadata_map, metadata = map_to_id_array(metadata_df, ordered_maps)
        return metadata_fields, torch.from_numpy(
            metadata.astype('long')), metadata_map

    def initialize_eval_grouper(self):
        if self.split_scheme == 'user':
            self._eval_grouper = CombinatorialGrouper(dataset=self,
                                                      groupby_fields=['user'])
        elif self.split_scheme in ('time', 'time_baseline'):
            self._eval_grouper = CombinatorialGrouper(dataset=self,
                                                      groupby_fields=['year'])
        else:
            raise ValueError(
                f'Split scheme {self.split_scheme} not recognized')
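A minimal usage sketch for YelpDataset as defined above; because the license forbids redistribution, this assumes reviews.csv and the splits/ directory are already present under the data directory:

dataset = YelpDataset(root_dir='data', download=False, split_scheme='official')
print(dataset.split_scheme)       # 'time' -- 'official' resolves to the time split
print(dataset.get_input(0)[:80])  # raw review text of the first example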
Example #12
    def __init__(self,
                 root_dir='data',
                 download=False,
                 split_scheme='official'):
        self._dataset_name = 'celebA'
        self._version = '1.0'
        self._download_url = ''
        self._data_dir = self.initialize_data_dir(root_dir, download)
        target_name = 'Blond_Hair'
        confounder_names = ['Male']

        # Read in attributes
        attrs_df = pd.read_csv(
            os.path.join(self.data_dir, 'list_attr_celeba.csv'))

        # Split out filenames and attribute names
        # Note: idx and filenames are off by one.
        self._input_array = attrs_df['image_id'].values
        self._original_resolution = (178, 218)
        attrs_df = attrs_df.drop(labels='image_id', axis='columns')
        attr_names = attrs_df.columns.copy()

        def attr_idx(attr_name):
            return attr_names.get_loc(attr_name)

        # Then cast attributes to numpy array and set them to 0 and 1
        # (originally, they're -1 and 1)
        attrs_df = attrs_df.values
        attrs_df[attrs_df == -1] = 0

        # Get the y values
        target_idx = attr_idx(target_name)
        self._y_array = torch.LongTensor(attrs_df[:, target_idx])
        self._y_size = 1
        self._n_classes = 2

        # Get metadata
        confounder_idx = [attr_idx(a) for a in confounder_names]
        confounders = attrs_df[:, confounder_idx]

        self._metadata_array = torch.cat(
            (torch.LongTensor(confounders), self._y_array.reshape((-1, 1))),
            dim=1)
        confounder_names = [s.lower() for s in confounder_names]
        self._metadata_fields = confounder_names + ['y']
        self._metadata_map = {
            'y': ['not blond', '    blond']  # Padding for str formatting
        }

        self._eval_grouper = CombinatorialGrouper(
            dataset=self, groupby_fields=(confounder_names + ['y']))
        self._metric = Accuracy()

        # Extract splits
        self._split_scheme = split_scheme
        if self._split_scheme != 'official':
            raise ValueError(
                f'Split scheme {self._split_scheme} not recognized')
        split_df = pd.read_csv(
            os.path.join(self.data_dir, 'list_eval_partition.csv'))
        self._split_array = split_df['partition'].values

        super().__init__(root_dir, download, split_scheme)
Example #13
class FMoWDataset(WILDSDataset):
    """
    The Functional Map of the World land use / building classification dataset.
    This is a processed version of the Functional Map of the World dataset originally sourced from https://github.com/fMoW/dataset.

    Supported `split_scheme`:
        'official': official split, which is equivalent to 'time_after_2016'
        `time_after_{YEAR}` for YEAR between 2002--2018

    Input (x):
        224 x 224 x 3 RGB satellite image.

    Label (y):
        y is one of 62 land use / building classes

    Metadata:
        Each image is annotated with a location coordinate, a timestamp, and a country code. This dataset computes region as a derivative of country code.

    Website: https://github.com/fMoW/dataset

    Original publication:
    @inproceedings{fmow2018,
      title={Functional Map of the World},
      author={Christie, Gordon and Fendley, Neil and Wilson, James and Mukherjee, Ryan},
      booktitle={CVPR},
      year={2018}
    }

    License:
        Distributed under the FMoW Challenge Public License.
        https://github.com/fMoW/dataset/blob/master/LICENSE

    """
    _dataset_name = 'fmow'
    _download_url = 'https://worksheets.codalab.org/rest/bundles/0xc59ea8261dfe4d2baa3820866e33d781/contents/blob/'
    _version = '1.0'

    def __init__(self,
                 root_dir='data',
                 download=False,
                 split_scheme='official',
                 oracle_training_set=False,
                 seed=111,
                 use_ood_val=False):
        self._compressed_size = 70_000_000_000
        self._data_dir = self.initialize_data_dir(root_dir, download)

        self._split_dict = {
            'train': 0,
            'id_val': 1,
            'id_test': 2,
            'val': 3,
            'test': 4
        }
        self._split_names = {
            'train': 'Train',
            'id_val': 'ID Val',
            'id_test': 'ID Test',
            'val': 'OOD Val',
            'test': 'OOD Test'
        }
        if split_scheme == 'official':
            split_scheme = 'time_after_2016'
        self._split_scheme = split_scheme
        self.oracle_training_set = oracle_training_set

        self.root = Path(self._data_dir)
        self.seed = int(seed)
        self._original_resolution = (224, 224)

        self.category_to_idx = {cat: i for i, cat in enumerate(categories)}

        self.metadata = pd.read_csv(self.root / 'rgb_metadata.csv')
        country_codes_df = pd.read_csv(self.root / 'country_code_mapping.csv')
        countrycode_to_region = {
            k: v
            for k, v in zip(country_codes_df['alpha-3'],
                            country_codes_df['region'])
        }
        regions = [
            countrycode_to_region.get(code, 'Other')
            for code in self.metadata['country_code'].to_list()
        ]
        self.metadata['region'] = regions
        all_countries = self.metadata['country_code']

        self.num_chunks = 101
        self.chunk_size = len(self.metadata) // (self.num_chunks - 1)

        if self._split_scheme.startswith('time_after'):
            year = int(self._split_scheme.split('_')[2])
            year_dt = datetime.datetime(year, 1, 1, tzinfo=pytz.UTC)
            self.test_ood_mask = np.asarray(
                pd.to_datetime(self.metadata['timestamp']) >= year_dt)
            # use 3 years of the training set as validation
            year_minus_3_dt = datetime.datetime(year - 3,
                                                1,
                                                1,
                                                tzinfo=pytz.UTC)
            self.val_ood_mask = np.asarray(
                pd.to_datetime(self.metadata['timestamp']) >= year_minus_3_dt
            ) & ~self.test_ood_mask
            self.ood_mask = self.test_ood_mask | self.val_ood_mask
        else:
            raise ValueError(
                f"Not supported: self._split_scheme = {self._split_scheme}")

        self._split_array = -1 * np.ones(len(self.metadata))
        for split in self._split_dict.keys():
            idxs = np.arange(len(self.metadata))
            if split == 'test':
                test_mask = np.asarray(self.metadata['split'] == 'test')
                idxs = idxs[self.test_ood_mask & test_mask]
            elif split == 'val':
                val_mask = np.asarray(self.metadata['split'] == 'val')
                idxs = idxs[self.val_ood_mask & val_mask]
            elif split == 'id_test':
                test_mask = np.asarray(self.metadata['split'] == 'test')
                idxs = idxs[~self.ood_mask & test_mask]
            elif split == 'id_val':
                val_mask = np.asarray(self.metadata['split'] == 'val')
                idxs = idxs[~self.ood_mask & val_mask]
            else:
                split_mask = np.asarray(self.metadata['split'] == split)
                idxs = idxs[~self.ood_mask & split_mask]

            if self.oracle_training_set and split == 'train':
                test_mask = np.asarray(self.metadata['split'] == 'test')
                unused_ood_idxs = np.arange(len(self.metadata))[self.ood_mask
                                                                & ~test_mask]
                subsample_unused_ood_idxs = subsample_idxs(unused_ood_idxs,
                                                           num=len(idxs) // 2,
                                                           seed=self.seed + 2)
                subsample_train_idxs = subsample_idxs(idxs.copy(),
                                                      num=len(idxs) // 2,
                                                      seed=self.seed + 3)
                idxs = np.concatenate(
                    [subsample_unused_ood_idxs, subsample_train_idxs])
            self._split_array[idxs] = self._split_dict[split]

        if not use_ood_val:
            self._split_dict = {
                'train': 0,
                'val': 1,
                'id_test': 2,
                'ood_val': 3,
                'test': 4
            }
            self._split_names = {
                'train': 'Train',
                'val': 'ID Val',
                'id_test': 'ID Test',
                'ood_val': 'OOD Val',
                'test': 'OOD Test'
            }

        # filter out sequestered images from full dataset
        seq_mask = np.asarray(self.metadata['split'] == 'seq')
        # take out the sequestered images
        self._split_array = self._split_array[~seq_mask]
        self.full_idxs = np.arange(len(self.metadata))[~seq_mask]

        self._y_array = np.asarray(
            [self.category_to_idx[y] for y in list(self.metadata['category'])])
        self.metadata['y'] = self._y_array
        self._y_array = torch.from_numpy(self._y_array).long()[~seq_mask]
        self._y_size = 1
        self._n_classes = 62

        # convert region to idxs
        all_regions = list(self.metadata['region'].unique())
        region_to_region_idx = {
            region: i
            for i, region in enumerate(all_regions)
        }
        self._metadata_map = {'region': all_regions}
        region_idxs = [
            region_to_region_idx[region]
            for region in self.metadata['region'].tolist()
        ]
        self.metadata['region'] = region_idxs

        # make a year column in metadata
        year_array = -1 * np.ones(len(self.metadata))
        ts = pd.to_datetime(self.metadata['timestamp'])
        for year in range(2002, 2018):
            year_mask = np.asarray(ts >= datetime.datetime(year, 1, 1, tzinfo=pytz.UTC)) \
                        & np.asarray(ts < datetime.datetime(year+1, 1, 1, tzinfo=pytz.UTC))
            year_array[year_mask] = year - 2002
        self.metadata['year'] = year_array
        self._metadata_map['year'] = list(range(2002, 2018))

        self._metadata_fields = ['region', 'year', 'y']
        self._metadata_array = torch.from_numpy(self.metadata[
            self._metadata_fields].astype(int).to_numpy()).long()[~seq_mask]

        self._eval_groupers = {
            'year':
            CombinatorialGrouper(dataset=self, groupby_fields=['year']),
            'region':
            CombinatorialGrouper(dataset=self, groupby_fields=['region']),
        }

        self._metric = Accuracy()
        super().__init__(root_dir, download, split_scheme)

    def get_input(self, idx):
        """
        Returns x for a given idx.
        """
        idx = self.full_idxs[idx]
        batch_idx = idx // self.chunk_size
        within_batch_idx = idx % self.chunk_size
        img_batch = np.load(self.root / f'rgb_all_imgs_{batch_idx}.npy',
                            mmap_mode='r')
        return img_batch[within_batch_idx]

    def eval(self, y_pred, y_true, metadata):
        # Overall evaluation + evaluate by year
        all_results, all_results_str = self.standard_group_eval(
            self._metric, self._eval_groupers['year'], y_pred, y_true,
            metadata)
        # Evaluate by region and ignore the "Other" region
        region_grouper = self._eval_groupers['region']
        region_results = self._metric.compute_group_wise(
            y_pred, y_true, region_grouper.metadata_to_group(metadata),
            region_grouper.n_groups)
        all_results[f'{self._metric.name}_worst_year'] = all_results.pop(
            self._metric.worst_group_metric_field)
        region_metric_list = []
        for group_idx in range(region_grouper.n_groups):
            group_str = region_grouper.group_field_str(group_idx)
            group_metric = region_results[self._metric.group_metric_field(
                group_idx)]
            group_counts = region_results[self._metric.group_count_field(
                group_idx)]
            all_results[f'{self._metric.name}_{group_str}'] = group_metric
            all_results[f'count_{group_str}'] = group_counts
            if region_results[self._metric.group_count_field(
                    group_idx)] == 0 or "Other" in group_str:
                continue
            all_results_str += (
                f'  {region_grouper.group_str(group_idx)}  '
                f"[n = {region_results[self._metric.group_count_field(group_idx)]:6.0f}]:\t"
                f"{self._metric.name} = {region_results[self._metric.group_metric_field(group_idx)]:5.3f}\n"
            )
            region_metric_list.append(
                region_results[self._metric.group_metric_field(group_idx)])
        all_results[f'{self._metric.name}_worst_region'] = self._metric.worst(
            region_metric_list)
        all_results_str += f"Worst-group {self._metric.name}: {all_results[f'{self._metric.name}_worst_region']:.3f}\n"

        return all_results, all_results_str
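A small worked example of the chunked-.npy lookup in get_input above: with chunk_size images per file, a flat index maps to a (file, row) pair via floor division and modulo. The chunk_size value below is illustrative only:

chunk_size = 10_000                    # illustrative, not the real value
idx = 23_456
batch_idx, within_batch_idx = divmod(idx, chunk_size)
print(batch_idx, within_batch_idx)     # 2 3456 -> rgb_all_imgs_2.npy, row 3456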
Example #14
    def __init__(self,
                 root_dir='data',
                 download=False,
                 split_scheme='official'):
        self._dataset_name = 'camelyon17'
        self._version = '1.0'
        self._download_url = 'https://worksheets.codalab.org/rest/bundles/0xe45e15f39fb54e9d9e919556af67aabe/contents/blob/'
        self._compressed_size = 10_658_709_504
        self._data_dir = self.initialize_data_dir(root_dir, download)
        self._original_resolution = (96, 96)

        # Read in metadata
        self._metadata_df = pd.read_csv(os.path.join(self._data_dir,
                                                     'metadata.csv'),
                                        index_col=0,
                                        dtype={'patient': 'str'})

        # Get the y values
        self._y_array = torch.LongTensor(self._metadata_df['tumor'].values)
        self._y_size = 1
        self._n_classes = 2

        # Get filenames
        self._input_array = [
            f'patches/patient_{patient}_node_{node}/patch_patient_{patient}_node_{node}_x_{x}_y_{y}.png'
            for patient, node, x, y in self._metadata_df.
            loc[:, ['patient', 'node', 'x_coord', 'y_coord']].itertuples(
                index=False, name=None)
        ]

        # Extract splits
        # Note that the hospital numbering here is different from what's in the paper,
        # where to avoid confusing readers we used a 1-indexed scheme and just labeled the test hospital as 5.
        # Here, the numbers are 0-indexed.
        test_center = 2
        val_center = 1

        self._split_dict = {'train': 0, 'id_val': 1, 'test': 2, 'val': 3}
        self._split_names = {
            'train': 'Train',
            'id_val': 'Validation (ID)',
            'test': 'Test',
            'val': 'Validation (OOD)',
        }
        centers = self._metadata_df['center'].values.astype('long')
        num_centers = int(np.max(centers)) + 1
        val_center_mask = (self._metadata_df['center'] == val_center)
        test_center_mask = (self._metadata_df['center'] == test_center)
        self._metadata_df.loc[val_center_mask,
                              'split'] = self.split_dict['val']
        self._metadata_df.loc[test_center_mask,
                              'split'] = self.split_dict['test']

        self._split_scheme = split_scheme
        if self._split_scheme == 'official':
            pass
        elif self._split_scheme == 'in-dist':
            # For the in-distribution oracle,
            # we move slide 23 (corresponding to patient 042, node 3 in the original dataset)
            # from the test set to the training set
            slide_mask = (self._metadata_df['slide'] == 23)
            self._metadata_df.loc[slide_mask,
                                  'split'] = self.split_dict['train']
        else:
            raise ValueError(
                f'Split scheme {self._split_scheme} not recognized')
        self._split_array = self._metadata_df['split'].values

        self._metadata_array = torch.stack(
            (torch.LongTensor(centers),
             torch.LongTensor(
                 self._metadata_df['slide'].values), self._y_array),
            dim=1)
        self._metadata_fields = ['hospital', 'slide', 'y']

        self._eval_grouper = CombinatorialGrouper(dataset=self,
                                                  groupby_fields=['slide'])

        self._metric = Accuracy()

        super().__init__(root_dir, download, split_scheme)
Example #15
    def __init__(self,
                 root_dir='data',
                 download=False,
                 split_scheme='official',
                 oracle_training_set=False,
                 seed=111,
                 use_ood_val=False):
        self._compressed_size = 70_000_000_000
        self._data_dir = self.initialize_data_dir(root_dir, download)

        self._split_dict = {
            'train': 0,
            'id_val': 1,
            'id_test': 2,
            'val': 3,
            'test': 4
        }
        self._split_names = {
            'train': 'Train',
            'id_val': 'ID Val',
            'id_test': 'ID Test',
            'val': 'OOD Val',
            'test': 'OOD Test'
        }
        if split_scheme == 'official':
            split_scheme = 'time_after_2016'
        self._split_scheme = split_scheme
        self.oracle_training_set = oracle_training_set

        self.root = Path(self._data_dir)
        self.seed = int(seed)
        self._original_resolution = (224, 224)

        self.category_to_idx = {cat: i for i, cat in enumerate(categories)}

        self.metadata = pd.read_csv(self.root / 'rgb_metadata.csv')
        country_codes_df = pd.read_csv(self.root / 'country_code_mapping.csv')
        countrycode_to_region = {
            k: v
            for k, v in zip(country_codes_df['alpha-3'],
                            country_codes_df['region'])
        }
        regions = [
            countrycode_to_region.get(code, 'Other')
            for code in self.metadata['country_code'].to_list()
        ]
        self.metadata['region'] = regions
        all_countries = self.metadata['country_code']

        self.num_chunks = 101
        self.chunk_size = len(self.metadata) // (self.num_chunks - 1)

        if self._split_scheme.startswith('time_after'):
            year = int(self._split_scheme.split('_')[2])
            year_dt = datetime.datetime(year, 1, 1, tzinfo=pytz.UTC)
            self.test_ood_mask = np.asarray(
                pd.to_datetime(self.metadata['timestamp']) >= year_dt)
            # use 3 years of the training set as validation
            year_minus_3_dt = datetime.datetime(year - 3,
                                                1,
                                                1,
                                                tzinfo=pytz.UTC)
            self.val_ood_mask = np.asarray(
                pd.to_datetime(self.metadata['timestamp']) >= year_minus_3_dt
            ) & ~self.test_ood_mask
            self.ood_mask = self.test_ood_mask | self.val_ood_mask
        else:
            raise ValueError(
                f"Not supported: self._split_scheme = {self._split_scheme}")

        self._split_array = -1 * np.ones(len(self.metadata))
        for split in self._split_dict.keys():
            idxs = np.arange(len(self.metadata))
            if split == 'test':
                test_mask = np.asarray(self.metadata['split'] == 'test')
                idxs = idxs[self.test_ood_mask & test_mask]
            elif split == 'val':
                val_mask = np.asarray(self.metadata['split'] == 'val')
                idxs = idxs[self.val_ood_mask & val_mask]
            elif split == 'id_test':
                test_mask = np.asarray(self.metadata['split'] == 'test')
                idxs = idxs[~self.ood_mask & test_mask]
            elif split == 'id_val':
                val_mask = np.asarray(self.metadata['split'] == 'val')
                idxs = idxs[~self.ood_mask & val_mask]
            else:
                split_mask = np.asarray(self.metadata['split'] == split)
                idxs = idxs[~self.ood_mask & split_mask]

            if self.oracle_training_set and split == 'train':
                test_mask = np.asarray(self.metadata['split'] == 'test')
                unused_ood_idxs = np.arange(len(self.metadata))[self.ood_mask
                                                                & ~test_mask]
                subsample_unused_ood_idxs = subsample_idxs(unused_ood_idxs,
                                                           num=len(idxs) // 2,
                                                           seed=self.seed + 2)
                subsample_train_idxs = subsample_idxs(idxs.copy(),
                                                      num=len(idxs) // 2,
                                                      seed=self.seed + 3)
                idxs = np.concatenate(
                    [subsample_unused_ood_idxs, subsample_train_idxs])
            self._split_array[idxs] = self._split_dict[split]

        if not use_ood_val:
            self._split_dict = {
                'train': 0,
                'val': 1,
                'id_test': 2,
                'ood_val': 3,
                'test': 4
            }
            self._split_names = {
                'train': 'Train',
                'val': 'ID Val',
                'id_test': 'ID Test',
                'ood_val': 'OOD Val',
                'test': 'OOD Test'
            }

        # filter out sequestered images from full dataset
        seq_mask = np.asarray(self.metadata['split'] == 'seq')
        # take out the sequestered images
        self._split_array = self._split_array[~seq_mask]
        self.full_idxs = np.arange(len(self.metadata))[~seq_mask]

        self._y_array = np.asarray(
            [self.category_to_idx[y] for y in list(self.metadata['category'])])
        self.metadata['y'] = self._y_array
        self._y_array = torch.from_numpy(self._y_array).long()[~seq_mask]
        self._y_size = 1
        self._n_classes = 62

        # convert region to idxs
        all_regions = list(self.metadata['region'].unique())
        region_to_region_idx = {
            region: i
            for i, region in enumerate(all_regions)
        }
        self._metadata_map = {'region': all_regions}
        region_idxs = [
            region_to_region_idx[region]
            for region in self.metadata['region'].tolist()
        ]
        self.metadata['region'] = region_idxs

        # make a year column in metadata
        year_array = -1 * np.ones(len(self.metadata))
        ts = pd.to_datetime(self.metadata['timestamp'])
        for year in range(2002, 2018):
            year_mask = np.asarray(ts >= datetime.datetime(year, 1, 1, tzinfo=pytz.UTC)) \
                        & np.asarray(ts < datetime.datetime(year+1, 1, 1, tzinfo=pytz.UTC))
            year_array[year_mask] = year - 2002
        self.metadata['year'] = year_array
        self._metadata_map['year'] = list(range(2002, 2018))

        self._metadata_fields = ['region', 'year', 'y']
        self._metadata_array = torch.from_numpy(self.metadata[
            self._metadata_fields].astype(int).to_numpy()).long()[~seq_mask]

        self._eval_groupers = {
            'year':
            CombinatorialGrouper(dataset=self, groupby_fields=['year']),
            'region':
            CombinatorialGrouper(dataset=self, groupby_fields=['region']),
        }

        self._metric = Accuracy()
        super().__init__(root_dir, download, split_scheme)
Example #16
class AmazonDataset(WILDSDataset):
    """
    Amazon dataset.
    This is a modified version of the 2018 Amazon Reviews dataset.

    Supported `split_scheme`:
        'official': official split, which is equivalent to 'user'
        'user': shifts to unseen reviewers
        'time': shifts from reviews written before 2013 to reviews written after 2013
        'category_subpopulation': the training distribution is a random subset following the natural distribution, and the
                                  evaluation splits include each category uniformly (to the extent it is possible)
        '*_generalization': domain generalization setting where the domains are categories. train categories vary.
        '*_baseline': oracle baseline splits for user or time shifts

    Input (x):
        Review text of maximum token length of 512.

    Label (y):
        y is the star rating (0,1,2,3,4 corresponding to 1-5 stars)

    Metadata:
        reviewer: reviewer ID
        year: year in which the review was written
        category: product category
        product: product ID

    Website:
        https://nijianmo.github.io/amazon/index.html

    Original publication:
	@inproceedings{ni2019justifying,
	  author = {J. Ni and J. Li and J. McAuley},
	  booktitle = {Empirical Methods in Natural Language Processing (EMNLP)},
	  pages = {188--197},
	  title = {Justifying recommendations using distantly-labeled reviews and fine-grained aspects},
	  year = {2019},
	}

    License:
        None. However, the original authors request that the data be used for research purposes only.
    """
    def __init__(self, root_dir='data', download=False, split_scheme='official'):
        # set variables
        self._dataset_name = 'amazon'
        self._version = '1.0'
        self._download_url = '' # REMOVED TO KEEP ANONYMITY
        self._compressed_size = 4_066_541_568
        # the official split is the user split
        if split_scheme=='official':
            split_scheme = 'user'
        self._split_scheme = split_scheme
        self._y_type = 'long'
        self._y_size = 1
        self._n_classes = 5
        # path
        self._data_dir = self.initialize_data_dir(root_dir, download)
        # Load data
        data_df = pd.read_csv(os.path.join(self.data_dir, 'reviews.csv'),
                              dtype={'reviewerID':str, 'asin':str, 'reviewTime':str,'unixReviewTime':int,
                                     'reviewText':str,'summary':str,'verified':bool,'category':str, 'reviewYear':int},
                              keep_default_na=False, na_values=[], quoting=csv.QUOTE_NONNUMERIC)
        split_df = pd.read_csv(os.path.join(self.data_dir, 'splits', f'{self.split_scheme}.csv'))
        is_in_dataset = split_df['split']!=NOT_IN_DATASET
        split_df = split_df[is_in_dataset]
        data_df = data_df[is_in_dataset]
        # Get arrays
        self._split_array = split_df['split'].values
        self._input_array = list(data_df['reviewText'])
        # Get metadata
        self._metadata_fields, self._metadata_array, self._metadata_map = self.load_metadata(data_df, self.split_array)
        # Get y from metadata
        self._y_array = getattr(self.metadata_array[:,self.metadata_fields.index('y')], self._y_type)()
        # Set split info
        self.initialize_split_dicts()
        # eval
        self.initialize_eval_grouper()
        self._metric = Accuracy()
        super().__init__(root_dir, download, split_scheme)

    def get_input(self, idx):
        return self._input_array[idx]

    def eval(self, y_pred, y_true, metadata):
        if self.split_scheme=='user':
            # first compute groupwise accuracies
            g = self._eval_grouper.metadata_to_group(metadata)
            results = {
                **self._metric.compute(y_pred, y_true),
                **self._metric.compute_group_wise(y_pred, y_true, g, self._eval_grouper.n_groups)
            }
            accs = []
            for group_idx in range(self._eval_grouper.n_groups):
                group_str = self._eval_grouper.group_field_str(group_idx)
                group_metric = results.pop(self._metric.group_metric_field(group_idx))
                group_counts = results.pop(self._metric.group_count_field(group_idx))
                results[f'{self._metric.name}_{group_str}'] = group_metric
                results[f'count_{group_str}'] = group_counts
                if group_counts>0:
                    accs.append(group_metric)
            accs = np.array(accs)
            results['10th_percentile_acc'] = np.percentile(accs, 10)
            results[f'{self._metric.worst_group_metric_field}'] = self._metric.worst(accs)
            results_str = (
                f"Average {self._metric.name}: {results[self._metric.agg_metric_field]:.3f}\n"
                f"10th percentile {self._metric.name}: {results['10th_percentile_acc']:.3f}\n"
                f"Worst-group {self._metric.name}: {results[self._metric.worst_group_metric_field]:.3f}\n"
            )
            return results, results_str
        else:
            return self.standard_group_eval(
                self._metric,
                self._eval_grouper,
                y_pred, y_true, metadata)

    def initialize_split_dicts(self):
        if self.split_scheme in ('user', 'time') or self.split_scheme.endswith('_generalization'): #category generalization
            self._split_dict = {'train': 0, 'val': 1, 'id_val': 2, 'test': 3, 'id_test': 4}
            self._split_names = {'train': 'Train', 'val': 'Validation (OOD)', 'id_val': 'Validation (ID)', 'test':'Test (OOD)', 'id_test': 'Test (ID)'}
        elif self.split_scheme in ('category_subpopulation', ):
            # use defaults
            pass
        elif self.split_scheme.endswith('_baseline'):
            # use defaults
            pass
        else:
            raise ValueError(f'Split scheme {self.split_scheme} not recognized')

    def load_metadata(self, data_df, split_array):
        # Get metadata
        columns = ['reviewerID','asin','category','reviewYear', 'overall']
        metadata_fields = ['user', 'product', 'category', 'year','y']
        metadata_df = data_df[columns].copy()
        metadata_df.columns = metadata_fields
        sort_idx = np.argsort(split_array)
        ordered_maps = {}
        for field in ['user', 'product', 'category']:
            # map to IDs in the order of split values
            ordered_maps[field] = pd.unique(metadata_df.iloc[sort_idx][field])
        ordered_maps['y'] = range(1,6)
        ordered_maps['year'] = range(metadata_df['year'].min(), metadata_df['year'].max()+1)
        metadata_map, metadata = map_to_id_array(metadata_df, ordered_maps)
        return metadata_fields, torch.from_numpy(metadata.astype('long')), metadata_map

    def initialize_eval_grouper(self):
        if self.split_scheme=='user':
            self._eval_grouper = CombinatorialGrouper(
                dataset=self,
                groupby_fields=['user'])
        elif self.split_scheme.endswith('generalization') or self.split_scheme=='category_subpopulation':
            self._eval_grouper = CombinatorialGrouper(
                dataset=self,
                groupby_fields=['category'])
        elif self.split_scheme in ('time', 'time_baseline'):
            self._eval_grouper = CombinatorialGrouper(
                dataset=self,
                groupby_fields=['year'])
        elif self.split_scheme.endswith('_baseline'): # user baselines
            self._eval_grouper = CombinatorialGrouper(
                dataset=self,
                groupby_fields=['user'])
        else:
            raise ValueError(f'Split scheme {self.split_scheme} not recognized')
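
As a quick sanity check of the per-group aggregation in eval() above, here is a minimal, self-contained sketch (the per-group accuracies and counts are hypothetical; for an accuracy-like metric, the worst group is simply the minimum):

import numpy as np

# Hypothetical per-group results; groups with zero evaluation examples are
# skipped, exactly as in the eval() loop above.
group_accs = np.array([0.92, 0.55, 0.80, 0.71])
group_counts = np.array([120, 0, 45, 300])

accs = group_accs[group_counts > 0]
print('10th percentile acc:', np.percentile(accs, 10))
print('worst-group acc:', accs.min())
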
Example #17
    def __init__(self,
                 root_dir='data',
                 download=False,
                 split_scheme='official'):
        self._dataset_name = 'cmnist4'
        self._version = '1.0'
        self._data_dir = self.initialize_data_dir(root_dir, download)
        self._original_resolution = (28, 28)

        # Read in metadata
        self._metadata_df = pd.read_csv(
            os.path.join(self._data_dir, 'metadata.csv'),
            index_col=0,
        )

        # Get the y values
        self._y_array = torch.LongTensor(self._metadata_df['digit'].values)
        # Relabel: digit 6 -> class 1, digit 9 -> class 2, all other digits -> class 0
        self._y_array = (self._y_array == 6) + (self._y_array == 9) * 2
        self._y_size = 1  # scalar label per example
        self._n_classes = 3

        # Get filenames
        self._input_array = [
            f'images/env_{env}/digit_{digit}/{image}.pt'
            for image, digit, env in
            self._metadata_df.loc[:, ['image', 'digit', 'env']].itertuples(index=False, name=None)
        ]

        # Extract splits
        # Environments are 0-indexed here: environment 4 is held out as the
        # OOD test set and environment 3 as the OOD validation set.
        test_env = 4
        val_env = 3

        self._split_dict = {'train': 0, 'id_val': 1, 'test': 2, 'val': 3}
        self._split_names = {
            'train': 'Train',
            'id_val': 'Validation (ID)',
            'test': 'Test',
            'val': 'Validation (OOD)',
        }
        envs = self._metadata_df['env'].values.astype('long')
        val_env_mask = (self._metadata_df['env'] == val_env)
        test_env_mask = (self._metadata_df['env'] == test_env)
        self._metadata_df.loc[val_env_mask, 'split'] = self.split_dict['val']
        self._metadata_df.loc[test_env_mask, 'split'] = self.split_dict['test']

        self._split_scheme = split_scheme
        if self._split_scheme != 'official':
            raise ValueError(
                f'Split scheme {self._split_scheme} not recognized')

        self._split_array = self._metadata_df['split'].values

        self._metadata_array = torch.stack(
            [torch.LongTensor(envs), self._y_array], dim=1)
        self._metadata_fields = ['env', 'y']

        self._eval_grouper = CombinatorialGrouper(dataset=self,
                                                  groupby_fields=['env'])

        self._metric = Accuracy()

        super().__init__(root_dir, download, split_scheme)
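
The relabeling in the constructor above relies on PyTorch type promotion: boolean masks are promoted to integers when combined arithmetically. A toy check, runnable on its own:

import torch

digits = torch.tensor([0, 6, 9, 3, 6])
# digit 6 -> class 1, digit 9 -> class 2, every other digit -> class 0
labels = (digits == 6) + (digits == 9) * 2
print(labels)  # tensor([0, 1, 2, 0, 1])
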
    'ogb-molpcba': OGBPCBADataset,
    'poverty': PovertyMapDataset,
    'fmow': FMoWDataset,
    'bdd100k': BDD100KDataset,
}

losses = {
    'cross_entropy': ElementwiseLoss(loss_fn=nn.CrossEntropyLoss(reduction='none')),
    'mse': MSE(name='loss'),
    'multitask_bce': MultiTaskLoss(loss_fn=nn.BCEWithLogitsLoss(reduction='none')),
}

algo_log_metrics = {
    'accuracy': Accuracy(),
    'mse': MSE(),
    'multitask_accuracy': MultiTaskAccuracy(),
    None: None,
}

# see initialize_*() functions for correspondence
transforms = [
    'bert', 'image_base', 'image_resize_and_center_crop', 'poverty_train'
]
models = [
    'resnet18_ms', 'resnet50', 'resnet34', 'wideresnet50', 'densenet121',
    'bert-base-uncased', 'gin-virtual', 'logistic_regression'
]
algorithms = ['ERM', 'groupDRO', 'deepCORAL', 'IRM']
optimizers = ['SGD', 'Adam', 'AdamW']
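
These registries are plain dicts and lists keyed by config strings. A typical lookup looks like the sketch below, where loss_name and metric_name stand in for values that would normally come from the experiment config (hypothetical here):

loss_name = 'cross_entropy'    # e.g. config.loss_function
metric_name = 'accuracy'       # e.g. config.algo_log_metric

criterion = losses[loss_name]               # ElementwiseLoss wrapping nn.CrossEntropyLoss
log_metric = algo_log_metrics[metric_name]  # Accuracy()
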
    def __init__(self,
                 root_dir='data',
                 download=False,
                 split_scheme='official'):

        self._dataset_name = 'iwildcam'
        self._version = '1.0'
        self._split_scheme = split_scheme
        if self._split_scheme != 'official':
            raise ValueError(
                f'Split scheme {self._split_scheme} not recognized')

        # path
        self._download_url = ''
        self._compressed_size = 90_094_666_806
        self._data_dir = Path(self.initialize_data_dir(root_dir, download))

        # Load splits
        train_df = pd.read_csv(self._data_dir / 'train.csv')
        val_trans_df = pd.read_csv(self._data_dir / 'val_trans.csv')
        test_trans_df = pd.read_csv(self._data_dir / 'test_trans.csv')
        val_cis_df = pd.read_csv(self._data_dir / 'val_cis.csv')
        test_cis_df = pd.read_csv(self._data_dir / 'test_cis.csv')

        # Merge all dfs
        train_df['split'] = 'train'
        val_trans_df['split'] = 'val'
        test_trans_df['split'] = 'test'
        val_cis_df['split'] = 'id_val'
        test_cis_df['split'] = 'id_test'
        df = pd.concat(
            [train_df, val_trans_df, test_trans_df, test_cis_df, val_cis_df])

        # Splits
        data = {}
        self._split_dict = {
            'train': 0,
            'val': 1,
            'test': 2,
            'id_val': 3,
            'id_test': 4
        }
        self._split_names = {
            'train': 'Train',
            'val': 'Validation (OOD/Trans)',
            'test': 'Test (OOD/Trans)',
            'id_val': 'Validation (ID/Cis)',
            'id_test': 'Test (ID/Cis)'
        }

        df['split_id'] = df['split'].apply(lambda x: self._split_dict[x])
        self._split_array = df['split_id'].values

        # Filenames
        self._input_array = df['filename'].values

        # Labels
        unique_categories = np.unique(df['category_id'])
        self._n_classes = len(unique_categories)
        category_to_label = {
            category: label for label, category in enumerate(unique_categories)
        }
        label_to_category = {
            label: category for category, label in category_to_label.items()
        }
        self._y_array = torch.tensor(
            df['category_id'].apply(lambda x: category_to_label[x]).values)
        self._y_size = 1

        # Location/group info
        location_ids = df['location']
        locations = np.unique(location_ids)
        n_groups = len(locations)
        location_to_group_id = {locations[i]: i for i in range(n_groups)}
        df['group_id'] = df['location'].apply(
            lambda x: location_to_group_id[x])

        self._n_groups = n_groups
        self._metadata_array = torch.tensor(
            np.stack([df['group_id'].values, self.y_array], axis=1))
        self._metadata_fields = ['location', 'y']
        # eval grouper
        self._eval_grouper = CombinatorialGrouper(dataset=self,
                                                  groupby_fields=['location'])

        self._metrics = [
            Accuracy(),
            Recall(average='macro'),
            Recall(average='weighted'),
            F1(average='macro'),
            F1(average='weighted')
        ]
        super().__init__(root_dir, download, split_scheme)
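
The contiguous-label mapping above matters because the raw category_id values are sparse. A standalone sketch with made-up IDs:

import numpy as np

raw_ids = np.array([3, 17, 3, 42, 17])
unique_categories = np.unique(raw_ids)  # [ 3 17 42]
category_to_label = {c: i for i, c in enumerate(unique_categories)}
label_to_category = {i: c for c, i in category_to_label.items()}
labels = np.array([category_to_label[c] for c in raw_ids])
print(labels)  # [0 1 0 2 1]
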
Example #20
    def __init__(self, root_dir='data', download=False, split_scheme='official'):
        self._dataset_name = 'vlcs'
        self._version = '1.0'
        # self._download_url = 'https://drive.google.com/uc?id=1skwblH1_okBwxWxmRsp9_qi15hyPpxg8'
        self._data_dir = self.initialize_data_dir(root_dir, download)
        self._resolution = (224, 224)

        # Read in metadata
        self._metadata_df = pd.read_csv(
            os.path.join(self._data_dir, 'metadata.csv'),
            index_col=0
        )

        # Get the y values
        self._label_map = {
            'bird': 0,
            'car': 1,
            'chair': 2,
            'dog': 3,
            'person': 4
        }
        self._label_array = self._metadata_df['label'].values
        self._y_array = torch.LongTensor([self._label_map[y] for y in self._label_array])
        self._y_size = 1
        self._n_classes = 5

        # Get filenames
        self._input_array = [
            f'{env}/{label}/{image}'
            for image, label, env in
            self._metadata_df.loc[:, ['image', 'label', 'env']].itertuples(index=False, name=None)]

        # No environment is held out as OOD test here; 'VOC2007' serves as the
        # OOD validation environment instead.
        test_env = ''  # originally 'VOC2007'
        val_env = 'VOC2007'

        self._split_dict = {
            'train': 0,
            'id_val': 1,
            'test': 2,
            'val': 3
        }
        self._split_names = {
            'train': 'Train',
            'id_val': 'Validation (ID)',
            'test': 'Test',
            'val': 'Validation (OOD)',
        }

        env_map = {
            'SUN09': 0,
            'LabelMe': 1,
            'Caltech101': 2,
            'VOC2007': 3
        }
        env_names = self._metadata_df['env'].values
        envs = [env_map[name] for name in env_names]

        val_env_mask = (self._metadata_df['env'] == val_env)
        test_env_mask = (self._metadata_df['env'] == test_env)
        self._metadata_df.loc[val_env_mask, 'split'] = self.split_dict['val']
        self._metadata_df.loc[test_env_mask, 'split'] = self.split_dict['test']

        self._split_scheme = split_scheme
        if self._split_scheme != 'official':
            raise ValueError(f'Split scheme {self._split_scheme} not recognized')
        self._split_array = self._metadata_df['split'].values

        self._metadata_array = torch.stack(
            (torch.LongTensor(envs),
             self._y_array),
            dim=1)
        self._metadata_fields = ['env', 'y']

        self._eval_grouper = CombinatorialGrouper(
            dataset=self,
            groupby_fields=['env'])

        self._metric = Accuracy()

        super().__init__(root_dir, download, split_scheme)
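
The input paths above are assembled directly from the metadata columns. A toy version with two hypothetical rows shows the env/label/image layout:

import pandas as pd

df = pd.DataFrame({
    'image': ['img_001.jpg', 'img_002.jpg'],
    'label': ['bird', 'dog'],
    'env': ['SUN09', 'VOC2007'],
})
paths = [
    f'{env}/{label}/{image}'
    for image, label, env in df.loc[:, ['image', 'label', 'env']].itertuples(index=False, name=None)
]
print(paths)  # ['SUN09/bird/img_001.jpg', 'VOC2007/dog/img_002.jpg']
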
Example #21
class CivilCommentsDataset(WILDSDataset):
    """
    The CivilComments-wilds toxicity classification dataset.
    This is a modified version of the original CivilComments dataset.

    Supported `split_scheme`:
        'official'

    Input (x):
        A comment on an online article, comprising one or more sentences of text.

    Label (y):
        y is binary. It is 1 if the comment has been rated toxic by a majority of the crowdworkers who saw it, and 0 otherwise.

    Metadata:
        Each comment is annotated with the following binary indicators:
            - male
            - female
            - LGBTQ
            - christian
            - muslim
            - other_religions
            - black
            - white
            - identity_any
            - severe_toxicity
            - obscene
            - threat
            - insult
            - identity_attack
            - sexual_explicit

    Website:
        https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification

    Original publication:
        @inproceedings{borkan2019nuanced,
          title={Nuanced metrics for measuring unintended bias with real data for text classification},
          author={Borkan, Daniel and Dixon, Lucas and Sorensen, Jeffrey and Thain, Nithum and Vasserman, Lucy},
          booktitle={Companion Proceedings of The 2019 World Wide Web Conference},
          pages={491--500},
          year={2019}
        }

    License:
        This dataset is in the public domain and is distributed under CC0.
        https://creativecommons.org/publicdomain/zero/1.0/
    """
    def __init__(self,
                 root_dir='data',
                 download=False,
                 split_scheme='official'):
        self._dataset_name = 'civilcomments'
        self._version = '1.0'
        self._download_url = ''
        self._compressed_size = 90_644_480
        self._data_dir = self.initialize_data_dir(root_dir, download)

        # Read in metadata
        self._metadata_df = pd.read_csv(
            os.path.join(self._data_dir, 'all_data_with_identities.csv'),
            index_col=0)

        # Get the y values
        self._y_array = torch.LongTensor(
            self._metadata_df['toxicity'].values >= 0.5)
        self._y_size = 1
        self._n_classes = 2

        # Extract text
        self._text_array = list(self._metadata_df['comment_text'])

        # Extract splits
        self._split_scheme = split_scheme
        if self._split_scheme != 'official':
            raise ValueError(
                f'Split scheme {self._split_scheme} not recognized')
        # metadata_df stores split names as strings, so convert them to ints
        for split in self.split_dict:
            split_indices = self._metadata_df['split'] == split
            self._metadata_df.loc[split_indices, 'split'] = self.split_dict[split]
        self._split_array = self._metadata_df['split'].values

        # Extract metadata
        self._identity_vars = [
            'male', 'female', 'LGBTQ', 'christian', 'muslim',
            'other_religions', 'black', 'white'
        ]
        self._auxiliary_vars = [
            'identity_any', 'severe_toxicity', 'obscene', 'threat', 'insult',
            'identity_attack', 'sexual_explicit'
        ]

        self._metadata_array = torch.cat(
            (
                torch.LongTensor((self._metadata_df.loc[:, self._identity_vars] >= 0.5).values),
                torch.LongTensor((self._metadata_df.loc[:, self._auxiliary_vars] >= 0.5).values),
                self._y_array.reshape((-1, 1)),
            ),
            dim=1)
        self._metadata_fields = self._identity_vars + self._auxiliary_vars + ['y']

        self._eval_groupers = [
            CombinatorialGrouper(dataset=self,
                                 groupby_fields=[identity_var, 'y'])
            for identity_var in self._identity_vars
        ]
        self._metric = Accuracy()

        super().__init__(root_dir, download, split_scheme)

    def get_input(self, idx):
        return self._text_array[idx]

    def eval(self, y_pred, y_true, metadata):
        results = {
            **self._metric.compute(y_pred, y_true),
        }
        results_str = f"Average {self._metric.name}: {results[self._metric.agg_metric_field]:.3f}\n"
        # Each eval_grouper is over label + a single identity
        # We only want to keep the groups where the identity is positive
        # The groups are:
        #   Group 0: identity = 0, y = 0
        #   Group 1: identity = 1, y = 0
        #   Group 2: identity = 0, y = 1
        #   Group 3: identity = 1, y = 1
        # so this means we want only groups 1 and 3.
        worst_group_metric = None
        for identity_var, eval_grouper in zip(self._identity_vars,
                                              self._eval_groupers):
            g = eval_grouper.metadata_to_group(metadata)
            group_results = {
                **self._metric.compute_group_wise(y_pred, y_true, g, eval_grouper.n_groups)
            }
            results_str += f"  {identity_var:20s}"
            for group_idx in range(eval_grouper.n_groups):
                group_str = eval_grouper.group_field_str(group_idx)
                if f'{identity_var}:1' in group_str:
                    group_metric = group_results[
                        self._metric.group_metric_field(group_idx)]
                    group_counts = group_results[
                        self._metric.group_count_field(group_idx)]
                    results[f'{self._metric.name}_{group_str}'] = group_metric
                    results[f'count_{group_str}'] = group_counts
                    if 'y:0' in group_str:
                        label_str = 'non_toxic'
                    else:
                        label_str = 'toxic'
                    results_str += (
                        f"   {self._metric.name} on {label_str}: {group_metric:.3f}"
                        f" (n = {results[f'count_{group_str}']:6.0f}) ")
                    if worst_group_metric is None:
                        worst_group_metric = group_metric
                    else:
                        worst_group_metric = self._metric.worst(
                            [worst_group_metric, group_metric])
            results_str += "\n"
        results[self._metric.worst_group_metric_field] = worst_group_metric
        results_str += f"Worst-group {self._metric.name}: {worst_group_metric:.3f}\n"

        return results, results_str
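
The four-group layout documented in eval() above comes from crossing two binary fields, so (assuming that ordering) the group index reduces to identity + 2 * y; a minimal check:

import numpy as np

identity = np.array([0, 1, 0, 1])
y = np.array([0, 0, 1, 1])
print(identity + 2 * y)  # [0 1 2 3] -> keep groups 1 and 3 (identity == 1)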