Example #1
def make_tuple_from_data(train: em.DataTuple, test: em.DataTuple,
                         pred_s: bool) -> Tuple[em.DataTuple, em.DataTuple]:
    train_x = train.x
    test_x = test.x

    if pred_s:
        train_y = train.s
        test_y = test.s
    else:
        train_y = train.y
        test_y = test.y

    return em.DataTuple(x=train_x, s=train.s,
                        y=train_y), em.DataTuple(x=test_x, s=test.s, y=test_y)
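A minimal usage sketch (the toy DataFrames below are illustrative assumptions; em is EthicML, as in the snippet itself):

import ethicml as em
import pandas as pd

train = em.DataTuple(
    x=pd.DataFrame({"x0": [0.1, 0.9, 0.4]}),
    s=pd.DataFrame({"s": [0, 1, 1]}),
    y=pd.DataFrame({"y": [1, 0, 1]}),
)
test = train  # stand-in for a real held-out split

# With pred_s=True the sensitive attribute becomes the prediction target,
# so a downstream classifier is evaluated on how well it predicts s.
train_s, test_s = make_tuple_from_data(train, test, pred_s=True)
assert train_s.y.equals(train.s)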
Example #2
    def inference_epoch_end(self, outputs: EPOCH_OUTPUT,
                            stage: Stage) -> Dict[str, float]:
        targets_all = aggregate_over_epoch(outputs=outputs, metric="targets")
        subgroup_inf_all = aggregate_over_epoch(outputs=outputs,
                                                metric="subgroup_inf")
        logits_y_all = aggregate_over_epoch(outputs=outputs, metric="logits_y")

        preds_y_all = hard_prediction(logits_y_all)

        # x is a random placeholder: the metrics below depend only on s and y.
        dt = em.DataTuple(
            x=pd.DataFrame(
                torch.rand_like(subgroup_inf_all, dtype=torch.float).detach().cpu().numpy(),
                columns=["x0"],
            ),
            s=pd.DataFrame(subgroup_inf_all.detach().cpu().numpy(),
                           columns=["s"]),
            y=pd.DataFrame(targets_all.detach().cpu().numpy(), columns=["y"]),
        )

        return em.run_metrics(
            predictions=em.Prediction(
                hard=pd.Series(preds_y_all.detach().cpu().numpy())),
            actual=dt,
            metrics=[em.Accuracy(),
                     em.RenyiCorrelation(),
                     em.Yanovich()],
            per_sens_metrics=[em.Accuracy(),
                              em.ProbPos(),
                              em.TPR()],
        )
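This example leans on two helpers from the surrounding codebase. The sketches below are assumptions reconstructed from how the helpers are used, not the real implementations:

import torch
from torch import Tensor

def aggregate_over_epoch(outputs, metric: str) -> Tensor:
    # Concatenate the per-step tensors logged under `metric`.
    return torch.cat([step[metric] for step in outputs], dim=0)

def hard_prediction(logits: Tensor) -> Tensor:
    # Threshold binary logits at zero; take the argmax for multiclass logits.
    return (logits > 0).long() if logits.ndim == 1 else logits.argmax(dim=1)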
Example #3
    def _inference_epoch_end(self, output_results: List[Dict[str, Tensor]],
                             stage: str) -> None:
        all_y = torch.cat([_r["y"] for _r in output_results], 0)
        all_s = torch.cat([_r["s"] for _r in output_results], 0)
        all_preds = torch.cat([_r["preds"] for _r in output_results], 0)

        dt = em.DataTuple(
            x=pd.DataFrame(torch.rand_like(all_s,
                                           dtype=torch.float).detach().cpu().numpy(),
                           columns=["x0"]),
            s=pd.DataFrame(all_s.detach().cpu().numpy(), columns=["s"]),
            y=pd.DataFrame(all_y.detach().cpu().numpy(), columns=["y"]),
        )

        results = em.run_metrics(
            predictions=em.Prediction(
                hard=pd.Series(all_preds.detach().cpu().numpy())),
            actual=dt,
            metrics=[em.Accuracy(),
                     em.RenyiCorrelation(),
                     em.Yanovich()],
            per_sens_metrics=[em.Accuracy(),
                              em.ProbPos(),
                              em.TPR()],
        )

        tm_acc = self.val_acc if stage == "val" else self.test_acc
        acc = tm_acc.compute().item()
        results_dict = {f"{stage}/acc": acc}
        results_dict.update(
            {f"{stage}/{self.target}_{k}": v
             for k, v in results.items()})

        self.log_dict(results_dict)
Example #4
    def inference_epoch_end(self, outputs: EPOCH_OUTPUT, stage: Stage) -> Dict[str, float]:
        targets_all = aggregate_over_epoch(outputs=outputs, metric="targets")
        subgroup_inf_all = aggregate_over_epoch(outputs=outputs, metric="subgroup_inf")
        preds_all = aggregate_over_epoch(outputs=outputs, metric="preds")

        mean_preds = preds_all.mean(-1)
        mean_preds_s0 = preds_all[subgroup_inf_all == 0].mean(-1)
        mean_preds_s1 = preds_all[subgroup_inf_all == 1].mean(-1)

        dt = em.DataTuple(
            x=pd.DataFrame(
                torch.rand_like(subgroup_inf_all, dtype=torch.float).detach().cpu().numpy(),
                columns=["x0"],
            ),
            s=pd.DataFrame(subgroup_inf_all.detach().cpu().numpy(), columns=["s"]),
            y=pd.DataFrame(targets_all.detach().cpu().numpy(), columns=["y"]),
        )

        results_dict = em.run_metrics(
            predictions=em.Prediction(hard=pd.Series((preds_all > 0).detach().cpu().numpy())),
            actual=dt,
            metrics=[em.Accuracy(), em.RenyiCorrelation(), em.Yanovich()],
            per_sens_metrics=[em.Accuracy(), em.ProbPos(), em.TPR()],
        )

        results_dict.update(
            {
                "DP_Gap": float((mean_preds_s0 - mean_preds_s1).abs().item()),
                "mean_pred": float(mean_preds.item()),
            }
        )
        return results_dict
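The DP_Gap entry above is the demographic-parity gap: the absolute difference between the two subgroups' mean predictions. A tiny worked example with made-up numbers:

import torch

preds = torch.tensor([0.9, 0.2, 0.7, 0.1])
s = torch.tensor([0, 0, 1, 1])

# |mean(preds | s=0) - mean(preds | s=1)| = |0.55 - 0.40| = 0.15
dp_gap = (preds[s == 0].mean() - preds[s == 1].mean()).abs()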
Example #5
    def __init__(self, ti: emvi.TorchImageDataset):
        self.ti = ti
        # Pull out the data components for compatibility with the extract_labels function
        self.x = ti.x
        self.s = ti.s
        self.y = ti.y

        # x is a dummy column: the instance weights depend only on s and y.
        dt = em.DataTuple(
            x=pd.DataFrame(np.random.randint(0, len(ti.s),
                                             size=(len(ti.s), 1)),
                           columns=["x"]),
            s=pd.DataFrame(ti.s.cpu().numpy(), columns=["s"]),
            y=pd.DataFrame(ti.y.cpu().numpy(), columns=["y"]),
        )
        self.iws = torch.tensor(
            em.compute_instance_weights(dt)["instance weights"].values)
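The iws tensor holds one weight per sample, taken from the "instance weights" column that em.compute_instance_weights returns. A hedged sketch of feeding those weights into a balanced sampler; make_balanced_loader is a hypothetical helper:

from torch.utils.data import DataLoader, WeightedRandomSampler

def make_balanced_loader(dataset, batch_size: int = 32) -> DataLoader:
    # dataset.iws is the per-sample weight tensor built in __init__ above.
    sampler = WeightedRandomSampler(weights=dataset.iws,
                                    num_samples=len(dataset.iws),
                                    replacement=True)
    return DataLoader(dataset, batch_size=batch_size, sampler=sampler)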
Example #6
def pytorch_data_to_dataframe(dataset, sens_attrs=None):
    """Load a PyTorch dataset into a DataTuple consisting of Pandas DataFrames.

    Args:
        dataset: PyTorch dataset
        sens_attrs: (optional) list of names of the sensitive attributes
    """
    # create data loader with one giant batch
    data_loader = DataLoader(dataset, batch_size=len(dataset), shuffle=False)
    # get the data
    data = next(iter(data_loader))
    # convert it to Pandas DataFrames
    data = [pd.DataFrame(tensor.detach().cpu().numpy()) for tensor in data]
    if sens_attrs is not None:
        data[1].columns = sens_attrs
    # create a DataTuple
    return em.DataTuple(x=data[0], s=data[1], y=data[2])
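A usage sketch with a toy TensorDataset of (x, s, y) triples; the shapes and column name are illustrative:

import torch
from torch.utils.data import TensorDataset

x = torch.randn(8, 3)
s = torch.randint(0, 2, (8, 1))
y = torch.randint(0, 2, (8, 1))

dt = pytorch_data_to_dataframe(TensorDataset(x, s, y), sens_attrs=["sex"])
print(dt.s.columns)  # Index(['sex'], dtype='object')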
Example #7
    def __init__(self, dataset: DataTuple, disc_features: List[str],
                 cont_features: List[str]):
        """Create DataTupleDataset."""
        disc_features = [
            feat for feat in disc_features if feat in dataset.x.columns
        ]
        self.disc_features = disc_features

        cont_features = [
            feat for feat in cont_features if feat in dataset.x.columns
        ]
        self.cont_features = cont_features
        self.feature_groups = dict(
            discrete=grouped_features_indexes(self.disc_features))

        self.x_disc = dataset.x[self.disc_features].to_numpy(dtype=np.float32)
        self.x_cont = dataset.x[self.cont_features].to_numpy(dtype=np.float32)

        (
            _,
            self.s,
            self.num,
            self.xdim,
            self.sdim,
            self.x_names,
            self.s_names,
        ) = _get_info(dataset)

        self.y = dataset.y.to_numpy(dtype=np.float32)

        self.ydim = dataset.y.shape[1]
        self.y_names = dataset.y.columns

        # As in Example #5, x is a dummy column for compute_instance_weights.
        dt = em.DataTuple(
            x=pd.DataFrame(np.random.randint(0,
                                             len(self.s),
                                             size=(len(self.s), 1)),
                           columns=["x"]),
            s=pd.DataFrame(self.s, columns=["s"]),
            y=pd.DataFrame(self.y, columns=["y"]),
        )
        self.iws = torch.tensor(
            em.compute_instance_weights(dt)["instance weights"].values)
Example #8
    def _run_epoch(
        self,
        model,
        dataloader,
        optimize=False,
        save_activations=False,
        reweight=False,
        bit_pretrained=False,
        adv_metrics=False,
    ):
        """Runs the model on a given dataloader.

        Note:
            The latter item in the returned tuple is what is necessary to run
            GEORGECluster.train and GEORGECluster.evaluate.

        Args:
            model(nn.Module): A PyTorch model.
            dataloader(DataLoader): The dataloader. The dataset within must
                subclass GEORGEDataset.
            optimize(bool, optional): If True, the model is trained using
                self.criterion. Default is False.
            save_activations(bool, optional): If True, saves the activations in
                `outputs`. Default is False.
            bit_pretrained(bool, optional): If True, assumes a BiT-pretrained
                model and skips evaluation of the performance metrics.
                Default is False.

        Returns:
            metrics(Dict[str, Any]): A dictionary object that stores the metrics
                defined in self.config['metric_types'].
            outputs(Dict[str, Any]): A dictionary object that stores artifacts
                necessary for model analysis, including labels, activations,
                and predictions.
        """
        dataset = dataloader.dataset
        self._check_dataset(dataset)
        type_to_num_classes = {
            label_type: dataset.get_num_classes(label_type)
            for label_type in LABEL_TYPES
            if label_type in dataset.Y_dict.keys()
        }
        outputs = defaultdict(list)
        activations_handle = self._init_activations_hook(model, outputs["activations"])
        if optimize:
            progress_prefix = "Training"
            model.train()
        else:
            progress_prefix = "Evaluation"
            model.eval()

        with tqdm(desc=progress_prefix, total=len(dataloader)) as pbar:
            for inputs, targets in dataloader:
                if self.use_cuda:
                    inputs, targets = move_to_device([inputs, targets], device=self.device)

                type_to_labels = {}
                for label_type in type_to_num_classes.keys():
                    type_to_labels[label_type] = targets[label_type]
                    outputs[label_type].append(targets[label_type])

                if optimize and not bit_pretrained:
                    logits = model(inputs)
                    loss_targets = targets["superclass"]
                    co = self.criterion(logits, loss_targets, targets["subclass"].long())
                    loss, (losses, corrects), _ = co
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()
                else:
                    with torch.no_grad():
                        logits = model(inputs)
                        loss_targets = targets["superclass"]
                        co = self.criterion(logits, loss_targets, targets["subclass"].long())
                        loss, (losses, corrects), _ = co

                if logits.size(1) == 1:
                    probs = logits.sigmoid().squeeze()
                    preds = probs.round()
                else:
                    probs = logits.softmax(dim=1)
                    preds = logits.argmax(dim=1)
                outputs["probs"].append(probs.detach().cpu())
                outputs["preds"].append(preds.detach().cpu())
                outputs["losses"].append(losses.detach().cpu())
                outputs["targets"].append(loss_targets.detach().cpu())

                pbar.set_postfix(loss=loss.item(), acc=corrects.float().mean().item())
                pbar.update()
                if not save_activations:
                    outputs["activations"].pop()  # discard this batch's activations

        outputs_cat = {}
        for key, value in outputs.items():
            if value:
                value = torch.cat(value, dim=0).detach().cpu().numpy()
            outputs_cat[key] = value
        del outputs

        # Only s and y matter to these metrics; the subclass labels double as a
        # stand-in for x.
        superclass_labels = pd.DataFrame(outputs_cat["superclass"], columns=["superclass"])
        subclass_labels = pd.DataFrame(outputs_cat["true_subclass"], columns=["subclass"])
        actual = em.DataTuple(x=subclass_labels, s=subclass_labels, y=superclass_labels)
        predictions = em.Prediction(hard=pd.Series(outputs_cat["preds"]))
        outputs_cat["metrics"] = compute_metrics(
            predictions=predictions, actual=actual, s_dim=dataset.get_num_classes("true_subclass")
        )

        if activations_handle:
            activations_handle.remove()

        return outputs_cat["metrics"], outputs_cat
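An illustrative call pattern; trainer, model, and val_loader are assumptions standing in for the surrounding training harness:

metrics, outputs = trainer._run_epoch(model, val_loader, optimize=False)
print(metrics)                 # entries from trainer.config["metric_types"]
print(outputs["preds"].shape)  # concatenated epoch predictions (numpy array)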
Example #9
def evaluate(
    cfg: Config,
    step: int,
    train_data: "Dataset[Tuple[Tensor, Tensor, Tensor]]",
    test_data: "Dataset[Tuple[Tensor, Tensor, Tensor]]",
    name: str,
    eval_on_recon: bool = True,
    pred_s: bool = False,
    save_to_csv: Optional[Path] = None,
    cluster_test_metrics: Optional[Dict[str, float]] = None,
    cluster_context_metrics: Optional[Dict[str, float]] = None,
):
    input_shape = next(iter(train_data))[0].shape
    additional_entries = {}
    if cluster_test_metrics is not None:
        additional_entries.update(
            {f"Clust/Test {k}": v
             for k, v in cluster_test_metrics.items()})
    if cluster_context_metrics is not None:
        additional_entries.update({
            f"Clust/Context {k}": v
            for k, v in cluster_context_metrics.items()
        })

    if cfg.data.dataset in (DS.cmnist, DS.celeba, DS.genfaces):
        train_loader = DataLoader(train_data,
                                  batch_size=cfg.fdm.batch_size,
                                  shuffle=True,
                                  pin_memory=True)
        test_loader = DataLoader(test_data,
                                 batch_size=cfg.fdm.test_batch_size,
                                 shuffle=False,
                                 pin_memory=True)

        clf: Classifier = fit_classifier(
            cfg,
            input_shape,
            train_data=train_loader,
            train_on_recon=eval_on_recon,
            pred_s=pred_s,
            test_data=test_loader,
        )

        preds, labels, sens = clf.predict_dataset(test_loader,
                                                  device=torch.device(
                                                      cfg.misc._device))
        preds = em.Prediction(hard=pd.Series(preds))
        if cfg.data.dataset == DS.cmnist:
            sens_name = "colour"
        elif cfg.data.dataset == DS.celeba:
            sens_name = cfg.data.celeba_sens_attr
        else:
            sens_name = "sens_Label"
        sens_pd = pd.DataFrame(sens.numpy().astype(np.float32),
                               columns=[sens_name])
        labels_pd = pd.DataFrame(labels, columns=["labels"])
        actual = em.DataTuple(x=sens_pd,
                              s=sens_pd,
                              y=sens_pd if pred_s else labels_pd)
        compute_metrics(
            cfg,
            preds,
            actual,
            name,
            "pytorch_classifier",
            step=step,
            save_to_csv=save_to_csv,
            results_csv=cfg.misc.results_csv,
            use_wandb=cfg.misc.use_wandb,
            additional_entries=additional_entries,
        )
    else:
        if not isinstance(train_data, em.DataTuple):
            train_data, test_data = get_data_tuples(train_data, test_data)

        train_data, test_data = make_tuple_from_data(train_data,
                                                     test_data,
                                                     pred_s=pred_s)
        # Optionally also try em.SVM(kernel="linear").
        for eth_clf in [em.LR(), em.LRCV()]:
            preds = eth_clf.run(train_data, test_data)
            compute_metrics(
                cfg,
                preds,
                test_data,
                name,
                eth_clf.name,
                step=step,
                save_to_csv=save_to_csv,
                results_csv=cfg.misc.results_csv,
                use_wandb=cfg.misc.use_wandb,
                additional_entries=additional_entries,
            )
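An illustrative invocation; the Config instance and the two datasets are assumptions matching the signature above:

evaluate(cfg, step=0,
         train_data=train_ds, test_data=test_ds,
         name="baseline", pred_s=False)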