Example #1
0
    def score(
        self,
        data,
        metric="accuracy",
        break_ties="random",
        verbose=True,
        print_confusion_matrix=True,
        **kwargs,
    ):
        """Scores the predictive performance of the Classifier on all tasks

        Args:
            data: a Pytorch DataLoader, Dataset, or tuple with Tensors (X,Y):
                X: The input for the predict method
                Y: An [n] or [n, 1] torch.Tensor or np.ndarray of target labels
                    in {1,...,k}
            metric: A metric (string) with which to score performance or a
                list of such metrics
            break_ties: A tie-breaking policy (see Classifier._break_ties())
            verbose: The verbosity for just this score method; it will not
                update the class config.
            print_confusion_matrix: Print confusion matrix (overwritten to False if
                verbose=False)

        Returns:
            scores: A (float) score or a list of such scores if kwarg metric
                is a list
        """
        Y_p, Y, Y_s = self._get_predictions(
            data, break_ties=break_ties, return_probs=True, **kwargs
        )

        # Evaluate on the specified metrics; remember whether the caller
        # passed a list so we know which return shape to use at the end.
        return_list = isinstance(metric, list)
        metric_list = metric if return_list else [metric]
        scores = []
        # NOTE: use a distinct loop variable so the `metric` parameter is
        # not shadowed (the original looped over `metric` itself).
        for metric_name in metric_list:
            score = metric_score(
                Y, Y_p, metric_name, probs=Y_s, ignore_in_gold=[0]
            )
            scores.append(score)
            if verbose:
                # Some metrics return a list of per-class scores, which
                # cannot be formatted with a float spec.
                if isinstance(score, list):
                    print(f"{metric_name.capitalize()}: {score}")
                else:
                    print(f"{metric_name.capitalize()}: {score:.7f}")

        # Optionally print confusion matrix
        if print_confusion_matrix and verbose:
            confusion_matrix(Y, Y_p, pretty_print=True)

        # If a single metric was given as a string (not list), return a float
        if len(scores) == 1 and not return_list:
            return scores[0]
        else:
            return scores
Example #2
0
               plot_title="Disease Associates Gene Dev PRC",
               metric='PR',
               font_size=16)

# In[21]:

# Fit a generative label model (binary: k=2) on the dev-split label matrix.
# NOTE(review): validation_data[1][0] appears to be the matrix used for
# training and validation_data[1][1] the matrix used for prediction —
# confirm against the cell that constructs validation_data.
label_model = LabelModel(k=2, seed=100)
label_model.train_model(validation_data[1][0],
                        n_epochs=1000,
                        verbose=False,
                        lr=0.01,
                        l2=2.067)
# Hard predictions, converted from categorical {1,2} to one/zero encoding.
dev_predictions = convert_labels(label_model.predict(validation_data[1][1]),
                                 'categorical', 'onezero')
# Column 0 of the soft labels — presumably P(label == 1); verify convention.
dev_marginals = label_model.predict_proba(validation_data[1][1])[:, 0]

# In[22]:

# Histogram of predictions vs. the curated gold labels on the dev set.
plt.rcParams.update({'font.size': 16})
plt.figure(figsize=(10, 6))
plot_predictions_histogram(dev_predictions,
                           candidate_dfs['dev'].curated_dsh.astype(int).values,
                           title="Prediction Histogram for Dev Set")

# In[23]:

# Dev-set confusion matrix; both gold and predicted labels are converted
# back to the categorical {1,2} encoding expected by confusion_matrix.
confusion_matrix(
    convert_labels(candidate_dfs['dev'].curated_dsh.values, 'onezero',
                   'categorical'),
    convert_labels(dev_predictions, 'onezero', 'categorical'))
Example #3
0
    def _train(self, train_loader, loss_fn, X_dev=None, Y_dev=None):
        """The internal training routine called by train() after initial setup

        Runs the optimizer over train_loader for n_epochs, optionally scoring
        on the dev set, checkpointing the best model, and stepping the
        learning-rate scheduler each epoch.

        Args:
            train_loader: a torch DataLoader of X (data) and Y (labels) for
                the train split
            loss_fn: the loss function to minimize (maps *data -> loss)
            X_dev: the dev set model input
            Y_dev: the dev set target labels

        If either of X_dev or Y_dev is not provided, then no checkpointing or
        evaluation on the dev set will occur.
        """
        train_config = self.config["train_config"]
        evaluate_dev = X_dev is not None and Y_dev is not None

        # Set the optimizer
        optimizer_config = train_config["optimizer_config"]
        optimizer = self._set_optimizer(optimizer_config)

        # Set the lr scheduler
        scheduler_config = train_config["scheduler_config"]
        lr_scheduler = self._set_scheduler(scheduler_config, optimizer)

        # Create the checkpointer if applicable
        if evaluate_dev and train_config["checkpoint"]:
            checkpoint_config = train_config["checkpoint_config"]
            model_class = type(self).__name__
            checkpointer = Checkpointer(
                model_class, **checkpoint_config, verbose=self.config["verbose"]
            )

        # Train the model
        for epoch in range(train_config["n_epochs"]):
            epoch_loss = 0.0
            for data in train_loader:
                # Zero the parameter gradients
                optimizer.zero_grad()

                # Forward pass to calculate outputs
                loss = loss_fn(*data)
                if torch.isnan(loss):
                    msg = "Loss is NaN. Consider reducing learning rate."
                    raise Exception(msg)

                # Backward pass to calculate gradients
                loss.backward()

                # TODO: restore this once it has unit tests
                # Clip gradients
                # if grad_clip:
                #     torch.nn.utils.clip_grad_norm(
                #        self.net.parameters(), grad_clip)

                # Perform optimizer step
                optimizer.step()

                # Keep running sum of losses
                # NOTE(review): batch losses are summed without weighting by
                # batch size, then divided by the dataset size below — this
                # assumes loss_fn returns a per-batch *sum* (not a mean);
                # confirm against loss_fn's definition.
                epoch_loss += loss.detach()

            # Calculate average loss per training example
            # Saving division until this stage protects against the potential
            # mistake of averaging batch losses when the last batch is an orphan
            train_loss = epoch_loss / len(train_loader.dataset)

            # Checkpoint performance on dev
            # dev_score is always bound before first use below because
            # epoch 0 satisfies (0 % validation_freq == 0).
            if evaluate_dev and (epoch % train_config["validation_freq"] == 0):
                val_metric = train_config["validation_metric"]
                dev_score = self.score(
                    X_dev, Y_dev, metric=val_metric, verbose=False
                )
                if train_config["checkpoint"]:
                    checkpointer.checkpoint(self, epoch, dev_score)

            # Apply learning rate scheduler
            # The scheduler is frozen (not stepped) for the first
            # lr_freeze epochs.
            if (
                lr_scheduler is not None
                and epoch + 1 >= scheduler_config["lr_freeze"]
            ):
                if scheduler_config["scheduler"] == "reduce_on_plateau":
                    # reduce_on_plateau needs a metric; only step when a dev
                    # score exists.
                    if evaluate_dev:
                        lr_scheduler.step(dev_score)
                else:
                    lr_scheduler.step()

            # Report progress
            if self.config["verbose"] and (
                epoch % train_config["print_every"] == 0
                or epoch == train_config["n_epochs"] - 1
            ):
                msg = f"[E:{epoch}]\tTrain Loss: {train_loss:.3f}"
                if evaluate_dev:
                    msg += f"\tDev score: {dev_score:.3f}"
                print(msg)

        # Restore best model if applicable
        if evaluate_dev and train_config["checkpoint"]:
            checkpointer.restore(model=self)

        if self.config["verbose"]:
            print("Finished Training")

            if evaluate_dev:
                Y_p_dev = self.predict(X_dev)

                if not self.multitask:
                    print("Confusion Matrix (Dev)")
                    confusion_matrix(Y_p_dev, Y_dev, pretty_print=True)
# baseline majority:
# (Dead code kept for reference: a majority-vote baseline over L_test.)
"""
print(L_test)
Y_baseline = []
for row in L_test:
    print(row)
    Y_baseline.append(Counter(row).most_common()[0][0])
print(np.asarray(Y_baseline))
"""

# Predict hard labels for the test-split label matrix (Ls[2]) and pickle
# them to disk.
# NOTE(review): the file handle is never closed — consider a `with` block.
Y_tes = label_model.predict(Ls[2])
pickling_on = open("data_encompassing/ar/ar_{}{}".format(flag0, flag), "wb")
print(Y_tes, type(Y_tes))
pickle.dump(Y_tes, pickling_on)

# NOTE(review): Y_dev_p is not defined in this chunk — presumably the dev
# predictions computed earlier; confirm before running standalone.
cm = confusion_matrix(Ys[1], Y_dev_p)

try:
    from metal.contrib.visualization.analysis import (
        plot_predictions_histogram,
        plot_probabilities_histogram,
    )

    plot_predictions_histogram(Y_tes, Ys[2], title="Label Distribution")

    Y_dev_ps = label_model.predict_proba(Ls[2])
    plot_probabilities_histogram(Y_dev_ps[:, 0],
                                 title="Probablistic Label Distribution")

except ModuleNotFoundError:
    print(
Example #5
0
    def train(self, X_train, Y_train, X_dev=None, Y_dev=None, **kwargs):
        """Train the model on (X_train, Y_train), optionally scoring on dev.

        Args:
            X_train: training inputs accepted by self.forward
            Y_train: training labels (converted to torch via self._to_torch)
            X_dev: optional dev-set inputs for per-epoch evaluation
            Y_dev: optional dev-set labels; dev evaluation only happens when
                both X_dev and Y_dev are provided
            **kwargs: config overrides merged into self.config
        """
        self.config = recursive_merge_dicts(self.config, kwargs)
        train_config = self.config['train_config']

        Y_train = self._to_torch(Y_train)
        Y_dev = self._to_torch(Y_dev)

        if train_config['use_cuda']:
            raise NotImplementedError
            # TODO: fix this
            # X = X.cuda(self.gpu_id)
            # Y = Y.cuda(self.gpu_id)
            # TODO: put model on gpu

        # Make data loaders
        loader_config = train_config['data_loader_config']
        train_loader = self._make_data_loader(X_train, Y_train, loader_config)
        evaluate_dev = (X_dev is not None and Y_dev is not None)

        # Set the optimizer
        optimizer_config = train_config['optimizer_config']
        optimizer = self._set_optimizer(optimizer_config)

        # Set the lr scheduler
        scheduler_config = train_config['scheduler_config']
        lr_scheduler = self._set_scheduler(scheduler_config, optimizer)

        # Initialize the model
        self.reset()

        # Train the model
        for epoch in range(train_config['n_epochs']):
            epoch_loss = 0.0
            for i, data in enumerate(train_loader):
                X, Y = data

                # Zero the parameter gradients
                optimizer.zero_grad()

                # Forward pass to calculate outputs
                output = self.forward(X)
                loss = self._get_loss(output, Y)

                # Backward pass to calculate gradients
                loss.backward()

                # Clip gradients
                # if grad_clip:
                #     torch.nn.utils.clip_grad_norm(
                #        self.net.parameters(), grad_clip)

                # Perform optimizer step
                optimizer.step()

                # Keep running sum of losses
                # Weighted by batch size so the final division yields a true
                # per-example average even when _get_loss returns a mean.
                epoch_loss += loss.detach() * X.shape[0]

            # Calculate average loss per training example
            # Saving division until this stage protects against the potential
            # mistake of averaging batch losses when the last batch is an orphan
            train_loss = epoch_loss / len(train_loader.dataset)

            # dev_score is bound on every epoch when evaluate_dev is True,
            # so its uses below are safe.
            if evaluate_dev:
                val_metric = train_config['validation_metric']
                dev_score = self.score(X_dev, Y_dev, metric=val_metric, 
                    verbose=False)
            
            # Apply learning rate scheduler
            # The scheduler is frozen (not stepped) for the first lr_freeze
            # epochs; reduce_on_plateau requires a dev score to step.
            if (lr_scheduler is not None 
                and epoch + 1 >= scheduler_config['lr_freeze']):
                if scheduler_config['scheduler'] == 'reduce_on_plateau':
                    if evaluate_dev:
                        lr_scheduler.step(dev_score)
                else:
                    lr_scheduler.step()

            # Report progress
            if (self.config['verbose'] and 
                (epoch % train_config['print_every'] == 0 
                or epoch == train_config['n_epochs'] - 1)):
                msg = f'[E:{epoch+1}]\tTrain Loss: {train_loss:.3f}'
                if evaluate_dev:
                    msg += f'\tDev score: {dev_score:.3f}'
                print(msg)

        if self.config['verbose']:
            print('Finished Training')
            
            if self.config['show_plots']:
                # For binary problems, plot the training-set probability
                # histogram (column 0 — presumably P(label == 1); verify).
                if self.k == 2:
                    Y_p_train = self.predict_proba(X_train)
                    plot_probabilities_histogram(Y_p_train[:, 0], 
                        title="Training Set Predictions")

            if X_dev is not None and Y_dev is not None:
                Y_ph_dev = self.predict(X_dev)

                print("Confusion Matrix (Dev)")
                mat = confusion_matrix(Y_ph_dev, Y_dev, pretty_print=True)