Example 1
    def _evaluate(self, dataloader):
        if self._trainer is None:
            raise RuntimeError("Must call fit first")
        if inspect.isclass(self._loss) and issubclass(self._loss, TLoss):
            # it is the loss class
            criterion = self._loss()
        elif isinstance(self._loss, TLoss):
            # it is the loss instance
            criterion = self._loss
        elif callable(self._loss):
            # it is the loss creator function
            criterion = self._loss({})
        else:
            raise ValueError(
                "loss must be a loss class, instance, or creator function")

        model = self.get_model()
        model.eval()
        metric_meters = AverageMeterCollection()

        with torch.no_grad():
            for batch_idx, batch in enumerate(dataloader):
                batch_info = {"batch_idx": batch_idx}
                # unpack features into list to support multiple inputs model
                *features, target = batch
                output = model(*features)
                loss = criterion(output, target)
                num_samples = target.size(0)
                metrics = {"val_loss": loss.item(), "num_samples": num_samples}
                metric_meters.update(metrics)

        return metric_meters.summary()
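
The class/instance/factory dispatch at the top of ``_evaluate`` is easier to see in isolation. Below is a minimal standalone sketch of the same pattern; ``resolve_loss`` is a hypothetical helper name, and ``nn.Module`` stands in for the ``TLoss`` alias used in the source.

    import inspect
    import torch.nn as nn

    def resolve_loss(loss):
        """Build a criterion from a loss class, instance, or creator function."""
        if inspect.isclass(loss) and issubclass(loss, nn.Module):
            return loss()        # a loss class, e.g. nn.MSELoss
        if isinstance(loss, nn.Module):
            return loss          # an already-constructed instance
        if callable(loss):
            return loss({})      # a creator function taking a config dict
        raise ValueError("loss must be a class, instance, or creator function")

    criterion = resolve_loss(nn.MSELoss)  # equivalent to nn.MSELoss()
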
Example 2
    def validate(self, val_iterator, info):
        """Runs one standard validation pass over the val_iterator.

        This will call ``model.eval()`` and ``torch.no_grad`` when iterating
        over the validation dataloader.

        If you override this method, you can access the model and criterion
        via ``self.model`` and ``self.criterion``, and you do not need to
        call ``validate_batch``.

        Args:
            val_iterator (iter): Iterable constructed from the
                validation dataloader.
            info (dict): Dictionary of information to be used for custom
                validation operations.

        Returns:
            A dict of metrics from the evaluation.
                By default, returns "val_accuracy" and "val_loss",
                which are computed by aggregating the "loss" and "correct"
                values from ``validate_batch`` and dividing by the sum of
                ``num_samples`` from all calls to ``self.validate_batch``.
        """
        metric_meters = AverageMeterCollection()

        # switch to evaluate mode
        self.model.eval()
        with torch.no_grad():
            for batch_idx, batch in enumerate(val_iterator):
                batch_info = {"batch_idx": batch_idx}
                batch_info.update(info)
                metrics = self.validate_batch(batch, batch_info)
                metric_meters.update(metrics, n=metrics.pop(NUM_SAMPLES, 1))

        return metric_meters.summary()
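
For reference, ``AverageMeterCollection`` keeps a sample-weighted running average per metric key. A simplified reimplementation (for illustration only, not Ray's actual class) behaves like this:

    from collections import defaultdict

    class TinyMeterCollection:
        """Toy stand-in for AverageMeterCollection's weighted averaging."""
        def __init__(self):
            self._sums = defaultdict(float)
            self._counts = defaultdict(int)

        def update(self, metrics, n=1):
            for key, value in metrics.items():
                self._sums[key] += value * n   # weight each batch by n samples
                self._counts[key] += n

        def summary(self):
            return {k: self._sums[k] / self._counts[k] for k in self._sums}

    meters = TinyMeterCollection()
    meters.update({"val_loss": 0.5}, n=10)
    meters.update({"val_loss": 0.3}, n=30)
    assert abs(meters.summary()["val_loss"] - 0.35) < 1e-9  # (5 + 9) / 40
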
Example 3
    def train_epoch(self, iterator, info):
        meter_collection = AverageMeterCollection()
        iter_tput = []
        model = self.model
        optimizer = self.optimizer
        device = 0  # first GPU when use_gpu is set
        for step, (input_nodes, seeds, blocks) in enumerate(iterator):
            tic_step = time.time()
            if self.use_gpu:
                blocks = [block.int().to(device) for block in blocks]
            batch_inputs = blocks[0].srcdata["features"]
            batch_labels = blocks[-1].dstdata["labels"]
            batch_pred = model(blocks, batch_inputs)
            loss = F.nll_loss(batch_pred, batch_labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            iter_tput.append(len(seeds) / (time.time() - tic_step))
            # Record the loss so that summary() reports real metrics.
            meter_collection.update({"train_loss": loss.item()}, n=len(seeds))
            if step % 20 == 0:
                acc = compute_acc(batch_pred, batch_labels)
                gpu_mem_alloc = torch.cuda.max_memory_allocated(
                ) / 1000000 if torch.cuda.is_available() else 0
                print("Epoch {:05d} | Step {:05d} | Loss {:.4f} | "
                      "Train Acc {:.4f} | Speed (samples/sec) {:.4f} | GPU "
                      "{:.1f} MB".format(info["epoch_idx"] + 1, step,
                                         loss.item(), acc.item(),
                                         np.mean(iter_tput[3:]),
                                         gpu_mem_alloc))
        status = meter_collection.summary()
        return status
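
The loop assumes a ``compute_acc`` helper from the surrounding module. A plausible minimal version (the repository's actual helper may differ) is:

    import torch

    def compute_acc(pred, labels):
        """Fraction of rows whose argmax matches the integer label."""
        return (torch.argmax(pred, dim=1) == labels).float().mean()
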
Example 4
    def evaluate(self, df):
        super(TorchEstimator, self).evaluate(df)
        if self._trainer is None:
            raise RuntimeError("Must call fit first")
        pdf = df.toPandas()
        dataset = PandasDataset(pdf, self._feature_columns,
                                self._feature_shapes, self._feature_types,
                                self._label_column, self._label_type)
        dataloader = torch.utils.data.DataLoader(dataset,
                                                 self._batch_size,
                                                 shuffle=self._shuffle)

        if inspect.isclass(self._loss) and issubclass(self._loss, TLoss):
            # it is the loss class
            criterion = self._loss()
        elif isinstance(self._loss, TLoss):
            # it is the loss instance
            criterion = self._loss
        elif callable(self._loss):
            # it is the loss creator function
            criterion = self._loss({})
        else:
            raise ValueError(
                "loss must be a loss class, instance, or creator function")

        model = self.get_model()
        model.eval()
        metric_meters = AverageMeterCollection()

        with torch.no_grad():
            for batch_idx, batch in enumerate(dataloader):
                batch_info = {"batch_idx": batch_idx}
                # unpack features into list to support multiple inputs model
                *features, target = batch
                output = model(*features)
                loss = criterion(output, target)
                _, predicted = torch.max(output.data, 1)
                num_correct = (predicted == target).sum().item()
                num_samples = target.size(0)
                metrics = {
                    "val_loss": loss.item(),
                    "val_accuracy": num_correct / num_samples,
                    "num_samples": num_samples
                }
                metric_meters.update(metrics)

        return metric_meters.summary()
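
The top-1 accuracy computation inside the loop is worth isolating. A self-contained sketch with toy logits:

    import torch

    output = torch.tensor([[0.1, 2.0], [3.0, 0.2], [0.4, 0.5]])  # toy logits
    target = torch.tensor([1, 0, 0])
    _, predicted = torch.max(output, 1)           # predicted class per row
    num_correct = (predicted == target).sum().item()
    accuracy = num_correct / target.size(0)       # 2 / 3
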
Example 5
    def validate(self, val_iterator, info):
        self.model.zero_grad()
        self.model.eval()

        torch.set_grad_enabled(False)

        model = self.get_model()
        if self.is_function_implemented("on_validation_epoch_start", model):
            model.on_validation_epoch_start()

        val_outputs = []
        for batch_idx, batch in enumerate(val_iterator):
            batch_info = {"batch_idx": batch_idx}
            batch_info.update(info)
            batch_output = self.validate_batch(batch, batch_info)
            if batch_output is not None:
                val_outputs.append(batch_output)

        processed_outputs = None
        if self.is_overridden("validation_epoch_end", model):
            raw_outputs = [vo["raw_output"] for vo in val_outputs]
            processed_outputs = model.validation_epoch_end(raw_outputs)

        if processed_outputs is not None:
            if isinstance(processed_outputs, torch.Tensor):
                return_output = {"val_loss": processed_outputs}
            elif isinstance(processed_outputs, Result):
                raise ValueError("Result objects are not supported. Please "
                                 "return a dictionary instead.")
            elif isinstance(processed_outputs, dict):
                return_output = processed_outputs
            else:
                raise TypeError("validation_epoch_end returned an invalid "
                                "type. It must return a Tensor or a dict.")
        else:
            # User did not override validation_epoch_end
            assert isinstance(val_outputs, list)
            # Use AverageMeterCollection util to reduce results.
            meter_collection = AverageMeterCollection()
            for v in val_outputs:
                num_samples = v.pop(NUM_SAMPLES, 1)
                raw_output = v["raw_output"]
                if isinstance(raw_output, dict):
                    meter_collection.update(raw_output, num_samples)
                elif isinstance(raw_output, torch.Tensor):
                    meter_collection.update({
                        "val_loss": raw_output.item()
                    }, num_samples)
            return_output = meter_collection.summary()

        if self.is_function_implemented("on_validation_epoch_end", model):
            model.on_validation_epoch_end()

        # Set back to True so training will work.
        torch.set_grad_enabled(True)

        return return_output
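
The Tensor-or-dict normalization of ``processed_outputs`` can be factored out. A standalone sketch (hypothetical helper name; the ``Result`` branch is omitted):

    import torch

    def normalize_epoch_output(processed, key="val_loss"):
        """Coerce an epoch-end hook's return value into a metrics dict."""
        if isinstance(processed, torch.Tensor):
            return {key: processed}
        if isinstance(processed, dict):
            return processed
        raise TypeError("epoch-end hook must return a Tensor or a dict")

    out = normalize_epoch_output(torch.tensor(0.25))  # {"val_loss": tensor(0.25)}
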
Example 6
    def validate(self, val_iterator, info=None):
        """Runs one standard validation pass over the val_iterator.

        This will call ``model.eval()`` and ``torch.no_grad`` when iterating
        over the validation dataloader.

        You also do not need to call ``validate_batch`` if overriding this
        method.

        Args:
            val_iterator (iter): Iterable constructed from the
                validation dataloader.
            info (Optional[dict]): Dictionary of information to be used for
                custom validation operations.

        Returns:
            A dict of metrics from the evaluation.
                By default, returns "val_accuracy" and "val_loss",
                which are computed by aggregating the "loss" and "correct"
                values from ``validate_batch`` and dividing by the sum of
                ``num_samples`` from all calls to ``self.validate_batch``.
        """
        if not hasattr(self, "model"):
            raise RuntimeError("Either set self.model in setup function or "
                               "override this method to implement a custom "
                               "validation loop.")

        info = info or {}
        model = self.model
        metric_meters = AverageMeterCollection()

        # switch to evaluate mode
        model.eval()
        with torch.no_grad():
            for batch_idx, batch in enumerate(val_iterator):
                batch_info = {"batch_idx": batch_idx}
                batch_info.update(info)
                metrics = self.validate_batch(batch, batch_info)
                metric_meters.update(metrics, n=metrics.pop(NUM_SAMPLES, 1))

        return metric_meters.summary()
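
A ``validate_batch`` compatible with this loop returns a metrics dict whose ``num_samples`` entry weights the running averages. A hedged sketch of such a method for an operator subclass (the library's actual default may differ):

    import torch

    def validate_batch(self, batch, batch_info):
        features, target = batch
        with torch.no_grad():
            output = self.model(features)
            loss = self.criterion(output, target)
        _, predicted = torch.max(output, 1)
        return {
            "val_loss": loss.item(),
            "val_accuracy": (predicted == target).float().mean().item(),
            "num_samples": target.size(0),  # weight for the meters
        }
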
Example 7
    def train_epoch(self, iterator, info):
        """Runs one standard training pass over the training dataloader.

        By default, this method will iterate over the given iterator and
        call ``self.train_batch`` over each batch. If ``scheduler_step_freq``
        is set, this default method will also step the scheduler accordingly.

        You do not need to call ``train_batch`` in this method if you plan
        to implement a custom optimization/training routine here.

        You may find ``ray.util.sgd.utils.AverageMeterCollection`` useful
        when overriding this method. See example below:

        .. code-block:: python

            def train_epoch(self, ...):
                meter_collection = AverageMeterCollection()
                self.model.train()
                for batch in iterator:
                    # do some processing
                    metrics = {"metric_1": 1, "metric_2": 3} # dict of metrics

                    # This keeps track of all metrics across multiple batches
                    meter_collection.update(metrics, n=len(batch))

                # Returns stats of the meters.
                stats = meter_collection.summary()
                return stats


        Args:
            iterator (iter): Iterator over the training data for the entire
                epoch. This iterator is expected to be entirely consumed.
            info (dict): Dictionary for information to be used for custom
                training operations.

        Returns:
            A dict of metrics from training.
        """
        if not hasattr(self, "model"):
            raise RuntimeError("Either set self.model in setup function or "
                               "override this method to implement a custom "
                               "training loop.")
        model = self.model
        scheduler = None
        if hasattr(self, "scheduler"):
            scheduler = self.scheduler

        if self.use_tqdm and self.world_rank == 0:
            desc = ""
            if info is not None and "epoch_idx" in info:
                if "num_epochs" in info:
                    desc = f"{info['epoch_idx'] + 1}/{info['num_epochs']}e"
                else:
                    desc = f"{info['epoch_idx'] + 1}e"

            # TODO: Implement len for Dataset?
            total = info[NUM_STEPS]
            if total is None:
                if hasattr(iterator, "__len__"):
                    total = len(iterator)

            _progress_bar = tqdm(total=total,
                                 desc=desc,
                                 unit="batch",
                                 leave=False)

        metric_meters = AverageMeterCollection()

        model.train()
        for batch_idx, batch in enumerate(iterator):
            batch_info = {
                "batch_idx": batch_idx,
                "global_step": self.global_step
            }
            batch_info.update(info)
            metrics = self.train_batch(batch, batch_info=batch_info)

            if self.use_tqdm and self.world_rank == 0:
                _progress_bar.n = batch_idx + 1
                postfix = {}
                if "train_loss" in metrics:
                    postfix.update(loss=metrics["train_loss"])
                _progress_bar.set_postfix(postfix)

            if scheduler and self.scheduler_step_freq == SCHEDULER_STEP_BATCH:
                scheduler.step()

            metric_meters.update(metrics, n=metrics.pop(NUM_SAMPLES, 1))
            self.global_step += 1

        if scheduler and self.scheduler_step_freq == SCHEDULER_STEP_EPOCH:
            scheduler.step()

        return metric_meters.summary()
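
The matching ``train_batch`` contract: return a metrics dict, optionally including ``num_samples`` so the meters weight each batch correctly. A minimal sketch under those assumptions (field names inferred from the loop above):

    def train_batch(self, batch, batch_info):
        features, target = batch
        self.optimizer.zero_grad()
        output = self.model(features)
        loss = self.criterion(output, target)
        loss.backward()
        self.optimizer.step()
        return {"train_loss": loss.item(), "num_samples": target.size(0)}
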
Example 8
    def train_epoch(self, iterator, info):
        model = self.get_model()

        # Enable train mode.
        model.train()

        # Enable gradients.
        torch.set_grad_enabled(True)

        if self.is_function_implemented("on_train_epoch_start", model):
            model.on_train_epoch_start()

        if self.use_tqdm and self.world_rank == 0:
            desc = ""
            if info is not None and "epoch_idx" in info:
                if "num_epochs" in info:
                    desc = f"{info['epoch_idx'] + 1}/{info['num_epochs']}e"
                else:
                    desc = f"{info['epoch_idx'] + 1}e"

            # TODO: Implement len for Dataset?
            total = info[NUM_STEPS]
            if total is None:
                if hasattr(iterator, "__len__"):
                    total = len(iterator)

            _progress_bar = tqdm(total=total,
                                 desc=desc,
                                 unit="batch",
                                 leave=False)

        # Output for each batch.
        epoch_outputs = []

        for batch_idx, batch in enumerate(iterator):
            batch_info = {
                "batch_idx": batch_idx,
                "global_step": self.global_step
            }
            batch_info.update(info)
            batch_output = self.train_batch(batch, batch_info=batch_info)
            # batch output for each optimizer.
            epoch_outputs.append(batch_output)

            should_stop = batch_output["signal"] == -1

            if self.use_tqdm and self.world_rank == 0:
                _progress_bar.n = batch_idx + 1
                postfix = {}
                if "training_loss" in batch_output:
                    postfix.update(loss=batch_output["training_loss"])
                _progress_bar.set_postfix(postfix)

            for s_dict, scheduler in zip(self.scheduler_dicts,
                                         self.schedulers):
                if s_dict["interval"] == SCHEDULER_STEP_BATCH:
                    scheduler.step()

            self.global_step += 1

            if should_stop:
                break

        processed_outputs = None
        if is_overridden("training_epoch_end", model):
            raw_outputs = [eo["raw_output"] for eo in epoch_outputs]
            processed_outputs = model.training_epoch_end(raw_outputs)

        if processed_outputs is not None:
            if isinstance(processed_outputs, torch.Tensor):
                return_output = {"train_loss": processed_outputs}
            elif isinstance(processed_outputs, Result):
                raise ValueError("Result objects are not supported. Please "
                                 "return a dictionary instead.")
            elif isinstance(processed_outputs, dict):
                return_output = processed_outputs
            else:
                raise TypeError("training_epoch_end returned an invalid "
                                "type. It must return a Tensor or a dict.")
        else:
            # User did not override training_epoch_end
            assert isinstance(epoch_outputs, list)
            # Use AverageMeterCollection util to reduce results.
            meter_collection = AverageMeterCollection()
            for o in epoch_outputs:
                num_samples = o.pop(NUM_SAMPLES, 1)
                raw_output = o["raw_output"]
                if isinstance(raw_output, dict):
                    meter_collection.update(raw_output, num_samples)
                elif isinstance(raw_output, torch.Tensor):
                    meter_collection.update({"train_loss": o["training_loss"]},
                                            num_samples)
            return_output = meter_collection.summary()

        if self.is_function_implemented("on_train_epoch_end", model):
            model.on_train_epoch_end(
                [eo.get("raw_output") for eo in epoch_outputs])

        for s_dict, scheduler in zip(self.scheduler_dicts, self.schedulers):
            if s_dict["interval"] == SCHEDULER_STEP_EPOCH:
                scheduler.step()

        return return_output
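
The ``signal`` convention deserves a note: ``train_batch`` reports ``"signal" == -1`` to request an early end to the epoch. A minimal reproduction of that control flow (a sketch, not the library's exact protocol):

    def run_epoch(batches, train_batch):
        for batch in batches:
            output = train_batch(batch)
            if output.get("signal") == -1:  # e.g. divergence or user abort
                break
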
Example 9
    def train_epoch(self, iterator, info):
        """Runs one standard training pass over the training dataloader.

        By default, this method will iterate over the given iterator and
        call ``self.train_batch`` over each batch. If ``scheduler_step_freq``
        is set, this default method will also step the scheduler accordingly.

        You do not need to call ``train_batch`` in this method if you plan
        to implement a custom optimization/training routine here.

        You may find ``ray.util.sgd.utils.AverageMeterCollection`` useful
        when overriding this method. See example below:

        .. code-block:: python

            def train_epoch(self, ...):
                meter_collection = AverageMeterCollection()
                self.model.train()
                for batch in iterator:
                    # do some processing
                    metrics = {"metric_1": 1, "metric_2": 3} # dict of metrics

                    # This keeps track of all metrics across multiple batches
                    meter_collection.update(metrics, n=len(batch))

                # Returns stats of the meters.
                stats = meter_collection.summary()
                return stats


        Args:
            iterator (iter): Iterator over the training data for the entire
                epoch. This iterator is expected to be entirely consumed.
            info (dict): Dictionary for information to be used for custom
                training operations.

        Returns:
            A dict of metrics from training.
        """
        for r in self.reporters:
            r.on_epoch_begin(info, self)

        metric_meters = AverageMeterCollection()

        self.model.train()
        for batch_idx, batch in enumerate(iterator):
            batch_info = {
                "batch_idx": batch_idx,
                "global_step": self.global_step
            }
            batch_info.update(info)
            metrics = self.train_batch(batch, batch_info=batch_info)

            for r in self.reporters:
                r.on_batch_end(batch_info, metrics, self)

            if self.scheduler and batch_info.get(
                    SCHEDULER_STEP) == SCHEDULER_STEP_BATCH:
                self.scheduler.step()

            metric_meters.update(metrics, n=metrics.pop(NUM_SAMPLES, 1))
            self.global_step += 1

        if self.scheduler and info.get(SCHEDULER_STEP) == SCHEDULER_STEP_EPOCH:
            self.scheduler.step()

        return metric_meters.summary()
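
The reporter hooks called here (``on_epoch_begin`` / ``on_batch_end``) imply a small observer interface. A hypothetical reporter, with the method names and arguments inferred from the call sites above:

    class PrintReporter:
        """Logs progress; any object with these two hooks would work."""
        def on_epoch_begin(self, info, operator):
            print(f"starting epoch {info.get('epoch_idx', '?')}")

        def on_batch_end(self, batch_info, metrics, operator):
            if batch_info["batch_idx"] % 50 == 0:
                print(batch_info["batch_idx"], metrics.get("train_loss"))
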
Example 10
    def validate(self, validation_loader, info):
        meter_collection = AverageMeterCollection()
        model = self.model
        n_layers = self.config["n_layers"]
        n_hidden = self.config["n_hidden"]
        n_heads = self.config["n_heads"]
        batch_size = self.config["batch_size"]
        num_workers = self.config["sampling_num_workers"]
        g = self.g
        val_nid = self.val_nid
        test_nid = self.test_nid
        device = 0
        model.eval()
        with torch.no_grad():
            x = g.ndata["features"]
            for i, layer in enumerate(self.convs):
                if i < n_layers - 1:
                    y = torch.zeros(
                        g.number_of_nodes(), n_hidden * n_heads
                        if i != len(self.convs) - 1 else self.n_classes)
                else:
                    y = torch.zeros(
                        g.number_of_nodes(), n_hidden
                        if i != len(self.convs) - 1 else self.n_classes)
                sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
                collator = NodeCollator(g, torch.arange(g.number_of_nodes()),
                                        sampler)
                dataloader = DataLoader(collator.dataset,
                                        collate_fn=collator.collate,
                                        batch_size=batch_size,
                                        shuffle=False,
                                        drop_last=False,
                                        num_workers=num_workers)
                for input_nodes, output_nodes, blocks in dataloader:
                    block = blocks[0]
                    block = block.int().to(device)
                    h = x[input_nodes].to(device)
                    h_dst = x[output_nodes].to(device)
                    if i != len(self.convs) - 1:
                        h = layer(block, (h, h_dst)).flatten(1)
                    else:
                        h = layer(block, (h, h_dst)).mean(1)
                        h = h.log_softmax(dim=-1)
                    y[output_nodes] = h.cpu()
                x = y
            pred = y
        labels = g.ndata["labels"]
        val_acc = compute_acc(pred[val_nid], labels[val_nid])
        test_acc = compute_acc(pred[test_nid], labels[test_nid])

        metrics = {
            "num_samples": pred.size(0),
            "val_acc": val_acc.item(),
            "test_acc": test_acc.item()
        }
        meter_collection.update(metrics, n=metrics.pop("num_samples", 1))
        status = meter_collection.summary()
        return status
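
The inference loop above evaluates one GNN layer at a time over the full graph, writing each layer's output into a buffer that becomes the next layer's input. A miniature, framework-free sketch of that pattern (the shapes and Linear layers are illustrative only):

    import torch

    num_nodes, hidden = 8, 4
    x = torch.randn(num_nodes, hidden)             # layer input for all nodes
    layers = [torch.nn.Linear(hidden, hidden) for _ in range(2)]
    with torch.no_grad():
        for layer in layers:
            y = torch.zeros(num_nodes, hidden)     # output buffer for this layer
            for start in range(0, num_nodes, 4):   # process nodes in mini-batches
                idx = torch.arange(start, min(start + 4, num_nodes))
                y[idx] = layer(x[idx])
            x = y                                  # next layer reads the buffer
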