Example 1
    def update(self, preds: torch.Tensor, target: torch.Tensor):
        do_print = self.print_every is not None and not self.counter % self.print_every
        if do_print:
            utils.hprint(f'BetterAccuracy is set to print every {self.print_every} and the counter is at {self.counter}:')
            print(f"BetterAccuracy: preds: \n{preds}")
            print(f"BetterAccuracy: target: \n{target}")
            print()
        self.counter += 1

        # preds, target = _input_format_classification(preds, target, self.threshold)
        assert preds.shape == target.shape, f'preds.shape = {preds.shape} != target.shape = {target.shape}'

        preds = preds.argmax(dim=1)
        target = target.argmax(dim=1)

        if do_print:
            print(f"BetterAccuracy: preds post argmax: \n{preds}")
            print(f"BetterAccuracy: target post argmax: \n{target}")
            print()

        assert target.dim() == 1, f'got target of shape {target.shape}'

        eqs = preds.eq(target)

        if do_print:
            print(
                f"BetterAccuracy: new_correct: {eqs.sum()}, "
                f" numel: {target.numel()}, "
                f" shape[0]: {target.shape[0]}, "
                f" ignore: {target.eq(self.Y_VALUE_TO_IGNORE).sum()}"
            )

        self.correct = self.correct + torch.sum(eqs)
        self.total = self.total + target.shape[0]
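
The snippet above is only the update step. A minimal sketch of the enclosing class, assuming it follows the torchmetrics.Metric state pattern implied by self.correct and self.total (anything not present in the snippet, such as the Y_VALUE_TO_IGNORE default, is illustrative):

import torch
import torchmetrics


class BetterAccuracy(torchmetrics.Metric):
    Y_VALUE_TO_IGNORE = -100  # assumed sentinel label, not taken from the snippet

    def __init__(self, print_every=None):
        super().__init__()
        self.print_every = print_every
        self.counter = 0
        # torchmetrics reduces these states across processes/devices
        self.add_state("correct", default=torch.tensor(0), dist_reduce_fx="sum")
        self.add_state("total", default=torch.tensor(0), dist_reduce_fx="sum")

    # def update(self, preds, target): ...  (as shown above)

    def compute(self) -> torch.Tensor:
        return self.correct.float() / self.total
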
Example 2
    def on_train_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
        field_to_class_counts = self.get_field_to_class_counts(dataloader=pl_module.train_dataloader())
        if self.verbose:
            utils.hprint('ClassCounterCallback Class Counts:')
            utils.print_dict(field_to_class_counts)
            print()

        if self.hp is None:
            if self.verbose:
                print('  Not setting head_params.pos_class_weights because no hp was passed to __init__')
        else:
            if self.hp.type != 'weighted':
                raise NotImplementedError(
                    f'hp == {self.hp} but this is only implemented for WeightedHeadParams'
                )
            for field_name, class_counts_df in field_to_class_counts.items():
                if field_name not in self.field_name_to_head_name:
                    # we might not be using all fields in heads
                    continue
                head_name = self.field_name_to_head_name[field_name]
                head = pl_module.head.heads[head_name]

                if head.did_set_pos_class_weights:
                    pos_class_weights = head.pos_class_weights
                    if self.verbose:
                        weights_str = ', '.join([f'{e:.2f}' for e in pos_class_weights])
                        print(f'  head_params["{field_name}"].pos_class_weights was already set to [{weights_str}]')
                        print()

                else:
                    pos_class_weights = class_counts_df.loc[self.INV_PORTIONS].values
                    max_inds = np.where(pos_class_weights > self.max_pos_class_weight)[0]
                    pos_class_weights[max_inds] = self.max_pos_class_weight

                    head.set_pos_class_weights(
                        torch.tensor(pos_class_weights, dtype=torch.float, device=pl_module.device)
                    )
                    if self.verbose:
                        weights_str = ', '.join([f'{e:.2f}' for e in pos_class_weights])
                        print(f'  Setting head_params["{field_name}"].pos_class_weights = [{weights_str}]')
                        print()

        pl_module.log_lossmetrics_dict(
            phase=utils.Phase.train,
            d={self.CLASS_COUNTS: field_to_class_counts},
            do_log_to_progbar=False,
        )
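
A hedged usage sketch for the callback above: it would normally be registered on the Trainer next to the module whose heads it configures. The constructor arguments shown are assumptions, not taken from the snippet:

import pytorch_lightning as pl

# hypothetical constructor arguments; the real signature is not shown above
class_counter = ClassCounterCallback(
    hp=hp.head,
    field_name_to_head_name=field_name_to_head_name,
    verbose=True,
)
trainer = pl.Trainer(callbacks=[class_counter], max_epochs=hp.opt.num_epochs)
trainer.fit(pl_module, datamodule=dm)
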
Example 3
    def run(self, fast_dev_run=False, use_gpus=False, log_to_file=False):
        search_dict = self.search_params.to_ray_tune_search_dict()
        # see tune.utils.UtilMonitor
        search_dict['log_sys_usage'] = True

        # noinspection PyTypeChecker
        analysis = tune.run(
            run_or_experiment=self._get_train_fn(fast_dev_run=fast_dev_run,
                                                 include_gpus=use_gpus),
            name=self.search_params.exp.get_project_exp_name(),
            stop=self.get_tune_stopper(self.search_params.opt.num_epochs),
            config=search_dict,
            resources_per_trial=self.get_resources_per_trial(
                self.search_params, include_gpu=use_gpus),
            num_samples=self.search_params.tune.num_hp_samples,
            sync_config=tune.SyncConfig(
                upload_dir=self.search_params.metrics.output_dir),
            loggers=self.get_tune_loggers(),
            log_to_file=log_to_file,
            keep_checkpoints_num=2,
            checkpoint_score_attr=f'{self.search_params.opt.search_mode}-{self.search_params.opt.search_metric}',
            fail_fast=False,
            scheduler=self.get_tune_scheduler(self.search_params),
            verbose=2,
            progress_reporter=self.get_cli_reporter(),
            reuse_actors=False,
        )

        utils.hprint("done with tune.run")

        param_hash = self.search_params.get_short_hash(num_chars=8)
        analysis_file = self.search_params.metrics.output_dir / f'tune_analysis_{param_hash}.pkl'
        print(f"Saving {analysis_file}")
        utils.save_pickle(analysis_file, analysis)

        best_trial = analysis.get_best_trial(
            self.search_params.opt.search_metric,
            self.search_params.opt.search_mode, "last-5-avg")
        print(f'best_trial.last_result: {best_trial.last_result}')
        print("Best trial config: {}".format(best_trial.config))
        print("Best trial final search_metric: {}".format(
            best_trial.last_result[self.search_params.opt.search_metric]))
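
The helpers called above (get_tune_stopper, get_resources_per_trial) are not shown. A plausible sketch, assuming the stopper simply caps reported training iterations and the resources are a plain Ray Tune resources dict; the num_cpus/num_gpus fields are assumptions:

from ray.tune.stopper import MaximumIterationStopper


def get_resources_per_trial(search_params, include_gpu=False):
    # one bundle of CPUs per trial; attach GPUs only when requested
    resources = {'cpu': search_params.data.num_cpus}
    if include_gpu:
        resources['gpu'] = search_params.data.num_gpus
    return resources


def get_tune_stopper(num_epochs):
    # end each trial once it has reported num_epochs training iterations
    return MaximumIterationStopper(max_iter=num_epochs)
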
Example 4
    def _train_fn(self,
                  config: Dict,
                  checkpoint_dir=None,
                  fast_dev_run=False,
                  include_gpus=False):
        utils.hprint('Starting train function with config:')
        utils.print_dict(config)

        del config['tune']
        hp = self._param_class.from_dict(config)
        assert isinstance(hp, self._param_class)

        if checkpoint_dir:
            # see https://docs.ray.io/en/master/tune/user-guide.html#checkpointing
            raise NotImplementedError(
                f"Got checkpoint_dir in trian_fn: {checkpoint_dir}")

        net = self._factored_lightning_module_class.from_hp(hp=hp)

        utils.set_seeds(hp.data.seed)

        # noinspection PyTypeChecker
        trainer = pl.Trainer(
            logger=torch_mod.get_pl_logger(hp=hp.exp,
                                           tune=tune,
                                           offline_mode=fast_dev_run),
            default_root_dir=tune.get_trial_dir(),
            callbacks=self.extra_pl_callbacks + self.get_tune_callbacks(),
            max_epochs=hp.opt.num_epochs,
            gpus=hp.data.num_gpus if include_gpus else None,
            weights_summary='full',
            fast_dev_run=fast_dev_run,
            accumulate_grad_batches=1,
            profiler='simple',
            deterministic=True,
            log_every_n_steps=hp.metrics.num_steps_per_metric_log,
        )
        fit_out = trainer.fit(net, datamodule=net.dm)

        utils.print_dict(config)
        utils.hprint('Done with tune_runner._train_fn')

        return fit_out
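
_get_train_fn (used by run in Example 3) is not shown; a minimal sketch, assuming it just binds the non-config arguments so that Ray Tune only has to supply config and checkpoint_dir:

import functools


def _get_train_fn(self, fast_dev_run: bool, include_gpus: bool):
    # tune.run calls the trainable as fn(config, checkpoint_dir=None);
    # bind the remaining keyword arguments up front
    return functools.partial(
        self._train_fn,
        fast_dev_run=fast_dev_run,
        include_gpus=include_gpus,
    )
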
Example 5
def run(
    net: pl.LightningModule,
    dm: pl.LightningDataModule,
    hp: ModelBertConvTransTClass.Params,
    fast_dev_run=False,
    do_find_lr=False,
):
    print("model run about to create trainer")
    trainer = pl.Trainer(
        logger=True if fast_dev_run else torch_mod.get_pl_logger(hp.exp),
        default_root_dir=hp.metrics.output_dir,
        callbacks=[metrics_mod.CounterTimerCallback()],
        max_epochs=hp.opt.num_epochs,
        gpus=hp.data.num_gpus,
        weights_summary='full',
        fast_dev_run=fast_dev_run,
        accumulate_grad_batches=1,
        profiler=True,
        deterministic=True,
        auto_lr_find=do_find_lr,
        log_every_n_steps=hp.metrics.num_steps_per_metric_log,
    )
    print("model run done creating trainer")

    if do_find_lr:
        utils.hprint("Starting trainer.tune:")
        lr_tune_out = trainer.tune(net, datamodule=dm)
        print(f'  Tune out: {lr_tune_out}')
    else:
        utils.hprint("Starting trainer.fit:")
        print(f'  Dataset file: {hp.data.get_dataset_file()}')
        trainer.fit(net, datamodule=dm)

    utils.hprint('Done with model run fn')
Example 6
    def _train_fn(self,
                  config: Dict,
                  checkpoint_dir=None,
                  fast_dev_run=False,
                  include_gpus=False):
        utils.hprint('Starting train function with config:')
        utils.print_dict(config)
        print()

        utils.set_pandas_disp(width=200)

        hp = self._model_param_class.from_dict(config)
        assert isinstance(hp, self._model_param_class)
        print('  hp:', hp)

        if checkpoint_dir:
            # see https://docs.ray.io/en/master/tune/user-guide.html#checkpointing
            raise NotImplementedError(
                f"Got checkpoint_dir in trian_fn: {checkpoint_dir}")

        utils.hprint("About to create net in TuneRunner")
        net = hp.build()
        # import torch.autograd.profiler as profiler
        # with profiler.profile(record_shapes=True, use_cuda=True, profile_memory=True) as prof:
        #     net = self._factored_lightning_module_class.from_hp(hp=hp)
        # print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=1000))

        utils.set_seeds(hp.data.seed)

        # noinspection PyTypeChecker
        trainer = pl.Trainer(
            logger=logs_mod.get_pl_logger(hp=hp.exp, tune=tune),
            default_root_dir=tune.get_trial_dir(),
            callbacks=self.extra_pl_callbacks + self.get_pl_callbacks_for_tune(),
            max_epochs=hp.opt.num_epochs,
            gpus=hp.data.num_gpus if include_gpus else None,
            weights_summary='full',
            fast_dev_run=fast_dev_run,
            accumulate_grad_batches=1,
            profiler='simple',
            deterministic=True,
            log_every_n_steps=hp.logs.num_steps_per_metric_log,
            log_gpu_memory=hp.logs.log_gpu_memory,
        )
        utils.hprint('About to start tune_runner\'s trainer.fit...')
        fit_out = trainer.fit(net, datamodule=net.dm)
        utils.hprint('Done with tune_runner._train_fn')

        return fit_out
Example 7
    def run(self, fast_dev_run=False, use_gpus=False):
        utils.set_seeds(self.search_params.data.seed)

        search_dict = self.search_params.to_ray_tune_search_dict()
        # see tune.utils.UtilMonitor
        search_dict['log_sys_usage'] = True

        output_str = str(self.search_params.logs.output_dir)
        if output_str.startswith(('s3://', 'gs://', 'hdfs://')):
            sync_config = tune.SyncConfig(
                upload_dir=self.search_params.logs.output_dir)
        else:
            sync_config = None

        analysis = tune.run(
            run_or_experiment=self._get_train_fn(fast_dev_run=fast_dev_run,
                                                 include_gpus=use_gpus),
            name=self.search_params.exp.get_project_exp_name(),
            stop=self.get_tune_stopper(self.search_params.opt.num_epochs),
            config=search_dict,
            resources_per_trial=self.get_resources_per_trial(
                self.search_params, include_gpu=use_gpus),
            num_samples=self.tune_hp.num_hp_samples,
            sync_config=sync_config,
            loggers=self.get_tune_loggers(),
            log_to_file=self.tune_hp.log_to_file and not self.tune_hp.ray_local_mode,
            keep_checkpoints_num=2,
            checkpoint_score_attr=f'{self.search_params.opt.search_mode}-{self.search_params.opt.search_metric}',
            fail_fast=False,
            scheduler=self.get_tune_scheduler(self.search_params,
                                              self.tune_hp),
            verbose=2,
            progress_reporter=self.get_cli_reporter(),
            reuse_actors=False,
        )

        utils.hprint("done with tune.run")

        param_hash = self.search_params.get_short_hash(num_chars=8)
        analysis_file = self.search_params.logs.output_dir / f'tune_analysis_{param_hash}.cloudpickle'
        print(f"Saving {analysis_file}")
        utils.save_cloudpickle(analysis_file, analysis)

        best_trial = analysis.get_best_trial(
            self.search_params.opt.search_metric,
            self.search_params.opt.search_mode, "last-5-avg")
        utils.hprint('best_trial.last_result', do_include_pre_break_line=True)
        utils.print_dict(best_trial.last_result)

        utils.hprint('best_trial.config', do_include_pre_break_line=True)
        utils.print_dict(best_trial.config)
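
get_tune_scheduler consumes the asha_* fields set on TuneParams (see Examples 9 and 11). A plausible sketch, assuming it builds a standard ASHA scheduler from those values:

from ray.tune.schedulers import ASHAScheduler


def get_tune_scheduler(search_params, tune_hp):
    # early-stop weak trials with asynchronous successive halving
    return ASHAScheduler(
        metric=search_params.opt.search_metric,
        mode=search_params.opt.search_mode,
        grace_period=tune_hp.asha_grace_period,
        reduction_factor=tune_hp.asha_reduction_factor,
        max_t=search_params.opt.num_epochs,
    )
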
Example 8
def run(
        net: pl.LightningModule,
        hp: TotalParams,
        fast_dev_run=False,
        do_find_lr=False,
        callbacks=None,
):
    utils.set_seeds(hp.data.seed)
    utils.set_pandas_disp()

    if callbacks is None:
        callbacks = [
            logs_mod.CounterTimerLrCallback(),
            logs_mod.VocabLengthCallback(),
        ]

    print("model run about to create trainer")
    trainer = pl.Trainer(
        logger=True if fast_dev_run else logs_mod.get_pl_logger(hp.exp),
        default_root_dir=hp.logs.output_dir,
        callbacks=callbacks,
        max_epochs=hp.opt.num_epochs,
        gpus=hp.data.num_gpus,
        weights_summary='full',
        fast_dev_run=fast_dev_run,
        accumulate_grad_batches=1,
        profiler='simple',
        deterministic=True,
        auto_lr_find=do_find_lr,
        log_every_n_steps=hp.logs.num_steps_per_metric_log,
    )
    print("model run done creating trainer")

    if do_find_lr:
        utils.hprint("Starting trainer.tune:")
        lr_tune_out = trainer.tune(net, datamodule=net.dm)
        print(f'  Tune out: {lr_tune_out}')
    else:
        utils.hprint("Starting trainer.fit:")
        print(f'  Dataset file: {hp.data.dataset_file}')
        trainer.fit(net, datamodule=net.dm)

    utils.hprint('Done with model run fn')
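
A hedged usage sketch of the entry point above; constructing hp and net is assumed to follow the patterns in Examples 6 and 10:

hp = TotalParams()   # configured field by field, as in Example 10
net = hp.build()     # as in Example 6; assumed to attach the datamodule as net.dm

# with do_find_lr=True this runs trainer.tune() and the lr finder instead of fit()
run(net, hp, fast_dev_run=False, do_find_lr=True)
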
Example 9
    hp.heads.num_features = params.Discrete([32, 64, 128, 256])
    hp.heads.num_layers = params.Integer(2, 5)
    hp.heads.num_groups = params.Discrete([8, 16, 32, 64])
    hp.heads.num_blocks_per_residual = params.Integer(1, 5)
    hp.heads.num_blocks_per_dropout = params.Integer(1, 5)
    hp.heads.requires_grad = True

    hp.tune = TuneRunner.TuneParams()
    hp.tune.asha_grace_period = 16
    hp.tune.asha_reduction_factor = 2
    hp.tune.num_hp_samples = 100

    hostname = socket.gethostname()
    is_local_run = hostname.endswith('.local')

    # noinspection PyTypeChecker
    tune_runner = TuneRunner(
        search_params=hp,
        factored_lightning_module_class=model_bert_trans_conv_tclass.ModelBertConvTransTClass,
        extra_pl_callbacks=None,
        ray_local_mode=False,
    )
    tune_runner.run(
        fast_dev_run=False,
        use_gpus=not is_local_run,
        log_to_file=True,
    )

    utils.hprint("done with tune_runner.run")
Example 10
    hp.trans.num_layers = 6
    hp.trans.num_query_features = None
    hp.trans.fc_dim_mult = 2

    hp.fc.num_features = 128
    hp.fc.num_layers = 2
    hp.fc.num_groups = 16
    hp.fc.num_blocks_per_residual = 2
    hp.fc.num_blocks_per_dropout = 2
    hp.fc.requires_grad = True

    hp.heads.num_features = 128
    hp.heads.num_layers = 4
    hp.heads.num_groups = 16
    hp.heads.num_blocks_per_residual = 2
    hp.heads.num_blocks_per_dropout = 2
    hp.heads.requires_grad = True

    dm = data.TablestakesDataModule(hp.data)
    net = ModelBertConvTransTClass(
        hp=hp,
        data_module=dm,
        metrics_tracker=metrics_mod.ClassificationMetricsTracker(hp.metrics),
        opt=factored.OptimizersMaker(hp.opt),
    )

    utils.hprint('About to start model run:')
    utils.print_dict(hp.to_dict())

    run(net, dm, hp, fast_dev_run, do_find_lr=False)
Example 11
        },
    )

    hp.verbose = False

    tune_hp = TuneParams()
    tune_hp.asha_grace_period = 4
    tune_hp.asha_reduction_factor = 2
    tune_hp.num_hp_samples = 2
    tune_hp.log_to_file = False
    tune_hp.ray_local_mode = False

    hostname = socket.gethostname()
    is_local_run = hostname.endswith('.local')

    utils.hprint('About to start model run:')
    utils.print_dict(hp.to_dict())

    tune_runner = TuneRunner(
        model_hp=hp,
        tune_hp=tune_hp,
        # factored_lightning_module_class=ts_model.TablestakesBertConvTransTClassModel,
        extra_pl_callbacks=None,
    )
    tune_runner.run(
        fast_dev_run=False,
        use_gpus=not is_local_run,
    )

    utils.hprint("done with tune_runner.run")