Example #1
class Experiment():
    def __init__(self, api_key=None, **kwargs):
        self._exp = None
        self._id = uuid4().hex
        if api_key:
            self._exp = CometExperiment(api_key,
                                        log_code=False,
                                        auto_param_logging=False,
                                        auto_metric_logging=False,
                                        **kwargs)
            self._id = self._exp.get_key()

    def log_metric(self, name, value, step=None, epoch=None):
        if self._exp:
            self._exp.log_metric(name, value, step, epoch)

    def log_epoch_end(self, epoch_cnt, step=None):
        if self._exp:
            self._exp.log_epoch_end(epoch_cnt, step=step)

    def log_parameters(self, hp):
        if self._exp:
            self._exp.log_parameters(flatten(hp, reducer='underscore'))

    @property
    def id(self):
        return self._id[:12]
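
A minimal usage sketch of the wrapper above, assuming the same module-level imports (uuid4, flatten from flatten-dict, and comet_ml's Experiment aliased as CometExperiment); the project name is illustrative. Without an API key the wrapper silently becomes a local, no-op run:

import os

exp = Experiment(api_key=os.getenv('COMET_API_KEY'), project_name='demo-project')
exp.log_parameters({'optimizer': {'lr': 1e-3}})  # flattened to 'optimizer_lr' before logging
exp.log_metric('train_loss', 0.42, step=1)
print(exp.id)  # first 12 chars of the Comet key, or of a locally generated uuid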
Example #2
def load_experiment(path_to_yml_file):
    config = load_yaml(path_to_yml_file)
    api_key = os.getenv('COMET_API_KEY', None)
    exp = None

    if not config['info']['experiment_key']:
        if api_key:
            exp = Experiment(api_key=api_key,
                             project_name=config['info']['project_name'])
            exp_key = exp.get_key()
        else:
            exp_key = make_random_string(20)

        os.environ['EXPERIMENT_KEY'] = exp_key

        _env_variables = env_variables + ['EXPERIMENT_KEY']
        config = load_yaml(path_to_yml_file, _env_variables)
        config['info']['experiment_key'] = exp_key
        path_to_yml_file = save_experiment(config, exp)
    else:
        logging.info(
            f"Experiment is already set up @ {config['info']['output_folder']}!"
        )
        try:
            exp = ExistingExperiment(
                api_key=api_key,
                previous_experiment=config['info']['experiment_key'])
        except Exception:
            # If the existing Comet experiment cannot be resumed, continue without it.
            pass

    return config, exp, path_to_yml_file
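
A hedged usage sketch of load_experiment; the YAML path is illustrative, and it assumes COMET_API_KEY may or may not be exported and that load_yaml, save_experiment, env_variables and make_random_string come from the surrounding project:

config, exp, path_to_yml_file = load_experiment('config/experiment.yml')
if exp is not None:
    # Comet is available: tag the run with the resolved experiment key.
    exp.log_other('experiment_key', config['info']['experiment_key'])
print('Experiment key:', config['info']['experiment_key'])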
Example #3
    def __init__(self,
                 experiment: Experiment,
                 gpu_id=None,
                 print_to_comet_only=False):
        if CometLogger.__experiment is not None:
            raise Exception(
                "Cannot re-instantiate since this class is a singleton.")
        else:
            CometLogger.__experiment = experiment
            CometLogger.__APIExperiment = APIExperiment(
                previous_experiment=experiment.get_key())
            CometLogger.gpu_id = gpu_id
            CometLogger.print_to_comet_only = print_to_comet_only
Example #4
    def get_comet_logger(self):
        if not self.paras.load:
            comet_exp = Experiment(project_name=COMET_PROJECT_NAME,
                                   workspace=COMET_WORKSPACE,
                                   auto_output_logging=None,
                                   auto_metric_logging=None,
                                   display_summary=False)
            if self.paras.transfer:
                comet_exp.set_name(self.exp_name)
                comet_exp.add_tag(Path(self.ckpdir).parent.name)
                comet_exp.add_tag('transfer')
                comet_exp.add_tag(self.config['data']['corpus']['metas'][0])
            if self.paras.test:
                comet_exp.set_name(Path(self.paras.outdir).name)
                comet_exp.add_tag(Path(self.paras.config).parents[2].name)
                comet_exp.add_tag('test')
                comet_exp.add_tag(Path(self.paras.config).parent.stem)
                #comet_exp.add_tag(Path(self.paras.outdir).name)
            else:
                comet_exp.add_tag('train')

            for name, param in self.config.items():
                if isinstance(param, dict):
                    comet_exp.log_parameters(param, prefix=name)
                else:
                    comet_exp.log_parameter(name, param)
            comet_exp.log_other('seed', self.paras.seed)

            with open(Path(self.logdir, 'exp_key'), 'w') as f:
                print(comet_exp.get_key(), file=f)
        else:
            with open(Path(self.logdir, 'exp_key'), 'r') as f:
                exp_key = f.read().strip()
                comet_exp = ExistingExperiment(previous_experiment=exp_key,
                                               project_name=COMET_PROJECT_NAME,
                                               workspace=COMET_WORKSPACE,
                                               auto_output_logging=None,
                                               auto_metric_logging=None,
                                               display_summary=False)
        return comet_exp
Example #5
def main():
    # Training settings
    args = configure_arguments()

    use_cuda = not args.no_cuda and torch.cuda.is_available()

    # sets seeds to prevent any unwanted randomness.
    torch.manual_seed(args.seed)
    if use_cuda:
        torch.cuda.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    device = torch.device("cuda:1" if use_cuda else "cpu")

    train_loader, val_loader, test_loader = create_dataloaders(args)

    # get instance of model.
    print('Loading the {0} model...'.format(args.model))
    model_class = models.find_model(args.model)
    model = model_class().to(device)
    optimizer = optim.SGD(model.parameters(), lr=args.lr)

    no_of_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('Model has {0} parameters'.format(no_of_params))

    if args.mode == 'train':
        print('Running in train mode...')

        #set up logging.
        experiment = Experiment(api_key="w7QuiECYXbNiOozveTpjc9uPg", project_name="project1-ac2g", workspace="ift6135")
        create_folder('output/' + args.model + '/' + experiment.get_key())
        hyper_params = vars(args)
        experiment.log_parameters(hyper_params)

        train(args, model, device, (train_loader, val_loader), optimizer, experiment)
    elif args.mode == 'test':
        print('Running in test mode...')
        test(args, model, device, test_loader)
Example #6
class CometMLLogger(ExperimentLogger):
    def __init__(self, provider_args: EasyDict, config, **kwargs):
        self.experiment = Experiment(api_key=provider_args.api_key,
                                     project_name=provider_args.project_name,
                                     workspace=provider_args.workspace,
                                     auto_param_logging=False,
                                     auto_metric_logging=False)
        super().__init__(config)
        self.run_key = self.experiment.get_key()
        self.log_url = self.experiment.url

    def log_on_hyperparameters(self, config: EasyDict):
        hyper_params = {}
        if config is not None:
            hyper_params['model'] = config.model
            hyper_params['trainer'] = config.trainer
            if 'train' in config.dataset and 'augmentations' in config.dataset.train:
                hyper_params['augmentations'] = config.dataset.train.augmentations
        self.experiment.log_parameters(flatten(hyper_params, reducer='path'))

    def log_on_step_update(self, metrics_log: dict):
        step = metrics_log['step']
        metrics_log.pop('step')
        self.experiment.log_metrics(metrics_log, step=step)

    def log_on_epoch_update(self, metrics_log: dict):
        epoch = metrics_log['epoch']
        metrics_log.pop('epoch')
        self.experiment.log_metrics(metrics_log, epoch=epoch)

    def log_on_model_save(self, file_log: dict):
        pass

    def log_on_validation_result(self, metrics_log: dict):
        epoch = metrics_log['epoch']
        metrics_log.pop('epoch')
        self.experiment.log_metrics(metrics_log, epoch=epoch)
Example #7
class CometML:
    def __init__(self,
                 api_key,
                 project_name,
                 workspace,
                 debug=True,
                 tags=None):
        self._exp = Experiment(
            api_key=api_key,
            project_name=project_name,
            workspace=workspace,
            disabled=debug,
        )
        if not (self._exp.alive or debug):
            raise RuntimeError("Cannot connect to Comet ML")
        self._exp.disable_mp()

        if tags is not None:
            self._exp.add_tags(tags)

    @property
    def run_name(self):
        return self._exp.get_key()

    def args(self, arg_text):
        self._exp.log_parameter("cmd args", arg_text)

    def meta(self, params):
        self._exp.log_parameters(params)

    def log(self, name, value, step):
        self._exp.log_metric(
            name=name,
            value=value,
            step=step,
        )
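
A minimal sketch of driving the CometML wrapper above; the project, workspace, and tag names are placeholders, and with debug=True the underlying Experiment is created disabled, so nothing is sent to the server:

import os

ml = CometML(api_key=os.getenv('COMET_API_KEY'),
             project_name='my-project',
             workspace='my-workspace',
             debug=True,
             tags=['baseline'])
ml.meta({'lr': 1e-3, 'batch_size': 32})  # logged as parameters
ml.log('train_loss', 0.7, step=100)      # logged as a metric
print(ml.run_name)                       # the Comet experiment key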
Example #8
    parser.add_argument('--lambda_kernel_reg', type=float, default=1.0)
    parser.add_argument('--lambda_fgsd_kernel_reg', type=float, default=1.0)
    parser.add_argument('--lambda_spk_kernel_reg', type=float, default=1.0)
    parser.add_argument('--lambda_adj_reconst_reg', type=float, default=1.0)

    parser.add_argument('--warmup_epochs', type=float, nargs='*', default=[2.0], help='Number of epochs during which learning rate increases linearly from init_lr to max_lr. Afterwards, learning rate decreases exponentially from max_lr to final_lr.')
    parser.add_argument('--init_lr', type=float, nargs='*', default=[1e-4], help='Initial learning rate')
    parser.add_argument('--max_lr', type=float, nargs='*', default=[1e-3], help='Maximum learning rate')
    parser.add_argument('--final_lr', type=float, nargs='*', default=[1e-4], help='Final learning rate')
    parser.add_argument('--lr_scaler', type=float, nargs='*', default=[1.0], help='Amount by which to scale init_lr, max_lr, and final_lr (for convenience)')
    parser.add_argument('--lr_decay_rate', type=float, default=0.9, help='lr decay per epoch, for decay scheduler')

    args, unknown = parser.parse_known_args()

    experiment = Experiment(api_key=API_KEY, project_name="universal-graph-embedding", workspace="saurabh08", disabled=not args.run_on_comet)
    experiment_id = experiment.get_key()

    data_path = os.path.join(args.data_dir, args.dataset_name)
    log_path = os.path.join(args.log_dir, experiment_id)
    if not os.path.exists(log_path):
        os.makedirs(log_path)
    logging.basicConfig(format='%(message)s', level=logging.INFO, handlers=[logging.StreamHandler(), logging.FileHandler(os.path.join(log_path, 'console_output.txt'))])

    run_filepath = os.path.abspath(__file__)
    shutil.copy(run_filepath, log_path)
    src_list = ['./train', './utils', './torch_dgl', './dataloader', './config']
    dest_list = [os.path.join(log_path, 'train'), os.path.join(log_path, 'utils'), os.path.join(log_path, 'torch_dgl'), os.path.join(log_path, 'dataloader'), os.path.join(log_path, 'config')]
    for src, dest in zip(src_list, dest_list):
        shutil.copytree(src, dest)

    for arg, value in sorted(vars(args).items()):
Example #9
        step_time = round(time.time() - start_time, 1)

        metrics = {metric_name: log(metric) for metric_name, metric in trainer.metrics.items()}
        metrics['step_time'] = step_time

        # validation plotting
        progbar.add(valid_inc, [('Train Loss', metrics['train_loss']),
                                ('Validation Loss', metrics['valid_loss']),
                                ('Time (s)', step_time)])
        #Plot on Comet
        #experiment.log_metrics(metrics,step=t)
        # Plot on WandB
        wandb.log(metrics, step=t)

    if (t+0) % save_inc == 0: # zero while we test this
        trainer.save_weights(model_path, run_id=wandb.run.id, experiment_key=experiment.get_key())

        # if not args.images:
            # How we plot the cluster figs
        # try:
        #     if not args.discrete:
        #         batches = [trainer.make_sequences_variable_length(dataset_coordinator.plotting_background_dataset.next()) for i in range(0,4)]
        #         super_batch = {}
        #         for k in batches[0].keys():
        #             super_batch[k] = np.concatenate([b[k] for b in batches])
        #         lang_batch = dataset_coordinator.labelled_test_ds.next()
        #         fig_enc, fig_plan, z_enc, z_plan = lfp.plotting.produce_cluster_fig(super_batch, lang_batch, trainer, args=args)
        #         #if not args.gcbc and not args.images:
        #         #   z_enc, z_plan = produce_cluster_fig(next(plotting_dataset), encoder, planner, TEST_DATA_PATHS[0], num_take=dl.batch_size//4)

        #         #   #Comet
Example #10
def experiment(doodad_config, variant):
    from rlkit.core import logger
    from rlkit.launchers.launcher_util import setup_logger
    print ("doodad_config.base_log_dir: ", doodad_config.base_log_dir)
    from datetime import datetime
    timestamp = datetime.now().strftime('%Y_%m_%d_%H_%M_%S_%f')
    setup_logger('wrapped_'+variant['env'], variant=variant, log_dir=doodad_config.base_log_dir+"/smirl/"+variant['exp_name']+"/"+timestamp+"/")
    if (variant["log_comet"]):
        try:
            comet_logger = Experiment(api_key=launchers.config.COMET_API_KEY,
                                      project_name=launchers.config.COMET_PROJECT_NAME,
                                      workspace=launchers.config.COMET_WORKSPACE)
            logger.set_comet_logger(comet_logger)
            comet_logger.set_name(str(variant['env'])+"_"+str(variant['exp_name']))
            print("variant: ", variant)
            variant['comet_key'] = comet_logger.get_key()
            comet_logger.log_parameters(variant)
            print(comet_logger)
        except Exception as inst:
            print ("Not tracking training via commet.ml")
            print ("Error: ", inst)

    import gym
    from torch import nn as nn
    
    import rlkit.torch.pytorch_util as ptu
    import torch
    from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
    from rlkit.exploration_strategies.base import \
        PolicyWrappedWithExplorationStrategy
    from rlkit.policies.argmax import ArgmaxDiscretePolicy
    from rlkit.torch.dqn.dqn import DQNTrainer
    from rlkit.data_management.env_replay_buffer import EnvReplayBuffer
    from rlkit.samplers.data_collector import MdpPathCollector
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    from surprise.utils.rendering_algorithm import TorchBatchRLRenderAlgorithm
    from surprise.envs.tetris.tetris import TetrisEnv
    from surprise.wrappers.obsresize import ResizeObservationWrapper, RenderingObservationWrapper, SoftResetWrapper
    import pdb
    
    base_env = get_env(variant)
    base_env2 = get_env(variant)
    
    print ("GPU_BUS_Index", variant["GPU_BUS_Index"])
    if torch.cuda.is_available() and doodad_config.use_gpu:
        print ("Using the GPU for learning")
#         ptu.set_gpu_mode(True, gpu_id=doodad_config.gpu_id)
        ptu.set_gpu_mode(True, gpu_id=variant["GPU_BUS_Index"])
    else:
        print ("NOT Using the GPU for learning")
    
#     base_env2 = RenderingObservationWrapper(base_env2)
    expl_env, network = add_wrappers(base_env, variant, device=ptu.device)
    eval_env, _ = add_wrappers(base_env2, variant, device=ptu.device, eval=True, network=network)
    if ("vae_wrapper" in variant["wrappers"]):
        eval_env._network = base_env._network
    
    obs_dim = expl_env.observation_space.low.shape
    print("Final obs dim", obs_dim)
    action_dim = eval_env.action_space.n
    print("Action dimension: ", action_dim)
    qf, target_qf = get_network(variant["network_args"], obs_dim, action_dim)
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    if "prob_random_action" in variant:
        expl_policy = PolicyWrappedWithExplorationStrategy(
            EpsilonGreedy(expl_env.action_space, prob_random_action=variant["prob_random_action"], 
                          prob_end=variant["prob_end"],
                          steps=variant["steps"]),
            eval_policy,
        )
    else:  
        expl_policy = PolicyWrappedWithExplorationStrategy(
            EpsilonGreedy(expl_env.action_space, prob_random_action=0.8, prob_end=0.05),
            eval_policy,
        )
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
        render_kwargs=variant['render_kwargs']
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    trainer = DQNTrainer(
        qf=qf,
        target_qf=target_qf,
        qf_criterion=qf_criterion,
        **variant['trainer_kwargs']
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLRenderAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #11
class Reptile(Task):
    """
    A meta-learning task that teaches an agent over a set of other tasks
    """
    def __init__(self,
                 data_handler,
                 load_key=None,
                 sender=True,
                 receiver=True,
                 image_captioner=True,
                 image_selector=False,
                 track_results=True):
        self.sess = Agent.sess
        self.N = 1  # number of steps taken for each task - should be > 1

        self.S = SenderAgent()
        self.R = ReceiverAgent(*self.S.get_output())
        self.IC = ImageCaptioner()
        # self.IS = ImageSelector()

        self.S.all_agents_initialized(load_key)
        self.R.all_agents_initialized(load_key)

        self.train_metrics = {}
        self.val_metrics = {}
        self.experiment = Experiment(api_key='1jl4lQOnJsVdZR6oekS6WO5FI',
                                     project_name='Reptile',
                                     auto_param_logging=False,
                                     auto_metric_logging=False,
                                     disabled=(not track_results))

        self.params = {}
        self.params.update(Agent.get_params())
        self.params.update(data_handler.get_params())
        self.experiment.log_parameters(self.params)

        self.T = {}
        if image_captioner:
            self.ic = ImageCaptioning(self.IC,
                                      experiment=self.experiment,
                                      track_results=False)
            self.T["Image Captioner"] = lambda img, capts: self.ic.train_batch(
                (img, capts), mode="train")
        if image_selector:
            self.is_ = ImageSelection(self.IS,
                                      experiment=self.experiment,
                                      track_results=False)
            self.T["Image Selector"] = lambda img, capts: self.is_.train_batch(
                (img, capts), mode="train")
        if sender or receiver:
            self.rg = ReferentialGame(self.S,
                                      self.R,
                                      experiment=self.experiment,
                                      track_results=False)
            if receiver:
                self.T["Receiver"] = lambda img, capts: self.rg.train_batch(
                    img, mode="receiver_train")
            if sender:
                self.T["Sender"] = lambda img, capts: self.rg.train_batch(
                    img, mode="sender_train")

        # Initialize TF
        variables_to_initialize = tf.global_variables()
        if load_key is not None:
            dont_initialize = []
            if SenderAgent.loaded:
                dont_initialize += SenderAgent.get_all_weights()
            if ReceiverAgent.loaded:
                dont_initialize += ReceiverAgent.get_all_weights()
            if ImageCaptioner.loaded:
                dont_initialize += ImageCaptioner.get_all_weights()
            variables_to_initialize = [
                v for v in tf.global_variables() if v not in dont_initialize
            ]
            # REMOVE LATER
            #variables_to_initialize += ImageCaptioner.optimizer.variables()
        Agent.sess.run(tf.variables_initializer(variables_to_initialize))

        self.sender_shared_state = VariableState(
            self.sess, SenderAgent.get_shared_weights())
        self.receiver_shared_state = VariableState(
            self.sess, ReceiverAgent.get_shared_weights())
        self.sender_own_state = VariableState(self.sess,
                                              SenderAgent.get_weights())
        self.receiver_own_state = VariableState(self.sess,
                                                ReceiverAgent.get_weights())

        # print(SenderAgent.get_shared_weights())
        # print(ReceiverAgent.get_shared_weights())
        # print(SenderAgent.get_weights())
        # print(ReceiverAgent.get_weights())
        # print(tf.trainable_variables())

        self.shared_states = {
            "shared_sender": self.sender_shared_state,
            "shared_receiver": self.receiver_shared_state
        }
        self.own_states = {
            "own_sender": self.sender_own_state,
            "own_receiver": self.receiver_own_state
        }

        shared_average = []
        for k, v in self.shared_states.items():
            shared_average.append(v.export_variables())

        shared_average = np.mean(shared_average, axis=0)
        self.set_weights(new_shared_weights=shared_average)

        self.dh = data_handler
        with open(
                "{}/data/csv_loss_{}.csv".format(project_path,
                                                 self.experiment.get_key()),
                'w+') as csv_loss_file:
            csv_loss_file.write(
                "Image Captioner Loss,Image Selector Loss,Sender Loss,Receiver Loss\n"
            )
        with open(
                "{}/data/csv_accuracy_{}.csv".format(
                    project_path, self.experiment.get_key()),
                'w+') as csv_acc_file:
            csv_acc_file.write(
                "Image Captioner Accuracy,Image Selector Accuracy,Sender Accuracy,Receiver Accuracy\n"
            )

        self.step = 0

    def get_diff(self, a, b):
        diff = 0.
        if isinstance(a, (np.ndarray, np.generic)):
            return np.sum(np.abs(a - b))

        elif isinstance(a, list):
            for i in range(len(a)):
                diff += self.get_diff(a[i], b[i])

        elif isinstance(a, dict):
            for k in a:
                diff += self.get_diff(a[k], b[k])

        return diff

    def set_weights(self, new_own_weights=None, new_shared_weights=None):
        if new_own_weights is not None:
            for k, s in self.own_states.items():
                s.import_variables(new_own_weights[k])
        if new_shared_weights is not None:
            for k, s in self.shared_states.items():
                s.import_variables(new_shared_weights)

    def train_epoch(self, e, mode=None):
        self.dh.set_params(distractors=0)
        image_gen = self.dh.get_images(return_captions=True, mode="train")
        # Get current variables
        start_vars = {
            k: s.export_variables()
            for k, s in self.own_states.items()
        }
        start_vars["shared"] = self.shared_states[
            "shared_sender"].export_variables()

        while True:
            try:

                # Save current variables
                old_own = {
                    k: s.export_variables()
                    for k, s in self.own_states.items()
                }
                new_own = {k: [] for k, s in self.own_states.items()}
                old_shared = self.shared_states[
                    "shared_sender"].export_variables()
                new_shared = []

                # For each task
                for task in ["Image Captioner", "Sender", "Receiver"]:
                    # parameter setup to not waste data
                    if task in ["Sender", "Receiver", "Image Selector"]:
                        self.dh.set_params(distractors=Agent.D)
                    else:
                        self.dh.set_params(distractors=0)
                    # Run task n times
                    for _ in range(self.N):
                        images, captions = next(image_gen)
                        acc, loss = self.T[task](images, captions)
                    self.train_metrics[task + " Accuracy"] = acc
                    self.train_metrics[task + " Loss"] = loss

                    # Store new variables
                    [
                        new_own[k].append(s.export_variables())
                        for k, s in self.own_states.items()
                    ]
                    [
                        new_shared.append(s.export_variables())
                        for k, s in self.shared_states.items()
                    ]

                    # Reset to old variables for next task
                    [
                        s.import_variables(old_own[k])
                        for k, s in self.own_states.items()
                    ]
                    [
                        s.import_variables(old_shared)
                        for k, s in self.shared_states.items()
                    ]

                self.step += 1
                self.experiment.set_step(self.step)
                self.experiment.log_metrics(self.train_metrics)
                # Average new variables
                new_own = {
                    k: interpolate_vars(old_own[k], average_vars(new_own[k]),
                                        0.2)
                    for k, s in self.own_states.items()
                }
                new_shared = interpolate_vars(old_shared,
                                              average_vars(new_shared), 0.2)
                # Set variables to new variables
                self.set_weights(new_own_weights=new_own,
                                 new_shared_weights=new_shared)

            except StopIteration:
                break

        # Get change in weights
        end_vars = {
            k: s.export_variables()
            for k, s in self.own_states.items()
        }
        end_vars["shared"] = self.shared_states[
            "shared_sender"].export_variables()
        weight_diff = self.get_diff(start_vars, end_vars)

        #self.experiment.set_step(e)
        self.val_metrics["Weight Change"] = weight_diff
        self.experiment.log_metrics(self.val_metrics)

        # Log data to a csv
        with open("{}/data/csv_loss_{}.csv".format(project_path, self.experiment.get_key()), 'a') as csv_loss_file, \
             open("{}/data/csv_accuracy_{}.csv".format(project_path, self.experiment.get_key()), 'a') as csv_acc_file:
            losses = []
            accs = []
            for task in ["Image Captioner", "Sender", "Receiver"]:
                losses.append(str(self.train_metrics[task + " Loss"]))
                accs.append(str(self.train_metrics[task + " Accuracy"]))

            csv_loss_file.write(",".join(losses))
            csv_loss_file.write("\n")

            csv_acc_file.write(",".join(accs))
            csv_acc_file.write("\n")

        return 0, weight_diff
Example #12
from allennlp.modules.augmented_lstm import AugmentedLstm
from torch import nn
from torch.utils.data.sampler import SubsetRandomSampler
from sklearn.metrics import classification_report
from tqdm import tqdm

from dataset import ProielDataset

COMET_API_KEY = os.getenv('COMET_API_KEY')
experiment = Experiment(
    api_key=COMET_API_KEY,
    project_name='deep-latin-tagger',
    workspace='tylerkirby',
)

EXPERIMENT_HASH = experiment.get_key()


class BayesianDropoutLSTM(nn.Module):
    def __init__(self,
                 vocab_size,
                 tag_size,
                 X_lengths,
                 embedding_dim,
                 hidden_size,
                 recurrent_dropout_probability=0):
        super(BayesianDropoutLSTM, self).__init__()
        self.X_lengths = X_lengths
        self.embedding_layer = nn.Embedding(vocab_size, embedding_dim)
        self.augmented_lstm = AugmentedLstm(
            input_size=embedding_dim,
Example #13
if len(modes) == 0 or len([x for x in modes if x not in ['train', 'test']]):
    print('please provide train or test modes')
    exit(1)

# OPTIONAL COMET DATA LOGGING SETUP #
experiment = None
if log_comet:
    from comet_ml import Experiment
    config = util.load_config()
    experiment = Experiment(api_key=config['comet']['api_key'],
                            project_name=config['comet']['project_name'],
                            workspace=config['comet']['workspace'])
# END OPTIONAL COMET DATA LOGGING SETUP #

dir_name = (experiment.get_key() if experiment is not None
            else str(int(time.time())))

checkpoints_dir = None
if save_checkpoints:
    checkpoints_dir = '{}/{}'.format(checkpoints_root_dir, dir_name)

if 'test' in modes:
    results_dir = '{}/{}'.format(results_root_dir, dir_name)

# ADDITIONAL IMPORTS # - imports are split because comet_ml requires being imported before torch
from dataset.dataset_loader import DatasetLoader
from model.agent import DDPG
from model.util import determine_device
from train import train
from test import test
Example #14
# OPTIONAL COMET DATA LOGGING SETUP #
experiment = None

if log_comet:
    from comet_ml import Experiment

    config = util.load_config()
    experiment = Experiment(api_key=config['comet']['api_key'],
                            project_name=config['comet']['project_name'],
                            workspace=config['comet']['workspace'])
# END OPTIONAL COMET DATA LOGGING SETUP #

# SETUP CHECKPOINTS DIR #
if save_checkpoints:
    checkpoints_dir_name = (experiment.get_key()
                            if experiment is not None else str(int(start)))
    checkpoints_dir = '{}/{}'.format(checkpoints_root_dir,
                                     checkpoints_dir_name)
    os.makedirs(checkpoints_dir, exist_ok=True)
else:
    checkpoints_dir = None
# END SETUP CHECKPOINTS DIR #

# SETUP RESULTS DIR #
if 'test' in modes:
    results_dir_name = experiment.get_key() if experiment is not None else str(
        int(start))

    if 'train' in modes and save_checkpoints:
        results_dir_name = checkpoints_dir_name
    elif load_model:
Example #15
class Dashboard:
    """Record training/evaluation statistics to comet
    :param Path log_dir
    :param list taskid_to_name
    """
    def __init__(self, config, paras, log_dir, train_type, resume=False):
        self.log_dir = log_dir
        self.expkey_f = Path(self.log_dir, 'exp_key')
        self.global_step = 1

        if resume:
            assert self.expkey_f.exists(), \
                f"Cannot find comet exp key in {self.log_dir}"
            with open(Path(self.log_dir, 'exp_key'), 'r') as f:
                exp_key = f.read().strip()
            self.exp = ExistingExperiment(
                previous_experiment=exp_key,
                project_name=COMET_PROJECT_NAME,
                workspace=COMET_WORKSPACE,
                auto_output_logging=None,
                auto_metric_logging=None,
                display_summary_level=0,
            )
        else:
            self.exp = Experiment(
                project_name=COMET_PROJECT_NAME,
                workspace=COMET_WORKSPACE,
                auto_output_logging=None,
                auto_metric_logging=None,
                display_summary_level=0,
            )
            # TODO: is there a better way to do this?
            with open(self.expkey_f, 'w') as f:
                print(self.exp.get_key(), file=f)

            self.exp.log_other('seed', paras.seed)
            self.log_config(config)
            if train_type == 'evaluation':
                if paras.pretrain:
                    self.exp.set_name(
                        f"{paras.pretrain_suffix}-{paras.eval_suffix}")
                    self.exp.add_tags([
                        paras.pretrain_suffix, config['solver']['setting'],
                        paras.accent, paras.algo, paras.eval_suffix
                    ])
                    if paras.pretrain_model_path:
                        self.exp.log_other("pretrain-model-path",
                                           paras.pretrain_model_path)
                    else:
                        self.exp.log_other("pretrain-runs",
                                           paras.pretrain_runs)
                        self.exp.log_other("pretrain-setting",
                                           paras.pretrain_setting)
                        self.exp.log_other("pretrain-tgt-accent",
                                           paras.pretrain_tgt_accent)
                else:
                    self.exp.set_name(paras.eval_suffix)
                    self.exp.add_tags(
                        ["mono", config['solver']['setting'], paras.accent])
            else:
                self.exp.set_name(paras.pretrain_suffix)
                self.exp.log_others({
                    f"accent{i}": k
                    for i, k in enumerate(paras.pretrain_accents)
                })
                self.exp.log_other('accent', paras.tgt_accent)
                self.exp.add_tags([
                    paras.algo, config['solver']['setting'], paras.tgt_accent
                ])
            #TODO: Need to add pretrain setting

        ##slurm-related
        hostname = os.uname()[1]
        if len(hostname.split('.')) == 2 and hostname.split('.')[1] == 'speech':
            logger.notice(f"Running on Battleship {hostname}")
            self.exp.log_other('jobid', int(os.getenv('SLURM_JOBID')))
        else:
            logger.notice(f"Running on {hostname}")

    def log_config(self, config):
        #NOTE: depth at most 2
        for block in config:
            for n, p in config[block].items():
                if isinstance(p, dict):
                    self.exp.log_parameters(p, prefix=f'{block}-{n}')
                else:
                    self.exp.log_parameter(f'{block}-{n}', p)

    def set_status(self, status):
        self.exp.log_other('status', status)

    def step(self, n=1):
        self.global_step += n

    def set_step(self, global_step=1):
        self.global_step = global_step

    def log_info(self, prefix, info):
        self.exp.log_metrics({k: float(v)
                              for k, v in info.items()},
                             prefix=prefix,
                             step=self.global_step)

    def log_other(self, name, value):
        self.exp.log_metric(name, value, step=self.global_step)

    def log_step(self):
        self.exp.log_other('step', self.global_step)

    def add_figure(self, fig_name, data):
        self.exp.log_figure(figure_name=fig_name,
                            figure=data,
                            step=self.global_step)

    def check(self):
        if not self.exp.alive:
            logger.warning("Comet logging stopped")
Example #16
model = torch.nn.DataParallel(model).cuda()

opt = torch.optim.Adam(model.parameters(), lr=hparams['learning_rate'])

all_losses = [back_loss_tr_loss_name] + \
             [k for k in sorted(val_losses.keys())] + \
             [k for k in sorted(tr_val_losses.keys())]

tr_step = 0
val_step = 0
for i in range(hparams['n_epochs']):
    res_dic = {}
    for loss_name in all_losses:
        res_dic[loss_name] = {'mean': 0., 'std': 0., 'acc': []}
    print("Experiment: {} - {} || Epoch: {}/{}".format(experiment.get_key(),
                                                       experiment.get_tags(),
                                                       i + 1,
                                                       hparams['n_epochs']))
    model.train()

    for data in tqdm(train_gen, desc='Training'):
        opt.zero_grad()
        m1wavs = data[0].unsqueeze(1).cuda()
        clean_wavs = data[-1].cuda()

        rec_sources_wavs = model(m1wavs)

        l = back_loss_tr_loss(rec_sources_wavs,
                              clean_wavs,
                              initial_mixtures=m1wavs)
Example #17
def cli_main():
    parser = options.get_training_parser()
    parser.add_argument(
        "--comet-logging",
        action="store_true",
        help="Whether to use Comet.ML for logging",
    )
    args = options.parse_args_and_arch(parser)

    logging = getattr(args, "comet_logging", False)
    config = None
    if logging:
        PROJECT = "machine-translation"
        if not keyring.get_password("comet", PROJECT):
            comet_ml_api_key = getpass("Please enter the comet.ml API key: ")
            keyring.set_password("comet", PROJECT, comet_ml_api_key)
        else:
            comet_ml_api_key = keyring.get_password("comet", PROJECT)

        experiment = Experiment(
            api_key=comet_ml_api_key,
            project_name="machine-translation",
            workspace="machine-translation",
            auto_output_logging=None,
        )
        config = {
            "api_key": comet_ml_api_key,
            "experiment_key": experiment.get_key()
        }
        print("Proceeding with Comet.ML logging...")

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, config, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args, config)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = "tcp://localhost:{port}".format(
            port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != "no_c10d":
            print(
                "| NOTE: you may get better performance with: --ddp-backend=no_c10d"
            )
        torch.multiprocessing.spawn(fn=distributed_main,
                                    args=(args, config),
                                    nprocs=args.distributed_world_size)
    else:
        # single GPU training
        main(args, config=config)
    if config:
        experiment.end()
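
The keyring caching used in cli_main can be exercised on its own. A small standalone sketch, reusing the same "comet" service and PROJECT user names as above and assuming a system keyring backend is available:

import keyring
from getpass import getpass

PROJECT = "machine-translation"
api_key = keyring.get_password("comet", PROJECT)
if api_key is None:
    # Prompt once, then cache the key in the OS keyring for later runs.
    api_key = getpass("Please enter the comet.ml API key: ")
    keyring.set_password("comet", PROJECT, api_key)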
Example #18
class DWIMLAbstractTrainer:
    """
    This Trainer class's train_and_validate() method:
        - Creates DataLoaders from the batch samplers. collate_fn will be the
        loader.load_batch() method, and the dataset will be
        sampler.source_data.
        - Trains each epoch by using compute_batch_loss, which should be
        implemented in each project's child class.

    Comet is used to save training information, but some logs will also be
    saved locally in the saving_path.
    """
    def __init__(self,
                 model: MainModelAbstract,
                 experiments_path: str,
                 experiment_name: str,
                 batch_sampler_training: DWIMLBatchSampler,
                 batch_loader_training: AbstractBatchLoader,
                 batch_sampler_validation: DWIMLBatchSampler = None,
                 batch_loader_validation: AbstractBatchLoader = None,
                 model_uses_streamlines: bool = False,
                 learning_rate: float = 0.001,
                 weight_decay: float = 0.01,
                 max_epochs: int = 10,
                 max_batches_per_epoch: int = 1000,
                 patience: int = None,
                 nb_cpu_processes: int = 0,
                 use_gpu: bool = False,
                 comet_workspace: str = None,
                 comet_project: str = None,
                 from_checkpoint: bool = False,
                 log_level=logging.root.level):
        """
        Parameters
        ----------
        model: MainModelAbstract
            Instantiated class containing your model.
        experiments_path: str
            Path where to save this experiment's results and checkpoints.
            Will be saved in experiments_path/experiment_name.
        experiment_name: str
            Name of this experiment. This will also be the name that will
            appear online for comet.ml experiment.
        batch_sampler_training: DWIMLBatchSampler
            Instantiated class used for sampling batches of training data.
            Data in batch_sampler_training.source_data must be already loaded.
        batch_loader_training: AbstractBatchLoader
            Instantiated class with a load_batch method able to load data
            associated to sampled batch ids.
        batch_sampler_validation: DWIMLBatchSampler
            Similar as before, for the validation set. Can be set to None if no
            validation is used. Then, best model is based on training loss.
        batch_loader_validation: AbstractBatchLoader
            Again, similar as before but can be set to None.
        model_uses_streamlines: bool
            If true, the batch streamlines will be sent to the model when
            calling the forward method. Else, only the inputs. Default: False.
        learning_rate: float
            Learning rate. Default: 0.001 (torch's default)
        weight_decay: float
            Add a weight decay penalty on the parameters. Default: 0.01.
            (torch's default).
        max_epochs: int
            Maximum number of epochs. Default = 10, for no good reason.
        max_batches_per_epoch: int
            Maximum number of batches per epoch. Default = 1000, for no good
            reason.
        patience: int
            Use early stopping. Defines the number of epochs after which the
            model should stop if the loss hasn't improved. Default: None (i.e.
            no early stopping).
        nb_cpu_processes: int
            Number of parallel CPU workers. Use 0 to avoid parallel threads.
            Default : 0.
        use_gpu: bool
            If true, use GPU device when possible instead of CPU.
            Default = False
        comet_workspace: str
            Your comet workspace. See our docs/Getting Started for more
            information on comet and its API key. Default= None (comet.ml will
            not be used).
        comet_project: str
             Send your experiment to a specific comet.ml project. Default: None
             (it will be sent to Uncategorized Experiments).
        from_checkpoint: bool
             If true, we do not create the output dir, as it should already
             exist. Default: False.
        """
        # To developers: do not forget that changes here must be reflected
        # in the save_checkpoint method!

        # ----------------------
        # Values given by the user
        # ----------------------

        # Trainer's logging level can be changed separately from main
        # scripts.
        self.logger = logger
        self.logger.setLevel(log_level)

        # Experiment
        if not os.path.isdir(experiments_path):
            raise NotADirectoryError("The experiments path does not exist! "
                                     "({})".format(experiments_path))

        self.experiments_path = experiments_path
        self.experiment_name = experiment_name
        self.saving_path = os.path.join(self.experiments_path,
                                        self.experiment_name)
        if not from_checkpoint and not os.path.isdir(self.saving_path):
            logger.info('Creating directory {}'.format(self.saving_path))
            os.mkdir(self.saving_path)

        # Note that the training/validation sets are contained in the
        # data_loaders.data_source
        self.train_batch_sampler = batch_sampler_training
        self.valid_batch_sampler = batch_sampler_validation
        if self.valid_batch_sampler is None:
            self.use_validation = False
            self.logger.warning(
                "WARNING! There is not validation set. Loss for best epoch "
                "monitoring will be the training loss. \n"
                "Best practice is to have a validation set.")
        else:
            self.use_validation = True
        self.train_batch_loader = batch_loader_training
        self.valid_batch_loader = batch_loader_validation
        self.model = model
        self.model_uses_streamlines = model_uses_streamlines
        self.max_epochs = max_epochs
        self.max_batches_per_epochs = max_batches_per_epoch
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.patience = patience
        self.nb_cpu_processes = nb_cpu_processes
        self.use_gpu = use_gpu

        self.comet_workspace = comet_workspace
        self.comet_project = comet_project

        # ----------------------
        # Values fixed by us
        # ----------------------

        # Device and rng value. Note that if loading from a checkpoint, the
        # complete state should be updated.
        if use_gpu:
            if torch.cuda.is_available():
                self.device = torch.device('cuda')

                # Setting the rng seed
                if (self.use_validation and self.train_batch_sampler.rng !=
                        self.valid_batch_sampler.rng):
                    raise ValueError("Training and validation batch samplers "
                                     "do not have the same rng. Please verify "
                                     "the code.")
                # If you see a hint error below, upgrade torch.
                torch.cuda.manual_seed(self.train_batch_sampler.rng)
            else:
                raise ValueError("You chose GPU (cuda) device but it is not "
                                 "available!")
        else:
            self.device = torch.device('cpu')

        # ----------------------
        # Values that will be modified later on. If initializing experiment
        # from a checkpoint, these values should be updated after
        # initialization.
        # ----------------------
        if patience:
            self.best_epoch_monitoring = BestEpochMonitoring(
                patience=self.patience)
        else:
            # We won't use early stopping to stop the epoch, but we will use
            # it as monitor of the best epochs.
            self.best_epoch_monitoring = BestEpochMonitoring(
                patience=self.max_batches_per_epochs + 1)

        self.current_epoch = 0

        # Nb of batches will be estimated later on
        self.nb_train_batches_per_epoch = None
        self.nb_valid_batches_per_epoch = None

        # RNG state
        # Nothing to do here.

        # Setup monitors
        self.train_loss_monitor = ValueHistoryMonitor("Training loss")
        self.valid_loss_monitor = ValueHistoryMonitor("Validation loss")
        self.grad_norm_monitor = ValueHistoryMonitor("Grad Norm")

        # Comet values will be instantiated in train().
        self.comet_exp = None
        self.comet_key = None

        # ----------------------
        # Launching optimizer!
        # ----------------------

        # Prepare optimizer
        # Send model to device. Reminder, contrary to tensors, model.to
        # overwrites the model.
        # NOTE: This ordering is important! The optimizer needs to use the cuda
        # Tensors if using the GPU...
        self.model.to(device=self.device)

        # Build optimizer (Optimizer is built here since it needs the model
        # parameters)
        list_params = [n for n, _ in self.model.named_parameters()]
        self.logger.debug("Initiating trainer: {}".format(type(self)))
        self.logger.debug(
            "This trainer will use Adam optimization on the "
            "following model.parameters:\n\n" + "\n".join(list_params) + "\n")
        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          lr=learning_rate,
                                          weight_decay=weight_decay)

    @property
    def params_for_checkpoint(self):
        # These are the parameters necessary to use _init_
        params = {
            'model_uses_streamlines': self.model_uses_streamlines,
            'learning_rate': self.learning_rate,
            'weight_decay': self.weight_decay,
            'max_epochs': self.max_epochs,
            'max_batches_per_epoch': self.max_batches_per_epochs,
            'patience': self.patience,
            'nb_cpu_processes': self.nb_cpu_processes,
            'use_gpu': self.use_gpu,
            'comet_workspace': self.comet_workspace,
            'comet_project': self.comet_project
        }
        return params

    @property
    def params(self) -> dict:
        params = self.params_for_checkpoint
        params.update({
            'experiments_path': self.experiments_path,
            'experiment_name': self.experiment_name,
            'comet_key': self.comet_key,
            'computed_values': {
                'nb_training_batches_per_epoch':
                self.nb_train_batches_per_epoch,
                'nb_validation_batches_per_epoch':
                self.nb_valid_batches_per_epoch
            }
        })
        return params

    def _init_comet(self):
        """
        For more information on comet, see our doc/Getting Started
        """
        try:
            if self.comet_key:
                self.comet_exp = ExistingExperiment(
                    previous_experiment=self.comet_key)
            elif self.comet_workspace:
                # New experiment
                # Use trainset name as comet project name
                project_name = self.comet_project
                self.comet_exp = CometExperiment(
                    project_name=project_name,
                    workspace=self.comet_workspace,
                    log_code=False,
                    log_graph=True,
                    auto_param_logging=True,
                    auto_metric_logging=False,
                    parse_args=False,
                    auto_output_logging='native',
                    log_env_details=True,
                    log_env_gpu=True,
                    log_env_cpu=True,
                    log_env_host=False,
                    log_git_metadata=True,
                    log_git_patch=True,
                    display_summary=False)
                self.comet_exp.set_name(self.experiment_name)
                self.comet_exp.log_parameters(self.params)
                self.comet_key = self.comet_exp.get_key()
        except ConnectionError:
            self.logger.warning(
                "Could not connect to Comet.ml, metrics will not be logged "
                "online...")
            self.comet_exp = None
            self.comet_key = None

    def estimate_nb_batches_per_epoch(self):
        """
        Please override in your child class if you have a better way to
        define the epoch sizes.

        Returns:
             (nb_training_batches_per_epoch, nb_validation_batches_per_epoch)
        """
        return self.max_batches_per_epochs, self.max_batches_per_epochs

    def train_and_validate(self, *args):
        """
        Train + validates the model (+ computes loss)

        - Starts comet,
        - Creates DataLoaders from the BatchSamplers,
        - For each epoch
            - uses _train_one_epoch and _validate_one_epoch,
            - checks for earlyStopping if the loss is bad,
            - saves the model if the loss is good.
        - Checks if allowed training time is exceeded.

        Parameters
        ----------
        All *args will be passed all the way to _train_one_epoch and
        _train_one_batch, in case you want to override them.
        """
        self.logger.debug("Trainer {}: \n"
                          "Running the model {}.\n\n".format(
                              type(self), type(self.model)))

        # If data comes from checkpoint, this is already computed
        if self.nb_train_batches_per_epoch is None:
            self.logger.info("Estimating batch sizes.")
            (self.nb_train_batches_per_epoch,
             self.nb_valid_batches_per_epoch) = \
                self.estimate_nb_batches_per_epoch()

        # Instantiate comet experiment
        # If self.comet_key is None: new experiment, will create a key
        # Else, resuming from checkpoint. Will continue with given key.
        self._init_comet()
        if self.comet_exp:
            train_context = self.comet_exp.train_and_validate
            valid_context = self.comet_exp.validate
        else:
            # Instantiating contexts doing nothing instead
            train_context = contextlib2.nullcontext
            valid_context = contextlib2.nullcontext

        # Create DataLoaders from the BatchSamplers
        #   * Pin memory if interpolation is done by workers; this means that
        #     dataloader output is on GPU, ready to be fed to the model.
        #     Otherwise, dataloader output is kept on CPU, and the main thread
        #     sends volumes and coords on GPU for interpolation.
        self.logger.debug("- Instantiating dataloaders...")

        # toDo We wouldn't need training / valid batch samplers and loaders if
        #  I knew how to add option 'training' and 'validation' to the
        #  __iter__ method or to the collate_fn (load_batch). But maybe the
        #  user wants separate options. During validation and training. Ex:
        #  less on-the-fly noise addition to the streamlines during validation?
        #  But I don't see why we wouldn't want the same batch sampler. We
        #  could have only one and use the same.copy() and change the value of
        #  the subset to training or validation.
        #  If we also don't think users want different load_batch, solution
        #  could be (for the dataloader) the collate_fn could be nothing, and
        #  we call load_data() ourselves after, with options.

        train_dataloader = DataLoader(
            self.train_batch_sampler.dataset,
            batch_sampler=self.train_batch_sampler,
            num_workers=self.nb_cpu_processes,
            collate_fn=self.train_batch_loader.load_batch,
            pin_memory=self.use_gpu)

        valid_dataloader = None
        if self.use_validation:
            valid_dataloader = DataLoader(
                self.valid_batch_sampler.dataset,
                batch_sampler=self.valid_batch_sampler,
                num_workers=self.nb_cpu_processes,
                collate_fn=self.valid_batch_loader.load_batch,
                pin_memory=self.use_gpu)

        # Instantiating our IterTimer.
        # After each iteration, checks that the maximum allowed time has not
        # been reached.
        iter_timer = IterTimer(history_len=20)

        # Start from current_epoch in case the experiment is resuming
        # Train each epoch
        for epoch in iter_timer(range(self.current_epoch, self.max_epochs)):
            # Updating current epoch. First epoch is 0!
            self.current_epoch = epoch

            # Training
            self.logger.info(
                "**********TRAINING: Epoch #{}*************".format(epoch))
            self.train_one_epoch(train_dataloader, train_context, epoch)

            # Validation
            if self.use_validation:
                self.logger.info(
                    "**********VALIDATION: Epoch #{}*************".format(
                        epoch))
                self.validate_one_epoch(valid_dataloader, valid_context, epoch,
                                        *args)

                last_loss = self.valid_loss_monitor.epochs_means_history[-1]
            else:
                last_loss = self.train_loss_monitor.epochs_means_history[-1]

            # Updating info
            self.best_epoch_monitoring.update(last_loss, epoch)

            # Check for early stopping
            if self.best_epoch_monitoring.is_patience_reached:
                self.save_checkpoint()
                raise EarlyStoppingError(
                    "Early stopping! Loss has not improved after {} epochs!\n"
                    "Best result: {}; At epoch #{}".format(
                        self.patience, self.best_epoch_monitoring.best_value,
                        self.best_epoch_monitoring.best_epoch))

            # Otherwise, check whether a new best has been reached.
            # If so, the monitor has just reset its n_bad_epochs to 0.
            if self.best_epoch_monitoring.n_bad_epochs == 0:
                self.logger.info("Best epoch yet! Saving model and loss "
                                 "history.")

                # Save model
                self.model.update_best_model()
                self.model.save(self.saving_path)

                # Save losses (i.e. mean over all batches)
                losses = {
                    'train_loss':
                    self.train_loss_monitor.epochs_means_history[
                        self.best_epoch_monitoring.best_epoch],
                    'valid_loss':
                    self.best_epoch_monitoring.best_value
                    if self.use_validation else None
                }
                with open(os.path.join(self.saving_path, "losses.json"),
                          'w') as json_file:
                    json_file.write(
                        json.dumps(losses, indent=4, separators=(',', ': ')))

                # Save information online
                if self.comet_exp:
                    self.comet_exp.log_metric(
                        "best_loss", self.best_epoch_monitoring.best_value)
                    self.comet_exp.log_metric(
                        "best_epoch", self.best_epoch_monitoring.best_epoch)

            # End of epoch, save checkpoint for resuming later
            self.save_checkpoint()

    def save_model(self):
        self.model.save(self.saving_path)

    def train_one_epoch(self, train_dataloader, train_context, epoch):
        """
        Train one epoch of the model: loop on all batches.

        All *args will be passed to run_one_batch, which you should
        implement, in case you need some variables.
        """
        # Make sure there are no existing HDF handles if using parallel workers
        if (self.nb_cpu_processes > 0
                and self.train_batch_sampler.dataset.is_lazy):
            self.train_batch_sampler.dataset.hdf_handle = None
            self.train_batch_sampler.dataset.volume_cache_manager = None

        if self.comet_exp:
            self.comet_exp.log_metric("current_epoch", self.current_epoch)

        # Improving loggers for tqdm
        make_logger_tqdm_fitted(self.logger)
        make_logger_tqdm_fitted(self.model.logger)
        make_logger_tqdm_fitted(self.train_batch_sampler.logger)
        make_logger_tqdm_fitted(self.train_batch_loader.logger)
        if self.valid_batch_sampler:
            make_logger_tqdm_fitted(self.valid_batch_sampler.logger)
            make_logger_tqdm_fitted(self.valid_batch_loader.logger)

        # Training all batches
        self.logger.debug("Training one epoch: iterating on batches using "
                          "tqdm on the dataloader...")
        with tqdm(train_dataloader,
                  ncols=100,
                  total=self.nb_train_batches_per_epoch) as pbar:
            train_iterator = enumerate(pbar)
            with train_context():
                for batch_id, data in train_iterator:
                    # Break if the maximum number of batches per epoch has
                    # been reached
                    if batch_id == self.nb_train_batches_per_epoch:
                        # Explicitly close tqdm's progress bar to fix possible
                        # bugs when breaking the loop
                        pbar.close()
                        break

                    mean_loss, grad_norm = self.run_one_batch(
                        data,
                        is_training=True,
                        batch_loader=self.train_batch_loader)

                    self.logger.debug("Updated loss: {}".format(mean_loss))
                    self.train_loss_monitor.update(mean_loss)
                    self.grad_norm_monitor.update(grad_norm)

                    # Update logged information every 10 batches
                    if not self.use_validation and batch_id % 10 == 0:
                        self._update_logs(batch_id, mean_loss)

            # Explicitly delete iterator to kill threads and free memory before
            # running validation
            del train_iterator

        # Making loggers normal
        make_logger_normal(self.logger)
        make_logger_normal(self.model.logger)
        make_logger_normal(self.train_batch_sampler.logger)
        make_logger_normal(self.train_batch_loader.logger)
        if self.valid_batch_sampler:
            make_logger_normal(self.valid_batch_sampler.logger)
            make_logger_normal(self.valid_batch_loader.logger)

        # Saving epoch's information
        self.logger.info("Finishing epoch...")
        self.train_loss_monitor.end_epoch()
        self.grad_norm_monitor.end_epoch()
        self._save_log_from_array(self.train_loss_monitor.epochs_means_history,
                                  "train_loss.npy")
        self._save_log_from_array(self.grad_norm_monitor.epochs_means_history,
                                  "gradient_norm.npy")
        with train_context():
            if self.comet_exp:
                self.comet_exp.log_metric(
                    "gradient_norm_epoch",
                    self.grad_norm_monitor.epochs_means_history[-1],
                    step=epoch)
                self.comet_exp.log_metric(
                    "loss_epoch",
                    self.train_loss_monitor.epochs_means_history[-1],
                    step=epoch)

        self.logger.info("Mean gradient norm : {}".format(
            self.grad_norm_monitor.epochs_means_history[-1]))
        self.logger.info("Mean training loss : {}".format(
            self.train_loss_monitor.epochs_means_history[-1]))

    def validate_one_epoch(self, valid_dataloader, valid_context, epoch,
                           *args):
        """
        Validate one epoch of the model: loop on all batches.

        All *args will be passed to run_one_batch, which you should
        implement, in case you need some variables.
        """
        self.logger.debug('Unused args in validate: {}'.format(args))

        # Make sure there are no existing HDF handles if using parallel workers
        if (self.nb_cpu_processes > 0
                and self.valid_batch_sampler.dataset.is_lazy):
            self.valid_batch_sampler.dataset.hdf_handle = None
            self.valid_batch_sampler.dataset.volume_cache_manager = None

        # Validate all batches
        with tqdm(valid_dataloader,
                  ncols=100,
                  total=self.nb_valid_batches_per_epoch) as pbar:
            valid_iterator = enumerate(pbar)
            for batch_id, data in valid_iterator:
                # Break if the maximum number of batches per epoch has
                # been reached
                if batch_id == self.nb_valid_batches_per_epoch:
                    # Explicitly close tqdm's progress bar to fix possible bugs
                    # when breaking the loop
                    pbar.close()
                    break

                # Validate this batch: forward propagation + loss
                mean_loss, _ = self.run_one_batch(
                    data,
                    is_training=False,
                    batch_loader=self.valid_batch_loader)
                self.valid_loss_monitor.update(mean_loss)

                # Update logged information every 10 batches
                if batch_id % 10 == 0:
                    self._update_logs(batch_id, mean_loss)

            # Explicitly delete iterator to kill threads and free memory before
            # running training again
            del valid_iterator

        # Save this epoch's information
        self.valid_loss_monitor.end_epoch()
        self._save_log_from_array(self.valid_loss_monitor.epochs_means_history,
                                  "valid_loss.npy")
        with valid_context():
            if self.comet_exp:
                self.comet_exp.log_metric(
                    "loss_epoch",
                    self.valid_loss_monitor.epochs_means_history[-1],
                    step=epoch)
        self.logger.info("Validation loss : {}".format(
            self.valid_loss_monitor.epochs_means_history[-1]))

    def _update_logs(self, batch_id, mean_loss):
        if self.comet_exp:
            self.comet_exp.log_metric("loss_step", mean_loss, step=batch_id)
            self.comet_exp.log_metric(
                "gradient_norm_step",
                self.grad_norm_monitor.current_epoch_history[-1],
                step=batch_id)

    def run_one_batch(self, data, is_training: bool, batch_loader):
        """
        Run a batch of data through the model (calling its forward method)
        and return the mean loss. If training, run the backward method too.

        If the sampler was instantiated with wait_for_gpu, the inputs have not
        been computed yet and must be computed here. (A commented sketch of a
        possible implementation follows the method.)

        Parameters
        ----------
        data : tuple of (List, dict)
            This is the output of the AbstractBatchLoader's load_batch()
            method. If wait_for_gpu, data is
            (batch_streamlines, final_streamline_ids_per_subj). Else, data is
            (batch_streamlines, final_streamline_ids_per_subj, inputs).
        is_training : bool
            If True, record the computation graph and backprop through the
            model parameters.
        batch_loader: AbstractBatchLoader
            Either self.train_batch_loader or self.valid_batch_loader,
            depending on the case.
        Returns
        -------
        mean_loss : float
            The mean loss of the provided batch
        grad_norm: float
            The total gradient norm (sqrt(sum(grad**2)) over all parameters)
            before gradient clipping, if any.
        """
        raise NotImplementedError
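    # A hedged, commented sketch of a possible child-class implementation
    # (the unpacking of 'data', the model call and compute_gradient_norm are
    # illustrative assumptions, not the actual API):
    #
    #     def run_one_batch(self, data, is_training, batch_loader):
    #         batch_streamlines, ids_per_subj, inputs = data
    #         with torch.set_grad_enabled(is_training):
    #             model_outputs = self.model(inputs)
    #             mean_loss = self.compute_loss(model_outputs,
    #                                           batch_streamlines)
    #             grad_norm = 0.0
    #             if is_training:
    #                 self.optimizer.zero_grad()
    #                 mean_loss.backward()
    #                 grad_norm = compute_gradient_norm(
    #                     self.model.parameters())  # hypothetical helper
    #                 self.fix_parameters()         # e.g. gradient clipping
    #                 self.optimizer.step()
    #         return mean_loss.item(), grad_norm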

    def compute_loss(self, model_outputs, targets):
        """
        Calls the compute_loss method of the model. Reimplement in a child
        class if targets need to be formatted in any way before the call
        (see the commented sketch below).
        """
        mean_loss = self.model.compute_loss(model_outputs, targets)
        return mean_loss
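    # A hedged sketch of a child-class override that reformats targets before
    # delegating to the model (the concatenation is purely illustrative):
    #
    #     def compute_loss(self, model_outputs, targets):
    #         targets = torch.cat(targets, dim=0)  # e.g. flatten streamlines
    #         return self.model.compute_loss(model_outputs, targets)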

    def fix_parameters(self):
        """
        This function is called during training, after the forward and
        backward passes, but before updating the parameters through the
        optimizer. Users may define their own logic here if some modification
        of the parameters is necessary.
        Ex: in the case of vanishing or exploding gradients, this is the place
        to fix the parameters based on their gradients (see the commented
        sketch below).
        """
        pass
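    # A minimal sketch of an override, assuming standard PyTorch gradient
    # clipping is the desired fix:
    #
    #     def fix_parameters(self):
    #         torch.nn.utils.clip_grad_norm_(self.model.parameters(),
    #                                        max_norm=1.0)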

    @classmethod
    def init_from_checkpoint(cls, model: MainModelAbstract, experiments_path,
                             experiment_name,
                             train_batch_sampler: DWIMLBatchSampler,
                             train_batch_loader: AbstractBatchLoader,
                             valid_batch_sampler: Union[DWIMLBatchSampler,
                                                        None],
                             valid_batch_loader: Union[AbstractBatchLoader,
                                                       None],
                             checkpoint_state: dict, new_patience,
                             new_max_epochs):
        """
        During save_checkpoint(), checkpoint_state.pkl is saved. Loading it
        back gives a dict that can be used to instantiate an experiment and
        restore it to its previous state (current_epoch is incremented by 1).

        Hint: If you want to use this in your child class (see also the
        commented sketch below), use:
        experiment, checkpoint_state = super(cls, cls).init_from_checkpoint(...
        """
        trainer = cls(model,
                      experiments_path,
                      experiment_name,
                      batch_sampler_training=train_batch_sampler,
                      batch_loader_training=train_batch_loader,
                      batch_sampler_validation=valid_batch_sampler,
                      batch_loader_validation=valid_batch_loader,
                      from_checkpoint=True,
                      **checkpoint_state['params_for_init'])

        current_states = checkpoint_state['current_states']

        # Overriding values
        if new_patience:
            trainer.patience = new_patience
        if new_max_epochs:
            trainer.max_epochs = new_max_epochs

        # Set RNG states
        torch.set_rng_state(current_states['torch_rng_state'])
        trainer.train_batch_sampler.np_rng.set_state(
            current_states['numpy_rng_state'])
        if trainer.use_validation:
            trainer.valid_batch_sampler.np_rng.set_state(
                current_states['numpy_rng_state'])
        if trainer.use_gpu:
            torch.cuda.set_rng_state(current_states['torch_cuda_state'])

        # Set other objects
        trainer.comet_key = current_states['comet_key']
        trainer.current_epoch = current_states['current_epoch'] + 1
        trainer.nb_train_batches_per_epoch = \
            current_states['nb_train_batches_per_epoch']
        trainer.nb_valid_batches_per_epoch = \
            current_states['nb_valid_batches_per_epoch']
        trainer.best_epoch_monitoring.set_state(
            current_states['best_epoch_monitoring_state'])
        trainer.train_loss_monitor.set_state(
            current_states['train_loss_monitor_state'])
        trainer.valid_loss_monitor.set_state(
            current_states['valid_loss_monitor_state'])
        trainer.grad_norm_monitor.set_state(
            current_states['grad_norm_monitor_state'])
        trainer.optimizer.load_state_dict(current_states['optimizer_state'])

        logger.info(
            "Resuming from checkpoint! Next epoch will be epoch #{}".format(
                trainer.current_epoch))

        return trainer

    def save_checkpoint(self):
        """
        Save an experiment checkpoint that can be resumed from.
        """
        self.logger.info("Saving checkpoint...")

        # Make checkpoint directory
        checkpoint_dir = os.path.join(self.saving_path, "checkpoint")

        # Backup old checkpoint before saving, and erase it afterwards
        to_remove = None
        if os.path.exists(checkpoint_dir):
            to_remove = os.path.join(self.saving_path, "checkpoint_old")
            shutil.move(checkpoint_dir, to_remove)

        os.mkdir(checkpoint_dir)

        # Save experiment
        # Separated function to be re-implemented by child classes to fit your
        # needs. Below is one working example.
        checkpoint_state = self._prepare_checkpoint_state()
        torch.save(checkpoint_state,
                   os.path.join(checkpoint_dir, "checkpoint_state.pkl"))

        # Save model inside the checkpoint dir
        self.model.save(checkpoint_dir)

        if to_remove:
            shutil.rmtree(to_remove)

    def _prepare_checkpoint_state(self) -> dict:
        # These are parameters that should be updated after instantiating cls.
        current_states = {
            'comet_key':
            self.comet_key,
            'current_epoch':
            self.current_epoch,
            'nb_train_batches_per_epoch':
            self.nb_train_batches_per_epoch,
            'nb_valid_batches_per_epoch':
            self.nb_valid_batches_per_epoch,
            'torch_rng_state':
            torch.random.get_rng_state(),
            'torch_cuda_state':
            torch.cuda.get_rng_state() if self.use_gpu else None,
            'numpy_rng_state':
            self.train_batch_sampler.np_rng.get_state(),
            'best_epoch_monitoring_state':
            self.best_epoch_monitoring.get_state()
            if self.best_epoch_monitoring else None,
            'train_loss_monitor_state':
            self.train_loss_monitor.get_state(),
            'valid_loss_monitor_state':
            self.valid_loss_monitor.get_state(),
            'grad_norm_monitor_state':
            self.grad_norm_monitor.get_state(),
            'optimizer_state':
            self.optimizer.state_dict(),
        }

        # Additional params are the parameters necessary to load data, batch
        # samplers/loaders (see the example script dwiml_train_model.py).
        # Note that the training set and validation set attributes should be
        # the same in theory. #toDo to be checked?
        checkpoint_state = {
            'train_sampler_params': self.train_batch_sampler.params,
            'valid_sampler_params': None,
            'train_data_params': self.train_batch_sampler.dataset.params,
            'valid_data_params': None,
            'train_loader_params': self.train_batch_loader.params,
            'valid_loader_params': None,
            'params_for_init': self.params_for_checkpoint,
            'current_states': current_states
        }

        if self.use_validation:
            checkpoint_state.update({
                'valid_sampler_params':
                self.valid_batch_sampler.params,
                'valid_data_params':
                self.valid_batch_sampler.dataset.params,
                'valid_loader_params':
                self.valid_batch_loader.params
            })

        return checkpoint_state

    def _save_log_from_array(self, array: np.ndarray, fname: str):
        log_dir = os.path.join(self.saving_path, "logs")
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)
        fpath = os.path.join(log_dir, fname)
        np.save(fpath, array)

    @staticmethod
    def load_params_from_checkpoint(experiments_path: str,
                                    experiment_name: str):
        total_path = os.path.join(experiments_path, experiment_name,
                                  "checkpoint", "checkpoint_state.pkl")
        if not os.path.isfile(total_path):
            raise FileNotFoundError(
                'Checkpoint was not found! ({})'.format(total_path))
        checkpoint_state = torch.load(total_path)

        return checkpoint_state

    @staticmethod
    def check_stopping_cause(checkpoint_state,
                             new_patience=None,
                             new_max_epochs=None):

        # 1. Check if early stopping had been triggered.
        best_monitoring_state = \
            checkpoint_state['current_states']['best_epoch_monitoring_state']
        bad_epochs = best_monitoring_state['n_bad_epochs']
        if new_patience is None:
            # No new patience: checking if early stopping had been triggered.
            if bad_epochs >= best_monitoring_state['patience']:
                raise EarlyStoppingError(
                    "Resumed experiment was stopped because of early "
                    "stopping, increase patience in order to resume training!")
        elif bad_epochs >= new_patience:
            # New patience: checking if will be able to continue
            raise EarlyStoppingError(
                "In resumed experiment, we had reach {} bad epochs (i.e. with "
                "no improvement). You have now overriden patience to {} but "
                "that won't be enough. Please increase that value in "
                "order to resume training.".format(
                    best_monitoring_state['n_bad_epochs'], new_patience))

        # 2. Checking that max_epochs had not been reached.
        current_epoch = checkpoint_state['current_states']['current_epoch']
        if new_max_epochs is None:
            if current_epoch == \
                    checkpoint_state['params_for_init']['max_epochs'] - 1:
                raise ValueError(
                    "Resumed experiment had stopped after reaching the "
                    "maximum number of epochs allowed (max_epochs = {}). "
                    "Please increase that value in order to resume training.".
                    format(checkpoint_state['params_for_init']['max_epochs']))
        else:
            if current_epoch > new_max_epochs:
                raise ValueError(
                    "In resumed experiment, we had performed {} epochs). You "
                    "have now overriden max_epoch to {} but that won't be "
                    "enough. Please increase that value in order to resume "
                    "training.".format(current_epoch, new_max_epochs))
Ejemplo n.º 19
0
"""Usage:
    GMPS_PATH=/home/gberseth/playground/GMPS MULTIWORLD_PATH=/home/gberseth/playground/multiworld/ python3 functional_scripts/seq_train.py
"""

import sys
import os

GMPS_PATH = os.environ['GMPS_PATH']
MULTIWORL_PATH = os.environ['MULTIWORLD_PATH']
from comet_ml import Experiment

comet_logger = Experiment(api_key="KWwx7zh6I2uw6oQMkpEo3smu0",
                          project_name="ml4l3",
                          workspace="glenb")
comet_logger.set_name("test seq train with vpg")

print(comet_logger.get_key())

# comet_logger.end()

import tensorflow as tf
from functional_scripts.remote_train import experiment as train_experiment
from functional_scripts.local_test_ppo import experiment as rl_experiment

path_to_gmps = GMPS_PATH
test_dir = path_to_gmps + '/seq_test/'
meta_log_dir = test_dir + '/meta_data/'
EXPERT_DATA_LOC = test_dir + '/seq_expert_traj/'


def train_seq(meta_variant, rl_variant, comet_logger=comet_logger):
    from multiprocessing import Process
Ejemplo n.º 20
0
def main(args=None):
    # parse arguments
    if args is None:
        args = sys.argv[1:]
    args = parse_args(args)

    configs = configparser.ConfigParser()
    if args.config is not None:
        configs = read_config_file(args.config)

    if args.comet_api_key is not None:
        comet_experiment = Experiment(api_key=args.comet_api_key,
                                      project_name=args.comet_project_name, workspace=args.comet_workspace)
        comet_experiment.add_tag(args.experiment_tag)
        comet_experiment.set_name(args.experiment_tag)
        # get the experiment key from comet and replace the one passed through the arguments
        args.experiment_key = comet_experiment.get_key()

        args_dict = vars(args)
        for arg_key, arg_val in args_dict.items():
            if isinstance(arg_val, argparse.Namespace):
                comet_experiment.log_parameters(vars(arg_val),arg_key)
            else:
                comet_experiment.log_parameter(arg_key, arg_val)
        # store the transformer configuration (logged once, outside the loop)
        arg_key = 'init'
        comet_experiment.log_parameters(configs._sections['init'], arg_key)

    snapshot_path = helper.make_dir(os.path.join(args.snapshot_path, args.experiment_key))
    result_path = helper.make_dir(os.path.join(args.log_path, args.experiment_key))
    mfile = snapshot_path + 'transformer.h5'

    # store the args and configs
    helper.store_settings(store_object=args, json_file=result_path + 'script_arguments.args')
    write_config_file(configs, result_path + 'config.ini')

    train_generator = CSVGenerator(args.annotations, batch_size=args.batch_size, tokens_file=args.vocab,
                                   i_embedding_matrix_file=args.i_embedding_matrix, o_embedding_matrix_file=args.o_embedding_matrix,
                                   sequence_max_length=int(configs['init']['len_limit']))
    i_tokens = train_generator.i_tokens
    o_tokens = train_generator.o_tokens
    i_embedding_matrix = train_generator.i_embedding_matrix
    o_embedding_matrix = train_generator.o_embedding_matrix

    if args.val_annotations:
        validation_generator = CSVGenerator(args.val_annotations, batch_size=args.batch_size, i_tokens=i_tokens,
                                            o_tokens=o_tokens, sequence_max_length=int(configs['init']['len_limit']))
        val_size = validation_generator.size()
    else:
        validation_generator = None
        val_size = None

    if args.steps is not None:
        train_size = args.steps
    else:
        train_size = train_generator.size()

    print('seq 1 words:', i_tokens.num())
    print('seq 2 words:', o_tokens.num())

    s2s = Transformer(i_tokens, o_tokens, i_embedding_matrix=i_embedding_matrix, o_embedding_matrix=o_embedding_matrix,
                      **configs['init'])
    training_model = transformer(transformer_structure=s2s, inputs=None)
    lr_scheduler = LRSchedulerPerStep(configs['init']['d_model'], 4000)

    training_model.compile(
        metrics={'transformer_classification': metrics.masked_accuracy(layer_size=int(configs['init']['len_limit']))},
        loss={'transformer_classification': losses.masked_ce(layer_size=int(configs['init']['len_limit']))},
        optimizer=deserialize({'class_name': configs['optimizer']['class_name'],
                               'config':eval(configs['optimizer']['config'])}))

    model_saver = ModelCheckpoint(mfile, save_best_only=True, save_weights_only=True)
    csv_logger = CSVLogger(result_path + 'results.csv', append=True)

    training_model.summary()
    plot_model(training_model, to_file=snapshot_path + 'architecture.png', show_shapes=True, show_layer_names=True)

    try:
        training_model.load_weights(mfile)
    except:
        print('\n\nnew model')

    training_model.fit_generator(train_generator, epochs=args.epochs, shuffle=False, steps_per_epoch=train_size,
                                 callbacks=[lr_scheduler, model_saver, csv_logger],
                                 validation_data=validation_generator, validation_steps=val_size)
Ejemplo n.º 21
0
def main(args):
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    print('Loading data')
    data = np.load(args.boards_file, allow_pickle=True)
    idxs = data['idxs']
    labels = data['values'] 
    mask = labels != None
    idxs = idxs[mask]
    labels = labels[mask]
    n = len(idxs)

    if args.shuffle:
        perm = np.random.permutation(n)
        idxs = idxs[perm]
        labels = labels[perm]

    if args.experiment is None:
        experiment = Experiment(project_name="chess-axia")
        experiment.log_parameters(vars(args))
    else:
        experiment = ExistingExperiment(previous_experiment=args.experiment)
    key = experiment.get_key()

    print(f'Number of Boards: {n}')

    if torch.cuda.is_available() and args.num_gpus > 0:
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')

    if args.num_train is None:
        args.num_train = n - args.num_test
    if args.num_train + args.num_test > n:
        raise ValueError('num-train and num-test sum to more than dataset size')
    train_idxs = idxs[:args.num_train]
    test_idxs = idxs[-args.num_test:]

    train_labels = labels[:args.num_train]
    test_labels = labels[-args.num_test:]
    #print(f'Win percentage: {sum(train_labels)/ len(train_labels):.1%}')
    print('Train size: ' + str(len(train_labels)))

    train_loader = DataLoader(BoardAndPieces(train_idxs, train_labels),
                              batch_size=args.batch_size, collate_fn=collate_fn,
                              shuffle=True)
    test_loader = DataLoader(BoardAndPieces(test_idxs, test_labels),
                             batch_size=args.batch_size, collate_fn=collate_fn)

    ae = AutoEncoder().to(device)
    ae_file = append_to_modelname(args.ae_model, args.ae_iter)
    ae.load_state_dict(torch.load(ae_file))

    model = BoardValuator(ae).to(device)
    loss_fn = model.loss_fn
    model = DataParallel(model)
    if args.model_loadname:
        model.load_state_dict(torch.load(args.model_loadname))

    if args.ae_freeze:
        print('Freezing AE model')
        for param in ae.parameters():
            param.requires_grad = False

    if torch.cuda.device_count() > 1 and args.num_gpus > 1:
        model = torch.nn.DataParallel(model)

    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    cum_acc = cum_loss = count = 0
    total_iters = args.init_iter

    for epoch in range(args.init_epoch, args.epochs):
        print(f'Running epoch {epoch} / {args.epochs}\n')
        #for batch_idx, (input, mask, label) in tqdm(enumerate(train_loader),
        #                             total=len(train_loader)):
        for batch_idx, (input, mask, label) in enumerate(train_loader):

            model.train()

            input = to(input, device)
            mask = to(mask, device)
            label = to(label, device)

            optimizer.zero_grad()
            output = model(input, mask)
            loss = loss_fn(output, label)
            loss.backward()
            optimizer.step()

            cum_loss += loss.item()
            # cum_acc += acc.item()
            count += 1

            if total_iters % args.log_interval == 0:
                tqdm.write(f'Epoch: {epoch}\t Iter: {total_iters:>6}\t Loss: {loss.item():.5f}')
                # experiment.log_metric('accuracy', cum_acc / count,
                #                       step=total_iters)
                experiment.log_metric('loss', cum_loss / count,
                                      step=total_iters)
                experiment.log_metric('loss_', cum_loss / count,
                                      step=total_iters)
                #cum_acc = cum_loss = count = 0

            if total_iters % args.save_interval == 0:
                path = get_modelpath(args.model_dirname, key,
                                     args.model_savename, iter=total_iters,
                                     epoch=epoch)
                dirname = os.path.dirname(path)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                torch.save(model.state_dict(), path)

            if total_iters % args.eval_interval == 0 and total_iters != 0:
                loss = eval_loss(model, test_loader, device, loss_fn)
                tqdm.write(f'\tTEST: Loss: {loss:.5f}')
                #experiment.log_metric('test accuracy', acc, step=total_iters,
                #                      epoch=epoch)
                experiment.log_metric('test loss', loss, step=total_iters,
                                      epoch=epoch)
            total_iters += 1
def main(datafile='./data/train_.pt',
         epochs=1000,
         learning_rate=1e-3,
         dim_out=10,
         device='cuda:0',
         project_name='em_showers_net_training',
         work_space='schattengenie',
         graph_embedder='GraphNN_KNN_v2',
         edge_classifier='EdgeClassifier_v1',
         patience=10):

    experiment = Experiment(project_name=project_name, workspace=work_space)

    early_stopping = EarlyStopping_(patience=patience, verbose=True)

    device = torch.device(device)
    showers = preprocess_dataset(datafile)
    showers_train, showers_test = train_test_split(showers, random_state=1337)

    train_loader = DataLoader(showers_train, batch_size=1, shuffle=True)
    test_loader = DataLoader(showers_test, batch_size=1, shuffle=True)

    k = showers[0].x.shape[1]
    print(k)
    graph_embedder = str_to_class(graph_embedder)(dim_out=dim_out,
                                                  k=k).to(device)
    edge_classifier = str_to_class(edge_classifier)(dim_out=dim_out).to(device)

    criterion = FocalLoss(gamma=2.)
    optimizer = torch.optim.Adam(list(graph_embedder.parameters()) +
                                 list(edge_classifier.parameters()),
                                 lr=learning_rate)

    loss_train = RunningAverageMeter()
    loss_test = RunningAverageMeter()
    roc_auc_test = RunningAverageMeter()
    pr_auc_test = RunningAverageMeter()
    acc_test = RunningAverageMeter()
    class_disbalance = RunningAverageMeter()

    for _ in tqdm(range(epochs)):
        for shower in train_loader:
            shower = shower.to(device)
            edge_labels_true, edge_labels_predicted = predict_one_shower(
                shower,
                graph_embedder=graph_embedder,
                edge_classifier=edge_classifier)
            # calculate the batch loss
            loss = criterion(edge_labels_predicted, edge_labels_true.float())
            # Zero gradients, perform a backward pass, and update the weights.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_train.update(loss.item())
            class_disbalance.update((edge_labels_true.sum().float() /
                                     len(edge_labels_true)).item())

        y_true_list = deque()
        y_pred_list = deque()
        for shower in test_loader:
            shower = shower.to(device)
            edge_labels_true, edge_labels_predicted = predict_one_shower(
                shower,
                graph_embedder=graph_embedder,
                edge_classifier=edge_classifier)

            # calculate the batch loss
            loss = criterion(edge_labels_predicted, edge_labels_true.float())
            y_true, y_pred = edge_labels_true.detach().cpu().numpy(
            ), edge_labels_predicted.detach().cpu().numpy()
            y_true_list.append(y_true)
            y_pred_list.append(y_pred)
            acc = accuracy_score(y_true, y_pred.round())
            roc_auc = roc_auc_score(y_true, y_pred)
            pr_auc = average_precision_score(y_true, y_pred)
            loss_test.update(loss.item())
            acc_test.update(acc)
            roc_auc_test.update(roc_auc)
            pr_auc_test.update(pr_auc)
            class_disbalance.update((edge_labels_true.sum().float() /
                                     len(edge_labels_true)).item())

        #f = plot_aucs(y_true=y_true, y_pred=y_pred)
        #experiment.log_figure("Optimization dynamic", f, overwrite=True)
        experiment_key = experiment.get_key()

        eval_loss = loss_test.val
        early_stopping(eval_loss, graph_embedder, edge_classifier,
                       experiment_key)

        ####
        if early_stopping.early_stop:
            print("Early stopping")
            break
        # TODO: save best
        #torch.save(graph_embedder.state_dict(), "graph_embedder_{}.pt".format(experiment_key))
        #torch.save(edge_classifier.state_dict(), "edge_classifier_{}.pt".format(experiment_key))

        experiment.log_metric('loss_test', loss_test.val)
        experiment.log_metric('acc_test', acc_test.val)
        experiment.log_metric('roc_auc_test', roc_auc_test.val)
        experiment.log_metric('pr_auc_test', pr_auc_test.val)
        experiment.log_metric('class_disbalance', class_disbalance.val)

        y_true = np.concatenate(y_true_list)
        y_pred = np.concatenate(y_pred_list)

    # load the last checkpoint with the best model
    graph_embedder.load_state_dict(
        torch.load("graph_embedder_{}.pt".format(experiment_key)))
    edge_classifier.load_state_dict(
        torch.load("edge_classifier_{}.pt".format(experiment_key)))
    return graph_embedder, edge_classifier


tr_step = 0
val_step = 0
for i in range(hparams['n_epochs']):
    res_dic = {}
    histograms_dic = {}
    for loss_name in all_losses:
        res_dic[loss_name] = {'mean': 0., 'std': 0., 'acc': []}
        res_dic[loss_name+'i'] = {'mean': 0., 'std': 0., 'acc': []}
    for hist_name in histogram_names:
        histograms_dic[hist_name] = []
        histograms_dic[hist_name+'i'] = []
    print("Higher Order Sudo-RM-RF: {} - {} || Epoch: {}/{}".format(
        experiment.get_key(), experiment.get_tags(), i+1, hparams['n_epochs']))
    model.train()

    for data in tqdm(generators['train'], desc='Training'):
        opt.zero_grad()
        #m1wavs = data[0].cuda()
        clean_wavs = data[-1].cuda()

        if hparams['max_abs_snr'] > 0.:
            clean_wavs = mix_with_random_snr(clean_wavs, hparams['max_abs_snr'])

        histograms_dic['tr_input_snr'] += (10. * torch.log10(
            (clean_wavs[:, 0] ** 2).sum(-1) / (1e-8 + (
                    clean_wavs[:, 1] ** 2).sum(-1)))).tolist()

        # # Online mixing over samples of the batch. (This might cause to get
Ejemplo n.º 24
0
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='Cifar10 Example')
    parser.add_argument('--batch-size', type=int, default=128, metavar='N', help='input batch size for training (default: 128)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=25, metavar='N', help='number of epochs to train (default: 25)')
    parser.add_argument('--lr', type=float, default=0.1, metavar='LR', help='learning rate (default: 0.1)')
    parser.add_argument('--momentum', type=float, default=0.9, metavar='M', help='SGD momentum (default: 0.9)')
    parser.add_argument('--model-path', type=str, default='', metavar='M', help='model param path')
    parser.add_argument('--loss-type', type=str, default='CE', metavar='L', help='B or CE or F or ICF_CE or ICF_F or CB_CE or CB_F')
    parser.add_argument('--beta', type=float, default=0.999, metavar='B', help='Beta for ClassBalancedLoss')
    parser.add_argument('--gamma', type=float, default=2.0, metavar='G', help='Gamma for FocalLoss')
    parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N', help='how many batches to wait before logging training status')
    parser.add_argument('--balanced-data', action='store_true', default=False, help='For sampling rate. Default is Imbalanced-data.')
    parser.add_argument('--save-model', action='store_true', default=False, help='For Saving the current Model')
    args = parser.parse_args()

    # Add the following code anywhere in your machine learning file
    experiment = Experiment(api_key="5Yl3Rxz9S3E0PUKQTBpA0QJPi", project_name="imbalanced-cifar-10", workspace="tancoro")

    # Open the experiment page in the browser
    # experiment.display(clear=True, wait=True, new=0, autoraise=True)
    # Get the experiment key (a key that uniquely identifies the experiment)
    exp_key = experiment.get_key()
    print('KEY: ' + exp_key)
    # Log the hyperparameters
    hyper_params = {
        'batch_size': args.batch_size,
        'epoch': args.epochs,
        'learning_rate': args.lr,
        'sgd_momentum' : args.momentum,
        'model_path' : args.model_path,
        'loss_type' : args.loss_type,
        'beta' : args.beta,
        'gamma' : args.gamma,
        'torch_manual_seed': args.seed,
        'balanced_data' : args.balanced_data
    }
    experiment.log_parameters(hyper_params)

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    print('use_cuda {}'.format(use_cuda))

    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    # train dataset
    cifar10_train_dataset = datasets.CIFAR10('./data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.RandomCrop(32, padding=4),
                           transforms.RandomHorizontalFlip(),
                           transforms.ToTensor(),
                           transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
                       ]))
    # train sampling rate
    sampling_rate = {}
    if not args.balanced_data:
        sampling_rate = {1:0.05, 4:0.05, 6:0.05}
    print(sampling_rate)
    # train Sampler
    train_sampler = ReductionSampler(cifar10_train_dataset, sampling_rate=sampling_rate)
    # train loader
    train_loader = torch.utils.data.DataLoader(cifar10_train_dataset,
        batch_size=args.batch_size, sampler=train_sampler, **kwargs)
    # test dataset
    cifar10_test_dataset = datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
                       ]))
    # test majority loader
    test_majority_sampler = ReductionSampler(cifar10_test_dataset, sampling_rate={1:0, 4:0, 6:0})
    test_majority_loader = torch.utils.data.DataLoader(cifar10_test_dataset,
        batch_size=args.test_batch_size, sampler=test_majority_sampler, **kwargs)
    # test minority loader
    test_minority_sampler = ReductionSampler(cifar10_test_dataset, sampling_rate={0:0, 2:0, 3:0, 5:0, 7:0, 8:0, 9:0})
    test_minority_loader = torch.utils.data.DataLoader(cifar10_test_dataset,
            batch_size=args.test_batch_size, sampler=test_minority_sampler, **kwargs)
    # test alldata loader
    test_alldata_loader = torch.utils.data.DataLoader(cifar10_test_dataset, batch_size=args.test_batch_size, shuffle=True, **kwargs)

    model = ResNet18().to(device)
    # train loss
    train_loss = BasicCrossEntropyLoss()
    if args.loss_type == 'CE':
        train_loss = CrossEntropyLoss(train_sampler.get_data_count_map(), device)
    elif args.loss_type == 'F':
        train_loss = FocalLoss(train_sampler.get_data_count_map(), device, gamma=args.gamma)
    elif args.loss_type == 'ICF_CE':
        train_loss = InverseClassFrequencyCrossEntropyLoss(train_sampler.get_data_count_map(), device)
    elif args.loss_type == 'ICF_F':
        train_loss = InverseClassFrequencyFocalLoss(train_sampler.get_data_count_map(), device, gamma=args.gamma)
    elif args.loss_type == 'CB_CE':
        train_loss = ClassBalancedCrossEntropyLoss(train_sampler.get_data_count_map(), device, beta=args.beta)
    elif args.loss_type == 'CB_F':
        train_loss = ClassBalancedFocalLoss(train_sampler.get_data_count_map(), device, beta=args.beta, gamma=args.gamma)
    print('Train Loss Type: {}'.format(type(train_loss)))

    # load param
    if len(args.model_path) > 0:
        model.load_state_dict(torch.load(args.model_path))

    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=5e-4)
    # lr = 0.1 if epoch < 15
    # lr = 0.01 if 15 <= epoch < 20
    # lr = 0.001 if 20 <= epoch < 25
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[15,20], gamma=0.1)

    for epoch in range(1, args.epochs + 1):
        with experiment.train():
            experiment.log_current_epoch(epoch)
            train(args, model, device, train_loader, len(train_sampler), optimizer, epoch, experiment, lossfunc=train_loss)
        with experiment.test():
            test(args, model, device, test_minority_loader, len(test_minority_sampler), epoch, experiment, pref='minority')
            test(args, model, device, test_majority_loader, len(test_majority_sampler), epoch, experiment, pref='majority')
            test(args, model, device, test_alldata_loader, len(test_alldata_loader.dataset), epoch, experiment, pref='all')
        if (args.save_model) and (epoch % 10 == 0):
            print('saving model to ./model/cifar10_{0}_{1:04d}.pt'.format(exp_key, epoch))
            torch.save(model.state_dict(), "./model/cifar10_{0}_{1:04d}.pt".format(exp_key, epoch))
        scheduler.step()
Ejemplo n.º 25
0
def comet_lgbm(save_path):
    from comet_ml import Experiment
    exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj",
                            project_name="baseline", workspace="gdreiman1")
    exp.log_code = True
    
    import pickle
    import pandas as pd
    import lightgbm as lgb
    import numpy as np
    import sklearn
    import matplotlib.pyplot as plt
    from sklearn.metrics import precision_recall_fscore_support as prf
    #%%
    def single_roc(y_preds,y_true):
        
        from sklearn.metrics import roc_curve, auc,precision_recall_curve
        fpr, tpr, _ = roc_curve(y_true, y_preds)
        roc_auc = auc(fpr, tpr)
        plt.figure()
        lw = 2
        plt.plot(fpr, tpr, color='darkorange',
                 lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic example')
        
        precision, recall, thresholds = precision_recall_curve(y_true, y_preds)
        plt.plot(recall, precision, color='blue',
                 lw=lw, label='Precision vs Recall')
        # show the plot
        plt.legend(loc="lower right")
        plt.show()
    def multi_roc(y_preds,y_true,name,n_classes):
        import collections
        nested_dict = lambda: collections.defaultdict(nested_dict)
        data_store = nested_dict()
        from sklearn.metrics import roc_curve, auc
        from scipy import interp
        from itertools import cycle
        lw = 2
        name_store = ['Active', 'Inactive', 'Inconclusive']
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        for i in range(n_classes):
            fpr[i], tpr[i], _ = roc_curve(y_true[:, i], y_preds[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])
        
        # Compute micro-average ROC curve and ROC area
        fpr["micro"], tpr["micro"], _ = roc_curve(y_true[:, i].ravel(), y_preds[:, i].ravel())
        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
        # Compute macro-average ROC curve and ROC area
        
        # First aggregate all false positive rates
        all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
        
        # Then interpolate all ROC curves at these points
        mean_tpr = np.zeros_like(all_fpr)
        for i in range(n_classes):
            mean_tpr += interp(all_fpr, fpr[i], tpr[i])
        
        # Finally average it and compute AUC
        mean_tpr /= n_classes
        
        fpr["macro"] = all_fpr
        tpr["macro"] = mean_tpr
        roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
        
        # Plot all ROC curves
        plt.figure()
        plt.plot(fpr["micro"], tpr["micro"],
                 label='micro-average ROC curve (area = {0:0.2f})'
                       ''.format(roc_auc["micro"]),
                 color='deeppink', linestyle=':', linewidth=4)
        
        plt.plot(fpr["macro"], tpr["macro"],
                 label='macro-average ROC curve (area = {0:0.2f})'
                       ''.format(roc_auc["macro"]),
                 color='navy', linestyle=':', linewidth=4)
        
        colors = cycle(['aqua', 'darkorange', 'cornflowerblue','green'])
        for i, color in zip(range(n_classes), colors):
            plt.plot(fpr[i], tpr[i], color=color, lw=lw,
                     label='ROC curve of '+ name_store[i]+'(area = {1:0.2f})'
                     ''.format(i, roc_auc[i]))
        
        plt.plot([0, 1], [0, 1], 'k--', lw=lw)
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        #plt.title('Multi-class ROC for '+name+' Split= '+str(count+1))
        plt.title('Multi-class ROC for '+name)
    
        plt.legend(loc="lower right")
        #plt.show()
    #%%
    #save_path = r'C:\Users\gdrei\Dropbox\UCL\Thesis\May_13\AID_1345083_processed.pkl'
    model_type = 'lgbm'
    #get data cleaned
    pickle_off = open(save_path,'rb')
    activity_table=pickle.load(pickle_off)
    pickle_off.close()
    #get length of MFP
    fp_length = len(activity_table.iloc[5]['MFP'])
    
    
    from sklearn.preprocessing import StandardScaler, LabelEncoder
    scaler = StandardScaler(copy = False)
    le = LabelEncoder()
    labels = le.fit_transform(activity_table['PUBCHEM_ACTIVITY_OUTCOME'])
    #split data:
    from sklearn.model_selection import StratifiedShuffleSplit
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.5, train_size=None, random_state=2562)
    X_mfp = np.concatenate(np.array(activity_table['MFP'])).ravel()
    X_mfp = X_mfp.reshape((-1,fp_length))
    for train_ind, test_ind in splitter.split(X_mfp,labels):
        # standardize data
        X_train_molchars_std = scaler.fit_transform(np.array(activity_table.iloc[train_ind,4:]))
        X_test_molchars_std = scaler.transform(np.array(activity_table.iloc[test_ind,4:]))
        X_train = np.concatenate((X_mfp[train_ind,:],X_train_molchars_std),axis = 1)
        X_test = np.concatenate((X_mfp[test_ind,:],X_test_molchars_std),axis = 1)
        y_train = labels[train_ind]
        y_test = labels[test_ind]
        #X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X,labels,test_size = .5, shuffle = True, stratify = labels, random_state = 2562)
        bin_y_train, bin_y_test = [1 if x ==2 else x for x in y_train],[1 if x ==2 else x for x in y_test]
        
    #do light gbm
        
    #need to make a lib svm file
    train_data = lgb.Dataset(X_train,label=y_train)
    test_data = lgb.Dataset(X_test,label=y_test)
    #make model class
    lgbm_model = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, max_depth=-1, learning_rate=0.1, n_estimators=500, subsample_for_bin=200000, 
                                    objective='binary', is_unbalance=True, min_split_gain=0.0, min_child_weight=0.001, min_child_samples=20, subsample=1.0, 
                                    subsample_freq=0, colsample_bytree=1.0, reg_alpha=0.0, reg_lambda=0.0, random_state=None, n_jobs=-1, silent=True, 
                                    importance_type='split')
    #train model
    trained_mod = lgbm_model.fit(X_train,y_train)
    #predict classes and class_probs
    test_class_preds = lgbm_model.predict(X_test)
    test_prob_preds = lgbm_model.predict_proba(X_test)
    #calculate Class report
    class_rep = sklearn.metrics.classification_report(y_test,test_class_preds)
    
    print(class_rep)
    if len(set(y_test)) == 2:
        single_roc(test_prob_preds[:,1],y_test)
        prec,rec,f_1,supp = prf(y_test, test_class_preds, average=None)
    else:
        from tensorflow.keras.utils import to_categorical
        multi_roc(test_prob_preds,to_categorical(y_test),'',3)
        prec,rec,f_1,supp = prf(y_test, test_class_preds, average=None)
    
    
     #%% 
    '''Comet Saving Zone'''
    #get AID number
    import ntpath
    #get base file name
    folder,base = ntpath.split(save_path)
    #split file name at second _ assumes file save in AID_xxx_endinfo.pkl
    AID, _,end_info = base.rpartition('_')
    #save data location, AID info, and version info
    exp.log_dataset_info(name = AID, version = end_info, path = save_path)
    #save model params
    exp.log_parameters(trained_mod.get_params())
    #save metrics report to comet
    if len(f_1) == 2:
        for i,name in enumerate(['Active','Inactive']):
            exp.log_metric('f1 class '+name, f_1[i])
            exp.log_metric('Recall class'+name,rec[i])
            exp.log_metric('Precision class'+name, prec[i])
    else:
        for i,name in enumerate(['Active','Inconclusive','Inactive']):
            exp.log_metric('f1 class '+str(i), f_1[i])
            exp.log_metric('Recall class'+str(i),rec[i])
            exp.log_metric('Precision class'+str(i), prec[i])
        #exp.log_metric('f1 class '+str(i), f_1[i])
        #exp.log_metric('Recall class'+str(i),rec[i])
        #exp.log_metric('Precision class'+str(i), prec[i])
    exp.log_other('Classification Report',class_rep)
     #save model in data_folder with comet experiement number associated
    exp_num = exp.get_key()
    model_save = folder+'\\'+model_type+'_'+exp_num+'.pkl'
    pickle_on = open(model_save,'wb')
    pickle.dump(trained_mod,pickle_on)
    pickle_on.close()
    #log trained model location
    exp.log_other('Trained Model Path',model_save)
    #save some informatvie tags:
    tags = [AID,end_info,model_type]
    exp.add_tags(tags)
    #save ROC curve
    exp.log_figure(figure_name = 'ROC-Pres/Recall',figure=plt)
    plt.show()

    #tell comet that the experiement is over
    exp.end()
Ejemplo n.º 26
0
def train_cifar10(batch_size: int,
                  learning_rate: float,
                  epochs: int,
                  experiment: Experiment,
                  model: Sequential = get_model(),
                  initial_epoch: int = 0,
                  training_datagen: ImageDataGenerator = ImageDataGenerator(),
                  scheduler: Callable[[int], float] = None,
                  early_stopping_th: Optional[int] = 250,
                  data_portion: float = 1.0,
                  find_lr: bool = False) -> None:
    preprocessing_fnc = training_datagen.preprocessing_function
    name = experiment.get_key()
    log_path, model_path = get_output_paths(name)
    data = get_cifar10_data(data_portion=data_portion)

    training_datagen.fit(data.x_train)
    log_images(data.x_train, training_datagen, experiment)
    log_input_images(data.x_train, data.y_train, training_datagen, experiment)

    opt = Adam(lr=learning_rate)
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    log_model_plot(experiment, model)

    csv_cb = CSVLogger(log_path)
    keep_best_cb = KeepBest('val_acc')
    callbacks = [csv_cb,
                 keep_best_cb]  # [csv_cb, early_stopping_cb, keep_best_cb]
    if early_stopping_th is not None:
        early_stopping_cb = EarlyStopping('val_acc',
                                          patience=early_stopping_th,
                                          restore_best_weights=True,
                                          verbose=2)
        callbacks.append(early_stopping_cb)
    if scheduler is not None:
        scheduler.experiment_log(experiment=experiment,
                                 epochs=list(range(epochs)))
        callbacks.append(LearningRateScheduler(scheduler))
    if find_lr:
        lrf = LearningRateFinder(model=model)
        lrf.lrMult = (10e-1 / learning_rate)**(
            1.0 / (epochs * len(data.x_train) / batch_size))
        callbacks = [
            LambdaCallback(
                on_batch_end=lambda batch, logs: lrf.on_batch_end(batch, logs))
        ]

    model.fit_generator(training_datagen.flow(data.x_train,
                                              data.y_train,
                                              batch_size=batch_size),
                        steps_per_epoch=len(data.x_train) / batch_size,
                        epochs=epochs,
                        validation_data=(preprocessing_fnc(data.x_dev),
                                         data.y_dev),
                        shuffle=True,
                        callbacks=callbacks,
                        verbose=2,
                        initial_epoch=initial_epoch)
    model.save(model_path)
    experiment.log_asset(model_path)
    experiment.log_asset(log_path)

    if find_lr:
        experiment.log_figure('lr vs acc', lrf.plot_loss())

    log_final_metrics(experiment, model, data, preprocessing_fnc)
Ejemplo n.º 27
0
        }
        metrics['step_time'] = step_time

        # validation plotting
        progbar.add(valid_inc, [('Train Loss', metrics['train_loss']),
                                ('Validation Loss', metrics['valid_loss']),
                                ('Time (s)', step_time)])
        #Plot on Comet
        experiment.log_metrics(metrics, step=t)
        # Plot on WandB
        wandb.log(metrics, step=t)

    if (t + 1) % save_inc == 0:
        trainer.save_weights(model_path,
                             run_id=wandb.run.id,
                             experiment_key=experiment.get_key())
        if not args.gcbc and not args.images:
            z_enc, z_plan = produce_cluster_fig(next(plotting_dataset),
                                                encoder,
                                                planner,
                                                TEST_DATA_PATHS[0],
                                                num_take=dl.batch_size // 4)

            #Comet
            experiment.log_figure('z_enc', z_enc, step=t)
            experiment.log_figure('z_plan', z_plan, step=t)

            # WandB
            wandb.log({'z_enc': z_enc, 'z_plan': z_plan}, step=t)

            #latent_fig = project_enc_and_plan(ze, zp)
Ejemplo n.º 28
0
home = os.environ['HOME']

parser = argparse.ArgumentParser()
parser.add_argument('-span', default=.5, type=float)
parser.add_argument('-seed', default=1234, type=int)
parser.add_argument('-eig', action='store_true')
parser.add_argument('-ckpt', default='poison-filtnorm-weaker', type=str)
parser.add_argument('-gpu', default='0', type=str)
parser.add_argument('-svhn', action='store_true')
args = parser.parse_args()

# Comet setup: create a fresh experiment on the first run, or resume the one whose key is cached on disk
if not os.path.exists('comet_expt_key_surface.txt'):
  experiment = Experiment(api_key="vPCPPZrcrUBitgoQkvzxdsh9k", parse_args=False,
                          project_name='landscape', workspace="wronnyhuang")
  open('comet_expt_key_surface.txt', 'w+').write(experiment.get_key())
else:
  comet_key = open('comet_expt_key_surface.txt', 'r').read()
  experiment = ExistingExperiment(api_key="vPCPPZrcrUBitgoQkvzxdsh9k", previous_experiment=comet_key, parse_args=False)
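
# A hedged generalization of the create-or-resume pattern above: cache the key on the
# first run, reopen the same Comet experiment on later runs. Experiment,
# ExistingExperiment and get_key are the real comet_ml APIs; the helper name and the
# key_file argument are illustrative assumptions.
def get_or_resume_experiment(key_file, api_key, **kwargs):
  if os.path.exists(key_file):
    previous_key = open(key_file).read().strip()
    return ExistingExperiment(api_key=api_key, previous_experiment=previous_key,
                              parse_args=False, **kwargs)
  experiment = Experiment(api_key=api_key, parse_args=False, **kwargs)
  open(key_file, 'w+').write(experiment.get_key())
  return experiment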

# apply settings
np.random.seed(args.seed)
os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

# load data and model
cleanloader, _, _ = get_loader(join(home, 'datasets'), batchsize=2 * 64, fracdirty=.5, nogan=True, svhn=args.svhn)
evaluator = Evaluator(cleanloader)
evaluator.restore_weights_dropbox('ckpt/'+args.ckpt)

# plot along which direction
if args.eig:
Example #29
0
        max=50.)

    errors = torch.cat([er_00 + er_01, er_10 + er_11], 1)
    return torch.mean(torch.min(errors, 1)[0])


num_available_nodes = len(federated_generators_list)
tr_step = 0
val_step = 0
prev_epoch_val_loss = 0.
for i in range(hparams['n_global_epochs']):
    res_dic = {}
    for loss_name in all_losses:
        res_dic[loss_name] = {'mean': 0., 'std': 0., 'median': 0., 'acc': []}
    print("Individual Federated Sudo-RM-RF: {} - {} || Epoch: {}/{}".format(
        experiment.get_key(), experiment.get_tags(), i + 1,
        hparams['n_global_epochs']))

    training_nodes = federated_generators_list
    sum_global_loss = 0.

    for train_node_id, node_dic in enumerate(training_nodes):
        local_model = node_dic['local_model']
        local_model = local_model.cuda()
        local_model.train()
        local_opt = torch.optim.Adam(local_model.parameters(),
                                     lr=hparams['learning_rate'])
        if hparams['patience'] > 0:
            if tr_step % hparams['patience'] == 0:
                new_lr = (hparams['learning_rate'] /
                          (hparams['divide_lr_by']
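# The learning-rate update above is cut off mid-expression. A common step-decay form
# that matches the visible pieces (learning_rate, divide_lr_by, patience) is sketched
# below; this is an illustrative assumption, not the original code. param_groups is
# the standard way to change a torch optimizer's learning rate in place.
def apply_step_decay(optimizer, hparams, tr_step):
    new_lr = (hparams['learning_rate']
              / (hparams['divide_lr_by'] ** (tr_step // hparams['patience'])))
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr
    return new_lr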
Example #30
0
def main(args=None):
    # parse arguments
    if args is None:
        args = sys.argv[1:]
    args = parse_args(args)

    # create object that stores backbone information
    backbone = models.backbone(args.backbone)

    # make sure the installed Keras meets the minimum required version
    check_keras_version()

    # optionally choose specific GPU
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    keras.backend.tensorflow_backend.set_session(get_session())

    # optionally load config parameters
    if args.config:
        args.config = read_config_file(args.config)

    if args.comet_api_key is not None:
        comet_experiment = Experiment(api_key=args.comet_api_key,
                                      project_name=args.comet_project_name,
                                      workspace=args.comet_workspace)
        comet_experiment.add_tag(args.experiment_tag)
        comet_experiment.set_name(args.experiment_tag)
        # get the experiment key from comet and replace the one passed through the arguments
        args.experiment_key = comet_experiment.get_key()
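        # Hedged addition (not in the original snippet): logging the parsed CLI
        # arguments on the experiment makes each run reproducible from the Comet
        # dashboard; log_parameters is part of the official comet_ml API.
        comet_experiment.log_parameters(vars(args))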

    # modify the snapshot path to include the experiment key
    args.snapshot_path = make_dir(
        os.path.join(args.snapshot_path, args.experiment_key))

    # create the generators
    train_generator, validation_generator = create_generators(
        args, backbone.preprocess_image)

    # create the model
    if args.snapshot is not None:
        print('Loading model, this may take a second...')
        model = models.load_model(args.snapshot, backbone_name=args.backbone)
        training_model = model
        anchor_params = None
        if args.config and 'anchor_parameters' in args.config:
            anchor_params = parse_anchor_parameters(args.config)
        prediction_model = retinanet_bbox(model=model,
                                          anchor_params=anchor_params)
    else:
        weights = args.weights
        # default to imagenet if nothing else is specified
        if weights is None and args.imagenet_weights:
            weights = backbone.download_imagenet()

        print('Creating model, this may take a second...')
        model, training_model, prediction_model = create_models(
            backbone_retinanet=backbone.retinanet,
            num_classes=train_generator.num_classes(),
            weights=weights,
            multi_gpu=args.multi_gpu,
            freeze_backbone=args.freeze_backbone,
            config=args.config)

    # print model summary
    model.summary()  # summary() prints itself and returns None, so wrapping it in print() adds a stray "None"

    # this lets the generator compute backbone layer shapes using the actual backbone model
    if 'vgg' in args.backbone or 'densenet' in args.backbone:
        train_generator.compute_shapes = make_shapes_callback(model)
        if validation_generator:
            validation_generator.compute_shapes = train_generator.compute_shapes

    # create the callbacks
    callbacks = create_callbacks(
        model,
        training_model,
        prediction_model,
        validation_generator,
        args,
    )

    # start training
    training_model.fit_generator(
        generator=train_generator,
        steps_per_epoch=args.steps,
        epochs=args.epochs,
        verbose=1,
        callbacks=callbacks,
    )