def __init__(self,
                 paml_path,
                 load_path,
                 node_dim,
                 embedding_dim=128,
                 n_encode_layers=3):
        '''Initialize model.'''
        self.model = AttentionModel(node_dim=node_dim,
                                    embedding_dim=embedding_dim,
                                    n_encode_layers=n_encode_layers)

        print('  [*] Loading data from {}'.format(load_path))
        load_data = torch_load_cpu(load_path)
        self.model.load_state_dict(load_data['model'])
        self.model.set_decode_type('greedy')

        if paml_path is None:
            self.paml = None
        else:
            self.paml = PAMLml(paml_command=paml_path, del_tmp_dir=True)
def infer_tree_rl(model_path,
                  node_dim,
                  embedding_dim,
                  n_encode_layers,
                  testset_dir,
                  num_testset,
                  size_testset,
                  k_mer=6,
                  out_filename=''):
    model = AttentionModel(node_dim=node_dim,
                           embedding_dim=embedding_dim,
                           n_encode_layers=n_encode_layers)
    print('  [*] Loading data from {}'.format(model_path))
    load_data = torch_load_cpu(model_path)
    model.load_state_dict(load_data['model'])
    # del load_data
    model.set_decode_type('greedy')

    for i in range(num_testset):
        print('testset[{}] Inferring tree by rl ...'.format(i + 1))
        for j in range(size_testset):
            msa_file = os.path.join(testset_dir, str(i + 1),
                                    '%d_rn.fasta' % (j + 1))
            out_file = os.path.join(testset_dir, str(i + 1),
                                    '%d_rl%s.nwk' % (j + 1, out_filename))
            with open(msa_file, 'rt') as f_read:
                seq_list = [seq for _, seq in SimpleFastaParser(f_read)]
            vecs = [get_kmer(seq, k_mer=k_mer) for seq in seq_list]
            vecs_tensor = torch.tensor(vecs, dtype=torch.float).unsqueeze(0)

            _, _, circular_orders = model(vecs_tensor, return_pi=True)
            co = circular_orders.tolist()[0]

            tree = co2tree(vecs, co)
            # tree = cluster_co2tree(vecs, co)
            TreeIO.write([tree], out_file, 'newick')
Beispiel #3
0
def _run_rl(opts):

    # Pretty print the run args
    pp.pprint(vars(opts))

    # Set the random seed
    torch.manual_seed(opts.seed)

    # Optionally configure tensorboard
    tb_logger = None
    if not opts.no_tensorboard:
        tb_logger = TbLogger(
            os.path.join(opts.log_dir, "{}_{}".format(opts.problem,
                                                      opts.graph_size),
                         opts.run_name))

    os.makedirs(opts.save_dir)
    # Save arguments so exact configuration can always be found
    with open(os.path.join(opts.save_dir, "args.json"), 'w') as f:
        json.dump(vars(opts), f, indent=True)

    # Set the device
    opts.device = torch.device("cuda:0" if opts.use_cuda else "cpu")

    # Figure out what's the problem
    problem = load_problem(opts.problem)

    # Load data from load_path
    load_data = {}
    assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given"
    load_path = opts.load_path if opts.load_path is not None else opts.resume
    if load_path is not None:
        print('  [*] Loading data from {}'.format(load_path))
        load_data = torch_load_cpu(load_path)

    # Initialize model
    model_class = {
        'attention': AttentionModel,
        'pointer': PointerNetwork
    }.get(opts.model, None)
    assert model_class is not None, "Unknown model: {}".format(model_class)
    encoder_class = {
        'gat': GraphAttentionEncoder,
        'gcn': GCNEncoder,
        'mlp': MLPEncoder
    }.get(opts.encoder, None)
    assert encoder_class is not None, "Unknown encoder: {}".format(
        encoder_class)
    model = model_class(opts.embedding_dim,
                        opts.hidden_dim,
                        problem,
                        encoder_class,
                        n_encode_layers=opts.n_encode_layers,
                        mask_inner=True,
                        mask_logits=True,
                        normalization=opts.normalization,
                        tanh_clipping=opts.tanh_clipping,
                        checkpoint_encoder=opts.checkpoint_encoder,
                        shrink_size=opts.shrink_size).to(opts.device)

    if opts.use_cuda and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Compute number of network parameters
    print(model)
    nb_param = 0
    for param in model.parameters():
        nb_param += np.prod(list(param.data.size()))
    print('Number of parameters: ', nb_param)

    # Overwrite model parameters by parameters to load
    model_ = get_inner_model(model)
    model_.load_state_dict({
        **model_.state_dict(),
        **load_data.get('model', {})
    })

    # Initialize baseline
    if opts.baseline == 'exponential':
        baseline = ExponentialBaseline(opts.exp_beta)
    elif opts.baseline == 'critic' or opts.baseline == 'critic_lstm':
        assert problem.NAME == 'tsp', "Critic only supported for TSP"
        baseline = CriticBaseline(
            (CriticNetworkLSTM(2, opts.embedding_dim, opts.hidden_dim,
                               opts.n_encode_layers, opts.tanh_clipping)
             if opts.baseline == 'critic_lstm' else CriticNetwork(
                 encoder_class, 2, opts.embedding_dim, opts.hidden_dim,
                 opts.n_encode_layers, opts.normalization)).to(opts.device))
    elif opts.baseline == 'rollout':
        baseline = RolloutBaseline(model, problem, opts)
    else:
        assert opts.baseline is None, "Unknown baseline: {}".format(
            opts.baseline)
        baseline = NoBaseline()

    if opts.bl_warmup_epochs > 0:
        baseline = WarmupBaseline(baseline,
                                  opts.bl_warmup_epochs,
                                  warmup_exp_beta=opts.exp_beta)

    # Load baseline from data, make sure script is called with same type of baseline
    if 'baseline' in load_data:
        baseline.load_state_dict(load_data['baseline'])

    # Initialize optimizer
    optimizer = optim.Adam([{
        'params': model.parameters(),
        'lr': opts.lr_model
    }] + ([{
        'params': baseline.get_learnable_parameters(),
        'lr': opts.lr_critic
    }] if len(baseline.get_learnable_parameters()) > 0 else []))

    # Load optimizer state
    if 'optimizer' in load_data:
        optimizer.load_state_dict(load_data['optimizer'])
        for state in optimizer.state.values():
            for k, v in state.items():
                # if isinstance(v, torch.Tensor):
                if torch.is_tensor(v):
                    state[k] = v.to(opts.device)

    # Initialize learning rate scheduler, decay by lr_decay once per epoch!
    lr_scheduler = optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch: opts.lr_decay**epoch)

    # Start the actual training loop
    val_dataset = problem.make_dataset(size=opts.graph_size,
                                       num_samples=opts.val_size,
                                       filename=opts.val_dataset)
    opts.val_size = val_dataset.size

    if opts.resume:
        epoch_resume = int(
            os.path.splitext(os.path.split(opts.resume)[-1])[0].split("-")[1])

        torch.set_rng_state(load_data['rng_state'])
        if opts.use_cuda:
            torch.cuda.set_rng_state_all(load_data['cuda_rng_state'])
        # Set the random states
        # Dumping of state was done before epoch callback, so do that now (model is loaded)
        baseline.epoch_callback(model, epoch_resume)
        print("Resuming after {}".format(epoch_resume))
        opts.epoch_start = epoch_resume + 1

    if opts.eval_only:
        validate(model, val_dataset, opts)
    else:
        for epoch in range(opts.epoch_start, opts.epoch_start + opts.n_epochs):
            train_epoch(model, optimizer, baseline, lr_scheduler, epoch,
                        val_dataset, problem, tb_logger, opts)
Beispiel #4
0
def _run_sl(opts):

    # Pretty print the run args
    pp.pprint(vars(opts))

    # Set the random seed
    torch.manual_seed(opts.seed)

    # Optionally configure tensorboard
    tb_logger = None
    if not opts.no_tensorboard:
        tb_logger = TbLogger(
            os.path.join(opts.log_dir, "{}_{}".format(opts.problem,
                                                      opts.graph_size),
                         opts.run_name))

    os.makedirs(opts.save_dir)
    # Save arguments so exact configuration can always be found
    with open(os.path.join(opts.save_dir, "args.json"), 'w') as f:
        json.dump(vars(opts), f, indent=True)

    # Set the device
    opts.device = torch.device("cuda:0" if opts.use_cuda else "cpu")

    # Figure out what's the problem
    problem = load_problem(opts.problem)

    assert opts.problem == 'tspsl', "Only TSP is supported for supervised learning"

    # Load data from load_path
    load_data = {}
    assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given"
    load_path = opts.load_path if opts.load_path is not None else opts.resume
    if load_path is not None:
        print('  [*] Loading data from {}'.format(load_path))
        load_data = torch_load_cpu(load_path)

    # Initialize model
    model_class = {'attention': AttentionModel}.get(opts.model, None)
    assert model_class is not None, "Unknown model: {}".format(model_class)
    encoder_class = {
        'gat': GraphAttentionEncoder,
        'gcn': GCNEncoder,
        'mlp': MLPEncoder
    }.get(opts.encoder, None)
    assert encoder_class is not None, "Unknown encoder: {}".format(
        encoder_class)
    model = model_class(opts.embedding_dim,
                        opts.hidden_dim,
                        problem,
                        encoder_class,
                        n_encode_layers=opts.n_encode_layers,
                        mask_inner=True,
                        mask_logits=True,
                        normalization=opts.normalization,
                        tanh_clipping=opts.tanh_clipping,
                        checkpoint_encoder=opts.checkpoint_encoder,
                        shrink_size=opts.shrink_size,
                        use_cuda=opts.use_cuda).to(opts.device)

    if opts.use_cuda and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Compute number of network parameters
    print(model)
    nb_param = 0
    for param in model.parameters():
        nb_param += np.prod(list(param.data.size()))
    print('Number of parameters: ', nb_param)

    # Overwrite model parameters by parameters to load
    model_ = get_inner_model(model)
    model_.load_state_dict({
        **model_.state_dict(),
        **load_data.get('model', {})
    })

    # Initialize optimizer
    optimizer = optim.Adam([{
        'params': model.parameters(),
        'lr': opts.lr_model
    }])

    # Load optimizer state
    if 'optimizer' in load_data:
        optimizer.load_state_dict(load_data['optimizer'])
        for state in optimizer.state.values():
            for k, v in state.items():
                # if isinstance(v, torch.Tensor):
                if torch.is_tensor(v):
                    state[k] = v.to(opts.device)

    # Initialize learning rate scheduler, decay by lr_decay once per epoch!
    lr_scheduler = optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch: opts.lr_decay**epoch)

    # Start the actual training loop
    train_dataset = problem.make_dataset(size=opts.graph_size,
                                         filename=opts.train_dataset)
    opts.epoch_size = train_dataset.size
    val_dataset = problem.make_dataset(size=opts.graph_size,
                                       filename=opts.val_dataset)
    opts.val_size = val_dataset.size

    if opts.resume:
        epoch_resume = int(
            os.path.splitext(os.path.split(opts.resume)[-1])[0].split("-")[1])

        torch.set_rng_state(load_data['rng_state'])
        if opts.use_cuda:
            torch.cuda.set_rng_state_all(load_data['cuda_rng_state'])
        # Set the random states
        print("Resuming after {}".format(epoch_resume))
        opts.epoch_start = epoch_resume + 1

    if opts.eval_only:
        validate(model, val_dataset, opts)
    else:
        for epoch in range(opts.epoch_start, opts.epoch_start + opts.n_epochs):
            train_epoch_sl(model, optimizer, lr_scheduler, epoch,
                           train_dataset, val_dataset, problem, tb_logger,
                           opts)
Beispiel #5
0
def run(opts):

    # Pretty print the run args
    pp.pprint(vars(opts))

    # Set the random seed
    torch.manual_seed(opts.seed)

    # Optionally configure tensorboard
    tb_logger = None
    if not opts.no_tensorboard:
        tb_logger = TbLogger(
            os.path.join(opts.log_dir, "{}_{}".format(opts.problem,
                                                      opts.graph_size),
                         opts.run_name))

    os.makedirs(opts.save_dir)
    # Save arguments so exact configuration can always be found
    with open(os.path.join(opts.save_dir, "args.json"), 'w') as f:
        json.dump(vars(opts), f, indent=True)

    # Set the device
    opts.device = torch.device("cuda:0" if opts.use_cuda else "cpu")

    # Figure out what's the problem
    problem = load_problem(opts.problem)

    # Load data from load_path
    load_data = {}
    assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given"
    load_path = opts.load_path if opts.load_path is not None else opts.resume
    if load_path is not None:
        print('  [*] Loading data from {}'.format(load_path))
        load_data = torch_load_cpu(load_path)

    # Initialize model
    model_class = {
        'attention': AttentionModel,
        'pointer': PointerNetwork
    }.get(opts.model, None)
    assert model_class is not None, "Unknown model: {}".format(model_class)
    model = model_class(opts.embedding_dim,
                        opts.hidden_dim,
                        problem,
                        n_encode_layers=opts.n_encode_layers,
                        mask_inner=True,
                        mask_logits=True,
                        normalization=opts.normalization,
                        tanh_clipping=opts.tanh_clipping,
                        checkpoint_encoder=opts.checkpoint_encoder,
                        shrink_size=opts.shrink_size,
                        steps=opts.awe_steps,
                        graph_size=opts.graph_size).to(opts.device)

    if opts.use_cuda and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Overwrite model parameters by parameters to load
    model_ = get_inner_model(model)
    model_.load_state_dict({
        **model_.state_dict(),
        **load_data.get('model', {})
    })

    # Initialize baseline
    if opts.baseline == 'exponential':
        baseline = ExponentialBaseline(opts.exp_beta)
    elif opts.baseline == 'constant':
        baseline = ConstantBaseline()
    elif opts.baseline == 'critic' or opts.baseline == 'critic_lstm':
        assert problem.NAME == 'tsp', "Critic only supported for TSP"
        baseline = CriticBaseline(
            (CriticNetworkLSTM(2, opts.embedding_dim, opts.hidden_dim,
                               opts.n_encode_layers, opts.tanh_clipping)
             if opts.baseline == 'critic_lstm' else CriticNetwork(
                 2, opts.embedding_dim, opts.hidden_dim, opts.n_encode_layers,
                 opts.normalization)).to(opts.device))
    elif opts.baseline == 'rollout':
        baseline = RolloutBaseline(model, problem, opts)
    elif opts.baseline == 'critic_lp':
        assert problem.NAME == 'lp'
        dim_vocab = {2: 2, 3: 5, 4: 15, 5: 52, 6: 203, 7: 877, 8: 4140}
        baseline = CriticBaseline(
            (CriticNetworkLP(dim_vocab[opts.awe_steps], opts.embedding_dim,
                             opts.hidden_dim, opts.n_encode_layers,
                             opts.normalization)).to(opts.device))
    else:
        assert opts.baseline is None, "Unknown baseline: {}".format(
            opts.baseline)
        baseline = NoBaseline()

    if opts.bl_warmup_epochs > 0:
        baseline = WarmupBaseline(baseline,
                                  opts.bl_warmup_epochs,
                                  warmup_exp_beta=opts.exp_beta)

    # Load baseline from data, make sure script is called with same type of baseline
    if 'baseline' in load_data:
        baseline.load_state_dict(load_data['baseline'])

    # Initialize optimizer
    optimizer = optim.Adam([{
        'params': model.parameters(),
        'lr': opts.lr_model
    }] + ([{
        'params': baseline.get_learnable_parameters(),
        'lr': opts.lr_critic
    }] if len(baseline.get_learnable_parameters()) > 0 else []))

    # Load optimizer state
    if 'optimizer' in load_data:
        optimizer.load_state_dict(load_data['optimizer'])
        for state in optimizer.state.values():
            for k, v in state.items():
                # if isinstance(v, torch.Tensor):
                if torch.is_tensor(v):
                    state[k] = v.to(opts.device)

    # Initialize learning rate scheduler, decay by lr_decay once per epoch!
    lr_scheduler = optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch: opts.lr_decay**epoch)

    # Start the actual training loop
    val_dataset = problem.make_dataset(num_samples=opts.val_size,
                                       filename=opts.val_dataset,
                                       distribution=opts.data_distribution,
                                       size=opts.graph_size,
                                       degree=opts.degree,
                                       steps=opts.awe_steps,
                                       awe_samples=opts.awe_samples)

    if opts.resume:
        epoch_resume = int(
            os.path.splitext(os.path.split(opts.resume)[-1])[0].split("-")[1])

        torch.set_rng_state(load_data['rng_state'])
        if opts.use_cuda:
            torch.cuda.set_rng_state_all(load_data['cuda_rng_state'])
        # Set the random states
        # Dumping of state was done before epoch callback, so do that now (model is loaded)
        baseline.epoch_callback(model, epoch_resume)
        print("Resuming after {}".format(epoch_resume))
        opts.epoch_start = epoch_resume + 1

    if opts.eval_only:
        validate(model, val_dataset, opts)
    else:
        extra = {'updates': 0, 'avg_reward': 10**8, "best_epoch": -1}
        start = time.time()
        for epoch in range(opts.epoch_start, opts.epoch_start + opts.n_epochs):

            train_epoch(model, optimizer, baseline, lr_scheduler, epoch,
                        val_dataset, problem, tb_logger, opts, extra)

        finish = time.time()
        with open("experiments.log", "a+") as f:
            f.write("{} {:.4f} {} {:.2f}\n".format(
                '-'.join(opts.train_dataset.split('/')[-2:]),
                extra["avg_reward"], extra["best_epoch"], finish - start))
        print("Took {:.2f} sec for {} epochs".format(finish - start,
                                                     opts.n_epochs))
def run(opts):

    rank = opts.local_rank if torch.cuda.device_count() > 1 else 0

    # Set the random seed
    torch.manual_seed(opts.seed + rank)
    random.seed(opts.seed + rank)
    np.random.seed(opts.seed + rank)

    if not os.path.exists(opts.save_dir) and rank == 0:
        os.makedirs(opts.save_dir)

    # Optionally configure wandb
    if not opts.no_wandb and rank == 0:
        wandb.login('never', '31ce01e4120061694da54a54ab0dafbee1262420')
        wandb.init(dir=opts.save_dir,
                   config=opts,
                   project='large_scale_tsp',
                   name=opts.run_name,
                   sync_tensorboard=True,
                   save_code=True)

    # Set the device
    if opts.use_cuda:
        torch.cuda.set_device(rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        opts.device = torch.device("cuda", rank)

    else:
        opts.device = torch.device("cpu")

    # Figure out what's the problem
    problem = load_problem(opts.problem)

    # Load data from load_path
    load_data = {}
    assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given"
    load_path = opts.load_path if opts.load_path is not None else opts.resume
    if load_path is not None:
        if rank == 0:
            print('  [*] Loading data from {}'.format(load_path))
        load_data = torch_load_cpu(load_path)

    # Initialize model
    model_class = {
        'attention': AttentionModel,
        'pointer': PointerNetwork
    }.get(opts.model, None)
    assert model_class is not None, "Unknown model: {}".format(model_class)
    model: torch.nn.Module = model_class(
        opts.embedding_dim,
        opts.hidden_dim,
        problem,
        attention_type=opts.attention_type,
        n_encode_layers=opts.n_encode_layers,
        n_heads=opts.n_heads,
        feed_forward_dim=opts.feed_forward_dim,
        encoding_knn_size=opts.encoding_knn_size,
        decoding_knn_size=opts.decoding_knn_size,
        mask_inner=True,
        mask_logits=True,
        normalization=opts.normalization,
        tanh_clipping=opts.tanh_clipping,
        checkpoint_encoder=opts.checkpoint_encoder,
        shrink_size=opts.shrink_size).to(opts.device)

    if opts.init_normalization_parameters:
        for m in model.modules():
            if isinstance(m, Normalization):
                m.init_parameters()

    if opts.use_cuda:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(
            opts.device)
        model = DDP(model, device_ids=[rank])

    # Overwrite model parameters by parameters to load
    model_ = get_inner_model(model)
    model_.load_state_dict({
        **model_.state_dict(),
        **load_data.get('model', {})
    })

    # Initialize baseline
    if opts.baseline == 'exponential':
        baseline = ExponentialBaseline(opts.exp_beta)
    elif opts.baseline == 'critic' or opts.baseline == 'critic_lstm':
        assert problem.NAME == 'tsp', "Critic only supported for TSP"
        baseline = CriticBaseline(
            (CriticNetworkLSTM(2, opts.embedding_dim, opts.hidden_dim,
                               opts.n_encode_layers, opts.tanh_clipping)
             if opts.baseline == 'critic_lstm' else CriticNetwork(
                 2, opts.embedding_dim, opts.hidden_dim, opts.n_encode_layers,
                 opts.normalization)).to(opts.device))
    elif opts.baseline == 'rollout':
        baseline = RolloutBaseline(model, problem, opts)
    else:
        assert opts.baseline is None, "Unknown baseline: {}".format(
            opts.baseline)
        baseline = NoBaseline()

    if opts.bl_warmup_epochs > 0:
        baseline = WarmupBaseline(baseline,
                                  opts.bl_warmup_epochs,
                                  warmup_exp_beta=opts.exp_beta)

    # Load baseline from data, make sure script is called with same type of baseline
    if 'baseline' in load_data:
        baseline.load_state_dict(load_data['baseline'])

    # Initialize optimizer
    optimizer = optim.Adam([{
        'params': model.parameters(),
        'lr': opts.lr_model
    }] + ([{
        'params': baseline.get_learnable_parameters(),
        'lr': opts.lr_critic
    }] if len(baseline.get_learnable_parameters()) > 0 else []))

    scaler = torch.cuda.amp.GradScaler() if opts.precision == 16 else None

    # Load optimizer state
    if 'optimizer' in load_data:
        optimizer.load_state_dict(load_data['optimizer'])
        for state in optimizer.state.values():
            for k, v in state.items():
                # if isinstance(v, torch.Tensor):
                if torch.is_tensor(v):
                    state[k] = v.to(opts.device)

    # Initialize learning rate scheduler, decay by lr_decay once per epoch!
    lr_scheduler = optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch: opts.lr_decay**epoch)

    # Start the actual training loop
    val_dataset = problem.make_dataset(size=opts.graph_size,
                                       num_samples=opts.val_size,
                                       filename=opts.val_dataset,
                                       distribution=opts.data_distribution)

    if opts.resume:
        epoch_resume = int(
            os.path.splitext(os.path.split(opts.resume)[-1])[0].split("-")[1])

        torch.set_rng_state(load_data['rng_state'])
        if opts.use_cuda:
            torch.cuda.set_rng_state_all(load_data['cuda_rng_state'])
        # Set the random states
        # Dumping of state was done before epoch callback, so do that now (model is loaded)
        baseline.epoch_callback(model, epoch_resume)
        if rank == 0:
            print("Resuming after {}".format(epoch_resume))
        opts.epoch_start = epoch_resume + 1

    if opts.eval_only:
        validate(model, val_dataset, opts)
    else:
        for epoch in range(opts.epoch_start, opts.epoch_start + opts.n_epochs):
            train_epoch(model, optimizer, scaler, baseline, lr_scheduler,
                        epoch, val_dataset, problem, opts)
Beispiel #7
0
def run(opts):
    # start time
    start_time = time()
    train_run = []
    opts.save_hrs.sort()
    run_name = opts.run_name

    # Pretty print the run args
    pp.pprint(vars(opts))

    # Set the random seed
    torch.manual_seed(opts.seed)

    # Optionally configure tensorboard
    tb_logger = None
    if not opts.no_tensorboard:
        tb_logger = TbLogger(
            os.path.join(opts.log_dir, "{}_{}".format(opts.problem,
                                                      opts.graph_size),
                         opts.run_name))

    os.makedirs(opts.save_dir)
    # Save arguments so exact configuration can always be found
    with open(os.path.join(opts.save_dir, "args.json"), 'w') as f:
        json.dump(vars(opts), f, indent=True)

    # Set the device
    opts.device = torch.device("cuda:0" if opts.use_cuda else "cpu")

    # Figure out what's the problem
    problem = load_problem(opts.problem)

    # Load data from load_path
    load_data = {}
    assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given"
    load_path = opts.load_path if opts.load_path is not None else opts.resume
    if load_path is not None:
        print('  [*] Loading data from {}'.format(load_path))
        load_data = torch_load_cpu(load_path)

    # Initialize model
    model_class = {
        'attention': AttentionModel,
        'pointer': PointerNetwork
    }.get(opts.model, None)
    assert model_class is not None, "Unknown model: {}".format(model_class)
    model = model_class(opts.embedding_dim,
                        opts.hidden_dim,
                        problem,
                        n_encode_layers=opts.n_encode_layers,
                        mask_inner=True,
                        mask_logits=True,
                        normalization=opts.normalization,
                        tanh_clipping=opts.tanh_clipping,
                        checkpoint_encoder=opts.checkpoint_encoder,
                        shrink_size=opts.shrink_size).to(opts.device)

    if opts.use_cuda and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Overwrite model parameters by parameters to load
    model_ = get_inner_model(model)
    model_.load_state_dict({
        **model_.state_dict(),
        **load_data.get('model', {})
    })

    # Initialize baseline
    if opts.baseline == 'exponential':
        baseline = ExponentialBaseline(opts.exp_beta)
    elif opts.baseline == 'critic' or opts.baseline == 'critic_lstm':
        assert problem.NAME == 'tsp', "Critic only supported for TSP"
        baseline = CriticBaseline(
            (CriticNetworkLSTM(2, opts.embedding_dim, opts.hidden_dim,
                               opts.n_encode_layers, opts.tanh_clipping)
             if opts.baseline == 'critic_lstm' else CriticNetwork(
                 2, opts.embedding_dim, opts.hidden_dim, opts.n_encode_layers,
                 opts.normalization)).to(opts.device))
    elif opts.baseline == 'rollout':
        baseline = RolloutBaseline(model, problem, opts)
    else:
        assert opts.baseline is None, "Unknown baseline: {}".format(
            opts.baseline)
        baseline = NoBaseline()

    if opts.bl_warmup_epochs > 0:
        baseline = WarmupBaseline(baseline,
                                  opts.bl_warmup_epochs,
                                  warmup_exp_beta=opts.exp_beta)

    # Load baseline from data, make sure script is called with same type of baseline
    if 'baseline' in load_data:
        baseline.load_state_dict(load_data['baseline'])

    # Initialize optimizer
    optimizer = optim.Adam([{
        'params': model.parameters(),
        'lr': opts.lr_model
    }] + ([{
        'params': baseline.get_learnable_parameters(),
        'lr': opts.lr_critic
    }] if len(baseline.get_learnable_parameters()) > 0 else []))

    # Load optimizer state
    if 'optimizer' in load_data:
        optimizer.load_state_dict(load_data['optimizer'])
        for state in optimizer.state.values():
            for k, v in state.items():
                # if isinstance(v, torch.Tensor):
                if torch.is_tensor(v):
                    state[k] = v.to(opts.device)

    # Initialize learning rate scheduler, decay by lr_decay once per epoch!
    lr_scheduler = optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch: opts.lr_decay**epoch)

    # Start the actual training loop
    val_dataset = problem.make_dataset(size=opts.graph_size,
                                       num_samples=opts.val_size,
                                       filename=opts.val_dataset,
                                       distribution=opts.data_distribution)

    if opts.resume:
        epoch_resume = int(
            os.path.splitext(os.path.split(opts.resume)[-1])[0].split("-")[1])

        torch.set_rng_state(load_data['rng_state'])
        if opts.use_cuda:
            torch.cuda.set_rng_state_all(load_data['cuda_rng_state'])
        # Set the random states
        # Dumping of state was done before epoch callback, so do that now (model is loaded)
        baseline.epoch_callback(model, epoch_resume)
        print("Resuming after {}".format(epoch_resume))
        opts.epoch_start = epoch_resume + 1

    torch.save(model, os.path.join('.', 'empty.pt'))
    if opts.eval_only:
        validate(model, val_dataset, opts)
    else:
        for epoch in range(opts.epoch_start, opts.epoch_start + opts.n_epochs):
            avg_time = train_epoch(model, optimizer, baseline, lr_scheduler,
                                   epoch, val_dataset, problem, tb_logger,
                                   opts, start_time)
            train_run.append(avg_time)
            for hr in opts.save_hrs:
                if (time() - start_time) > hr * 3600:
                    opts.save_hrs.remove(hr)
                    print('Saving model and state...')
                    hr_time = int(round((time() - start_time) / 3600))
                    with open(
                            '../models/att/hist_{}_{}hr.pickle'.format(
                                run_name, hr_time), 'wb') as handle:
                        pickle.dump(train_run,
                                    handle,
                                    protocol=pickle.HIGHEST_PROTOCOL)
                    torch.save(
                        {
                            'model': get_inner_model(model).state_dict(),
                            'optimizer': optimizer.state_dict(),
                            'rng_state': torch.get_rng_state(),
                            'cuda_rng_state': torch.cuda.get_rng_state_all(),
                            'baseline': baseline.state_dict()
                        },
                        os.path.join(
                            '../models/att',
                            '{}_{}hr-model-att-only.pt'.format(
                                run_name, hr_time)))
                    torch.save(
                        model,
                        os.path.join(
                            '../models/att',
                            '{}_{}hr-model.pt'.format(run_name, hr_time)))
Beispiel #8
0
                                   num_samples=opts.val_size,
                                   filename=opts.val_dataset)
torch.save(val_dataset, 'myval_test100.pt')

val_dataset = torch.load('myval_test100.pt')
val_dataset = val_dataset[0:200]

# In[4]:

# Load data from load_path
load_data = {}
assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given"
load_path = opts.load_path if opts.load_path is not None else opts.resume
if load_path is not None:
    print('  [*] Loading data from {}'.format(load_path))
    load_data = load_data = torch_load_cpu(load_path)

# Initialize model
model_class = AttentionModel
model = maybe_cuda_model(
    model_class(opts.embedding_dim,
                opts.hidden_dim,
                problem,
                n_encode_layers=opts.n_encode_layers,
                mask_inner=True,
                mask_logits=True,
                normalization=opts.normalization,
                tanh_clipping=opts.tanh_clipping), opts.use_cuda)

# Overwrite model parameters by parameters to load
model_ = get_inner_model(model)
Beispiel #9
0
def run(opts):

    # Pretty print the run args
    pp.pprint(vars(opts))

    # Set the random seed
    torch.manual_seed(opts.seed)

    # Optionally configure tensorboard
    tb_logger = None
    if not opts.no_tensorboard and opts.no_dirpg:
        tb_logger = TbLogger(
            os.path.join(opts.log_dir, "{}_{}".format(opts.problem,
                                                      opts.graph_size),
                         opts.run_name))
    if not opts.no_dirpg:
        task = Task.init(project_name='DirPG-TSP', task_name=opts.run_name)
        tb_logger = SummaryWriter(
            os.path.join(opts.log_dir, "{}_{}".format(opts.problem,
                                                      opts.graph_size),
                         opts.run_name))
        tb_logger.add_text('Comment', opts.comment, 0)

    os.makedirs(opts.save_dir)
    # Save arguments so exact configuration can always be found
    with open(os.path.join(opts.save_dir, "args.json"), 'w') as f:
        json.dump(vars(opts), f, indent=True)

    # Set the device
    opts.device = torch.device("cuda:0" if opts.use_cuda else "cpu")

    # Figure out what's the problem
    problem = load_problem(opts.problem)

    # Load data from load_path
    load_data = {}
    assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given"
    load_path = opts.load_path if opts.load_path is not None else opts.resume
    if load_path is not None:
        print('  [*] Loading data from {}'.format(load_path))
        load_data = torch_load_cpu(load_path)

    # Initialize model
    model_class = {
        'attention': AttentionModel,
        'pointer': PointerNetwork
    }.get(opts.model, None)
    assert model_class is not None, "Unknown model: {}".format(model_class)
    model = model_class(opts.embedding_dim,
                        opts.hidden_dim,
                        problem,
                        n_encode_layers=opts.n_encode_layers,
                        mask_inner=True,
                        mask_logits=True,
                        normalization=opts.normalization,
                        tanh_clipping=opts.tanh_clipping,
                        checkpoint_encoder=opts.checkpoint_encoder,
                        shrink_size=opts.shrink_size).to(opts.device)

    if opts.use_cuda and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Overwrite model parameters by parameters to load
    model_ = get_inner_model(model)
    model_.load_state_dict({
        **model_.state_dict(),
        **load_data.get('model', {})
    })
    print(model_)
    # Initialize baseline
    if opts.baseline == 'exponential':
        baseline = ExponentialBaseline(opts.exp_beta)
    elif opts.baseline == 'critic' or opts.baseline == 'critic_lstm':
        assert problem.NAME == 'tsp', "Critic only supported for TSP"
        baseline = CriticBaseline(
            (CriticNetworkLSTM(2, opts.embedding_dim, opts.hidden_dim,
                               opts.n_encode_layers, opts.tanh_clipping)
             if opts.baseline == 'critic_lstm' else CriticNetwork(
                 2, opts.embedding_dim, opts.hidden_dim, opts.n_encode_layers,
                 opts.normalization)).to(opts.device))
    elif opts.baseline == 'rollout':
        baseline = RolloutBaseline(model, problem, opts)
        print(" rollout" * 30)
    else:
        assert opts.baseline is None, "Unknown baseline: {}".format(
            opts.baseline)
        baseline = NoBaseline()

    if opts.bl_warmup_epochs > 0:
        print(opts.bl_warmup_epochs)
        baseline = WarmupBaseline(baseline,
                                  opts.bl_warmup_epochs,
                                  warmup_exp_beta=opts.exp_beta)
        print(" WarmupBaseline" * 30)

    # Load baseline from data, make sure script is called with same type of baseline
    if 'baseline' in load_data:
        baseline.load_state_dict(load_data['baseline'])

    # Initialize optimizer
    optimizer = optim.Adam([{
        'params': model.parameters(),
        'lr': opts.lr_model
    }] + ([{
        'params': baseline.get_learnable_parameters(),
        'lr': opts.lr_critic
    }] if len(baseline.get_learnable_parameters()) > 0 else []))

    # Load optimizer state
    if 'optimizer' in load_data:
        optimizer.load_state_dict(load_data['optimizer'])
        for state in optimizer.state.values():
            for k, v in state.items():
                # if isinstance(v, torch.Tensor):
                if torch.is_tensor(v):
                    state[k] = v.to(opts.device)

    # Initialize learning rate scheduler, decay by lr_decay once per epoch!
    lr_scheduler = optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch: opts.lr_decay**epoch)

    # Start the actual training loop
    val_dataset = problem.make_dataset(size=opts.graph_size,
                                       num_samples=opts.val_size,
                                       filename=opts.val_dataset,
                                       distribution=opts.data_distribution)

    if opts.resume:
        epoch_resume = int(
            os.path.splitext(os.path.split(opts.resume)[-1])[0].split("-")[1])

        torch.set_rng_state(load_data['rng_state'])
        if opts.use_cuda:
            torch.cuda.set_rng_state_all(load_data['cuda_rng_state'])
        # Set the random states
        # Dumping of state was done before epoch callback, so do that now (model is loaded)
        baseline.epoch_callback(model, epoch_resume)
        print("Resuming after {}".format(epoch_resume))
        opts.epoch_start = epoch_resume + 1

    model = dirpg.DirPG(model, opts) if not opts.no_dirpg else model
    if opts.eval_only:
        validate(model, val_dataset, opts)
    else:
        interactions_count = opts.epoch_start * opts.epoch_size * opts.max_interactions
        epoch = opts.epoch_start
        while interactions_count < opts.total_interactions:  # for epoch in range(opts.epoch_start, opts.epoch_start + opts.n_epochs):
            train_epoch(
                model,
                optimizer,
                baseline,
                lr_scheduler,
                epoch,
                interactions_count,
                val_dataset,
                problem,
                tb_logger,
                opts,
            )
            print("interactions_count model so far ", interactions_count)
            n_interactions = get_inner_model(model).get_and_reset_interactions(opts.use_cuda, opts.no_dirpg)\
                if opts.no_dirpg else model.model.get_and_reset_interactions(opts.use_cuda, opts.no_dirpg)
            interactions_count += n_interactions

            print("interactions_count model new", n_interactions)
            interactions_count += get_inner_model(baseline.baseline.model).get_and_reset_interactions(opts.use_cuda, opts.no_dirpg)\
                if baseline.__class__.__name__ != "NoBaseline" else 0
            print("interactions_count baseline ", interactions_count)
            print("interactions_count: {} out of {} ".format(
                interactions_count, opts.total_interactions))
            epoch += 1
Beispiel #10
0
def run(opts):

    # Pretty print the run args
    pprint.pprint(vars(opts))

    # Set the random seed
    torch.manual_seed(opts.seed)

    # Optionally configure tensorboard
    tb_logger = None
    if not opts.no_tensorboard:
        tb_logger = TbLogger(
            os.path.join(opts.log_dir, "{}_{}".format(opts.problem,
                                                      opts.graph_size),
                         opts.run_name))

    if not os.path.exists(opts.save_dir):
        os.makedirs(opts.save_dir)

    # Save arguments so exact configuration can always be found
    with open(os.path.join(opts.save_dir, "args.json"), 'w') as f:
        json.dump(vars(opts), f, indent=True)

    # Set the device
    opts.device = torch.device("cuda" if opts.use_cuda else "cpu")

    # Figure out what's the problem
    problem = load_problem(opts.problem)(p_size=opts.graph_size,
                                         with_assert=not opts.no_assert)

    # Load data from load_path
    load_data = {}
    assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given"
    load_path = opts.load_path if opts.load_path is not None else opts.resume
    if load_path is not None:
        print('  [*] Loading data from {}'.format(load_path))
        load_data = torch_load_cpu(load_path)

    # Initialize model
    model_class = {
        'attention': AttentionModel,
    }.get(opts.model, None)
    assert model_class is not None, "Unknown model: {}".format(model_class)
    model = model_class(problem=problem,
                        embedding_dim=opts.embedding_dim,
                        hidden_dim=opts.hidden_dim,
                        n_heads=opts.n_heads_encoder,
                        n_layers=opts.n_encode_layers,
                        normalization=opts.normalization,
                        device=opts.device).to(opts.device)

    if opts.use_cuda and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Overwrite model parameters by parameters to load
    model_ = get_inner_model(model)
    model_.load_state_dict({
        **model_.state_dict(),
        **load_data.get('model', {})
    })

    # Load the validation datasets
    val_dataset = problem.make_dataset(size=opts.graph_size,
                                       num_samples=opts.val_size,
                                       filename=opts.val_dataset)

    # Do validation only
    if opts.eval_only:
        validate(problem, model, val_dataset, tb_logger, opts, _id=0)

    else:

        # Initialize baseline
        baseline = CriticBaseline(
            CriticNetwork(problem=problem,
                          embedding_dim=opts.embedding_dim,
                          hidden_dim=opts.hidden_dim,
                          n_heads=opts.n_heads_decoder,
                          n_layers=opts.n_encode_layers,
                          normalization=opts.normalization,
                          device=opts.device).to(opts.device))

        # Load baseline from data, make sure script is called with same type of baseline
        if 'baseline' in load_data:
            baseline.load_state_dict(load_data['baseline'])

        # Initialize optimizer
        optimizer = optim.Adam([{
            'params': model.parameters(),
            'lr': opts.lr_model
        }] + ([{
            'params': baseline.get_learnable_parameters(),
            'lr': opts.lr_critic
        }] if len(baseline.get_learnable_parameters()) > 0 else []))

        # Load optimizer state
        if 'optimizer' in load_data:
            optimizer.load_state_dict(load_data['optimizer'])
            for state in optimizer.state.values():
                for k, v in state.items():
                    # if isinstance(v, torch.Tensor):
                    if torch.is_tensor(v):
                        state[k] = v.to(opts.device)

        # Initialize learning rate scheduler, decay by lr_decay once per epoch!
        lr_scheduler = optim.lr_scheduler.LambdaLR(
            optimizer, lambda epoch: opts.lr_decay**epoch)

        if opts.resume:
            epoch_resume = int(
                os.path.splitext(os.path.split(
                    opts.resume)[-1])[0].split("-")[1])

            torch.set_rng_state(load_data['rng_state'])
            if opts.use_cuda:
                torch.cuda.set_rng_state_all(load_data['cuda_rng_state'])
            # Set the random states
            # Dumping of state was done before epoch callback, so do that now (model is loaded)
            print("Resuming after {}".format(epoch_resume))
            opts.epoch_start = epoch_resume + 1

        # Start the actual training loop
        for epoch in range(opts.epoch_start, opts.epoch_start + opts.n_epochs):
            train_epoch(problem, model, optimizer, baseline, lr_scheduler,
                        epoch, val_dataset, tb_logger, opts)
Beispiel #11
0
def run(opts):

    # Pretty print the run args
    pp.pprint(vars(opts))

    # Set the random seed
    torch.manual_seed(opts.seed)

    # Optionally configure tensorboard
    tb_logger = None
    if not opts.no_tensorboard:
        tb_logger = TbLogger(
            os.path.join(opts.log_dir, "{}_{}".format(opts.problem,
                                                      opts.graph_size),
                         opts.run_name))

    os.makedirs(opts.save_dir)
    # Save arguments so exact configuration can always be found
    with open(os.path.join(opts.save_dir, "args.json"), 'w') as f:
        json.dump(vars(opts), f, indent=True)

    # Set the device
    opts.device = torch.device("cuda:0" if opts.use_cuda else "cpu")

    # Figure out what's the problem
    problem = load_problem(opts.problem)

    # Load data from load_path
    load_data = {}
    assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given"
    load_path = opts.load_path if opts.load_path is not None else opts.resume
    if load_path is not None:
        print('  [*] Loading data from {}'.format(load_path))
        load_data = torch_load_cpu(load_path)

    # Initialize model
    model = AttentionModel(opts.embedding_dim,
                           opts.hidden_dim,
                           problem,
                           n_encode_layers=opts.n_encode_layers,
                           mask_inner=True,
                           mask_logits=True,
                           normalization=opts.normalization,
                           tanh_clipping=opts.tanh_clipping,
                           checkpoint_encoder=opts.checkpoint_encoder,
                           shrink_size=opts.shrink_size).to(opts.device)

    if opts.use_cuda and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Overwrite model parameters by parameters to load
    model_ = get_inner_model(model)
    model_.load_state_dict({
        **model_.state_dict(),
        **load_data.get('model', {})
    })

    # Initialize baseline
    if opts.baseline == 'critic':
        baseline = CriticBaseline(
            (CriticNetwork(2, opts.embedding_dim, opts.hidden_dim,
                           opts.n_encode_layers,
                           opts.normalization)).to(opts.device))
    elif opts.baseline == 'rollout':
        baseline = RolloutBaseline(model, problem, opts)
    elif opts.baseline == 'oracle':
        baseline = OracleBaseline()
    else:
        assert opts.baseline is None, "Unknown baseline: {}".format(
            opts.baseline)
        baseline = NoBaseline()

    # Load baseline from data, make sure script is called with same type of baseline
    if 'baseline' in load_data:
        baseline.load_state_dict(load_data['baseline'])

    # Initialize optimizer
    optimizer = optim.Adam([{
        'params': model.parameters(),
        'lr': opts.lr_model
    }] + ([{
        'params': baseline.get_learnable_parameters(),
        'lr': opts.lr_critic
    }] if len(baseline.get_learnable_parameters()) > 0 else []))

    # Load optimizer state
    if 'optimizer' in load_data:
        optimizer.load_state_dict(load_data['optimizer'])
        for state in optimizer.state.values():
            for k, v in state.items():
                # if isinstance(v, torch.Tensor):
                if torch.is_tensor(v):
                    state[k] = v.to(opts.device)

    # Initialize learning rate scheduler, decay by lr_decay once per epoch!
    lr_scheduler = optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch: opts.lr_decay**epoch)

    # Start the actual training loop
    val_dataset = problem.make_dataset(size=opts.graph_size,
                                       num_samples=opts.val_size,
                                       filename=opts.val_dataset,
                                       distribution=opts.data_distribution)

    val_dataset_tensor = torch.stack(val_dataset.data)
    dist = (val_dataset_tensor.transpose(1, 2).repeat_interleave(
        opts.graph_size, 2).transpose(1, 2).float() -
            val_dataset_tensor.repeat(1, opts.graph_size, 1).float()).norm(
                p=2, dim=2).view(opts.val_size, opts.graph_size,
                                 opts.graph_size)
    DP_val_solution = [held_karp(dist[i])[0] for i in range(opts.val_size)]
    DP_val_solution = torch.tensor(DP_val_solution)
    DP_val_solution = DP_val_solution.mean()
    problem.DP_cost = DP_val_solution
    print('problem_DPCost = ', DP_val_solution)

    if opts.resume:
        epoch_resume = int(
            os.path.splitext(os.path.split(opts.resume)[-1])[0].split("-")[1])

        torch.set_rng_state(load_data['rng_state'])
        if opts.use_cuda:
            torch.cuda.set_rng_state_all(load_data['cuda_rng_state'])
        # Set the random states
        # Dumping of state was done before epoch callback, so do that now (model is loaded)
        baseline.epoch_callback(model, epoch_resume)
        print("Resuming after {}".format(epoch_resume))
        opts.epoch_start = epoch_resume + 1

    if opts.eval_only:
        validate(model, val_dataset, opts)
    else:
        for epoch in range(opts.epoch_start, opts.epoch_start + opts.n_epochs):
            train_epoch(model, optimizer, baseline, lr_scheduler, epoch,
                        val_dataset, problem, tb_logger, opts)