Example #1
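# Architecture-search entry point: alternates between training a child RNN
# language model on the current pool of candidate architectures and training an
# encoder-predictor-decoder controller that proposes new architectures for the
# next round.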
def train():
    child_params = get_child_model_params()
    controller_params = get_controller_params()
    corpus = data.Corpus(child_params['data_dir'])
    eval_batch_size = child_params['eval_batch_size']

    train_data = batchify(corpus.train, child_params['batch_size'],
                          child_params['cuda'])
    val_data = batchify(corpus.valid, eval_batch_size, child_params['cuda'])
    ntokens = len(corpus.dictionary)

    if os.path.exists(os.path.join(child_params['model_dir'], 'model.pt')):
        print("Found model.pt in {}, automatically continue training.".format(
            os.path.join(child_params['model_dir'])))
        continue_train_child = True
    else:
        continue_train_child = False

    if continue_train_child:
        child_model = torch.load(
            os.path.join(child_params['model_dir'], 'model.pt'))
    else:
        child_model = model_search.RNNModelSearch(
            ntokens, child_params['emsize'], child_params['nhid'],
            child_params['nhidlast'], child_params['dropout'],
            child_params['dropouth'], child_params['dropoutx'],
            child_params['dropouti'], child_params['dropoute'],
            child_params['drop_path'])

    if os.path.exists(os.path.join(controller_params['model_dir'],
                                   'model.pt')):
        print("Found model.pt in {}, automatically continue training.".format(
            os.path.join(child_params['model_dir'])))
        continue_train_controller = True
    else:
        continue_train_controller = False

    if continue_train_controller:
        controller_model = torch.load(
            os.path.join(controller_params['model_dir'], 'model.pt'))
    else:
        controller_model = controller.Controller(controller_params)

    size = 0
    for p in child_model.parameters():
        size += p.nelement()
    logging.info('child model param size: {}'.format(size))
    size = 0
    for p in controller_model.parameters():
        size += p.nelement()
    logging.info('controller model param size: {}'.format(size))

    if args.cuda:
        if args.single_gpu:
            parallel_child_model = child_model.cuda()
            parallel_controller_model = controller_model.cuda()
        else:
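            # dim=1: language-model batches are laid out as (seq_len, batch),
            # so DataParallel has to split along the second dimension.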
            parallel_child_model = nn.DataParallel(child_model, dim=1).cuda()
            parallel_controller_model = nn.DataParallel(controller_model,
                                                        dim=1).cuda()
    else:
        parallel_child_model = child_model
        parallel_controller_model = controller_model

    total_params = sum(x.data.nelement() for x in child_model.parameters())
    logging.info('Args: {}'.format(args))
    logging.info('Child Model total parameters: {}'.format(total_params))
    total_params = sum(x.data.nelement()
                       for x in controller_model.parameters())
    logging.info('Controller Model total parameters: {}'.format(total_params))

    # Loop over epochs.

    if continue_train_child:
        optimizer_state = torch.load(
            os.path.join(child_params['model_dir'], 'optimizer.pt'))
        if 't0' in optimizer_state['param_groups'][0]:
            child_optimizer = torch.optim.ASGD(
                child_model.parameters(),
                lr=child_params['lr'],
                t0=0,
                lambd=0.,
                weight_decay=child_params['wdecay'])
        else:
            child_optimizer = torch.optim.SGD(
                child_model.parameters(),
                lr=child_params['lr'],
                weight_decay=child_params['wdecay'])
        child_optimizer.load_state_dict(optimizer_state)
        child_epoch = torch.load(
            os.path.join(child_params['model_dir'], 'misc.pt'))['epoch'] - 1
    else:
        child_optimizer = torch.optim.SGD(child_model.parameters(),
                                          lr=child_params['lr'],
                                          weight_decay=child_params['wdecay'])
        child_epoch = 0

    if continue_train_controller:
        optimizer_state = torch.load(
            os.path.join(controller_params['model_dir'], 'optimizer.pt'))
        controller_optimizer = torch.optim.Adam(
            controller_model.parameters(),
            lr=controller_params['lr'],
            weight_decay=controller_params['weight_decay'])
        controller_optimizer.load_state_dict(optimizer_state)
        controller_epoch = torch.load(
            os.path.join(controller_params['model_dir'],
                         'misc.pt'))['epoch'] - 1
    else:
        controller_optimizer = torch.optim.Adam(
            controller_model.parameters(),
            lr=controller_params['lr'],
            weight_decay=controller_params['weight_decay'])
        controller_epoch = 0
    eval_every_epochs = child_params['eval_every_epochs']
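    # Alternate until the child reaches train_epochs: train the child on the
    # current arch pool, evaluate every architecture in the pool, retrain the
    # controller on the measured performance, and generate new candidates.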
    while True:
        # Train child model
        if child_params['arch_pool'] is None:
            arch_pool = generate_arch(
                controller_params['num_seed_arch'])  #[[arch]]
            child_params['arch_pool'] = arch_pool
        child_params['arch'] = None

        if isinstance(eval_every_epochs, int):
            child_params['eval_every_epochs'] = eval_every_epochs
        else:
            eval_every_epochs = list(map(int, eval_every_epochs))
            for index, e in enumerate(eval_every_epochs):
                if child_epoch < e:
                    child_params['eval_every_epochs'] = e
                    break

        for e in range(child_params['eval_every_epochs']):
            child_epoch += 1
            model_search.train(train_data, child_model, parallel_child_model,
                               child_optimizer, child_params, child_epoch)
            if child_epoch % child_params['eval_every_epochs'] == 0:
                save_checkpoint(child_model, child_optimizer, child_epoch,
                                child_params['model_dir'])
                logging.info('Saving Model!')
            if child_epoch >= child_params['train_epochs']:
                break

        # Evaluate seed archs
        valid_accuracy_list = model_search.evaluate(val_data, child_model,
                                                    parallel_child_model,
                                                    child_params,
                                                    eval_batch_size)

        # Output archs and evaluated error rate
        old_archs = child_params['arch_pool']
        old_archs_perf = valid_accuracy_list

        old_archs_sorted_indices = np.argsort(old_archs_perf)
        old_archs = np.array(old_archs)[old_archs_sorted_indices].tolist()
        old_archs_perf = np.array(
            old_archs_perf)[old_archs_sorted_indices].tolist()
        with open(
                os.path.join(child_params['model_dir'],
                             'arch_pool.{}'.format(child_epoch)), 'w') as fa:
            with open(
                    os.path.join(child_params['model_dir'],
                                 'arch_pool.perf.{}'.format(child_epoch)),
                    'w') as fp:
                with open(os.path.join(child_params['model_dir'], 'arch_pool'),
                          'w') as fa_latest:
                    with open(
                            os.path.join(child_params['model_dir'],
                                         'arch_pool.perf'), 'w') as fp_latest:
                        for arch, perf in zip(old_archs, old_archs_perf):
                            arch = ' '.join(map(str, arch))
                            fa.write('{}\n'.format(arch))
                            fa_latest.write('{}\n'.format(arch))
                            fp.write('{}\n'.format(perf))
                            fp_latest.write('{}\n'.format(perf))

        if child_epoch >= child_params['train_epochs']:
            logging.info('Training finished!')
            break

        # Train Encoder-Predictor-Decoder
        # [[arch]]
        encoder_input = list(map(lambda x: parse_arch_to_seq(x), old_archs))
        encoder_target = normalize_target(old_archs_perf)
        decoder_target = copy.copy(encoder_input)
        controller_params['batches_per_epoch'] = math.ceil(
            len(encoder_input) / controller_params['batch_size'])
        controller_epoch = controller.train(encoder_input, encoder_target,
                                            decoder_target, controller_model,
                                            parallel_controller_model,
                                            controller_optimizer,
                                            controller_params,
                                            controller_epoch)

        # Generate new archs
        new_archs = []
        controller_params['predict_lambda'] = 0
        top100_archs = list(
            map(lambda x: parse_arch_to_seq(x), old_archs[:100]))
        max_step_size = controller_params['max_step_size']
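        # Increase the predictor step size (predict_lambda) one unit at a time
        # until enough previously unseen architectures have been produced or
        # max_step_size is reached.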
        while len(new_archs) < controller_params['max_new_archs']:
            controller_params['predict_lambda'] += 1
            new_arch = controller.infer(top100_archs, controller_model,
                                        parallel_controller_model,
                                        controller_params)
            for arch in new_arch:
                if arch not in encoder_input and arch not in new_archs:
                    new_archs.append(arch)
                if len(new_archs) >= controller_params['max_new_archs']:
                    break
            logging.info('{} new archs generated now'.format(len(new_archs)))
            if controller_params['predict_lambda'] >= max_step_size:
                break
        #[[arch]]
        new_archs = list(map(lambda x: parse_seq_to_arch(x),
                             new_archs))  #[[arch]]
        num_new_archs = len(new_archs)
        logging.info("Generate {} new archs".format(num_new_archs))
        random_new_archs = generate_arch(50)
        new_arch_pool = old_archs[:len(old_archs) - num_new_archs -
                                  50] + new_archs + random_new_archs
        logging.info("Totally {} archs now to train".format(
            len(new_arch_pool)))
        child_params['arch_pool'] = new_arch_pool
        with open(os.path.join(child_params['model_dir'], 'arch_pool'),
                  'w') as f:
            for arch in new_arch_pool:
                arch = ' '.join(map(str, arch))
                f.write('{}\n'.format(arch))
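The helpers batchify and save_checkpoint are not shown in any of these
listings. The sketch below is reconstructed from how they are called and
reloaded above and follows the usual AWD-LSTM-style implementation; the
batchify signature varies between the examples (a cuda flag here, the full
args namespace in Example #2), so treat it as an illustration rather than the
exact code.

import os
import torch


def batchify(data, bsz, use_cuda):
    # Trim the flat token stream so it divides evenly into bsz columns, then
    # reshape to (seq_len, bsz) so each column is a contiguous text stream.
    nbatch = data.size(0) // bsz
    data = data.narrow(0, 0, nbatch * bsz)
    data = data.view(bsz, -1).t().contiguous()
    return data.cuda() if use_cuda else data


def save_checkpoint(model, optimizer, epoch, path):
    # Mirrors how Example #1 restores training state: the full model object,
    # the optimizer state dict, and the next epoch number in misc.pt.
    # (Example #2 passes an extra dist_name argument to keep per-worker files
    # apart; that variant is omitted here.)
    torch.save(model, os.path.join(path, 'model.pt'))
    torch.save(optimizer.state_dict(), os.path.join(path, 'optimizer.pt'))
    torch.save({'epoch': epoch + 1}, os.path.join(path, 'misc.pt'))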
Example #2
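# Distributed variant of the search: one worker process per GPU under SLURM,
# each with its own logger, TensorBoard writer, data shard, and
# DistributedDataParallel-wrapped search model.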
def worker(gpu, ngpus_per_node, config_in):
    # init
    config = copy.deepcopy(config_in)
    args = config
    jobid = os.environ["SLURM_JOBID"]
    procid = int(os.environ["SLURM_PROCID"])
    config.gpu = gpu

    if config.gpu is not None:
        writer_name = "tb.{}-{:d}-{:d}".format(jobid, procid, gpu)
        logger_name = "{}.{}-{:d}-{:d}.search.log".format(config.name, jobid, procid, gpu)
        model_name = "{}-{:d}-{:d}-model.pt".format(jobid, procid, gpu)
        optimizer_name = "{}-{:d}-{:d}-optimizer.pt".format(jobid, procid, gpu)
        msic_name = "{}-{:d}-{:d}-misc.pt".format(jobid, procid, gpu)
        ck_name = "{}-{:d}-{:d}".format(jobid, procid, gpu)
    else:
        writer_name = "tb.{}-{:d}-all".format(jobid, procid)
        logger_name = "{}.{}-{:d}-all.search.log".format(config.name, jobid, procid)
        model_name = "{}-{:d}-all-model.pt".format(jobid, procid)
        optimizer_name = "{}-{:d}-all-optimizer.pt".format(jobid, procid)
        msic_name = "{}-{:d}-all-misc.pt".format(jobid, procid)
        ck_name = "{}-{:d}-all".format(jobid, procid)

    writer = SummaryWriter(log_dir=os.path.join(config.path, writer_name))
    # writer.add_text('config', config.as_markdown(), 0)
    logger = get_logger(os.path.join(config.path, logger_name))

    # get cuda device
    device = torch.device('cuda', gpu)

    # ==============================  begin  ==============================
    logger.info("Logger is set - training start")
    logger.info('Args: {}'.format(args))

    if config.dist_url == "env://" and config.rank == -1:
        config.rank = int(os.environ["RANK"])

    if config.mp_dist:
        # For multiprocessing distributed training, rank needs to be the
        # global rank among all the processes
        config.rank = config.rank * ngpus_per_node + gpu
    # print('back:{}, dist_url:{}, world_size:{}, rank:{}'.format(config.dist_backend, config.dist_url, config.world_size, config.rank))
    dist.init_process_group(backend=config.dist_backend, init_method=config.dist_url,
                            world_size=config.world_size, rank=config.rank)

    # get data
    corpus = data.Corpus(args.data)

    eval_batch_size = 10
    test_batch_size = 1

    train_data = batchify(corpus.train, args.batch_size, args)
    search_data = batchify(corpus.valid, args.batch_size, args)
    val_data = batchify(corpus.valid, eval_batch_size, args)
    test_data = batchify(corpus.test, test_batch_size, args)

    # split data ( with respect to GPU_id)
    def split_set(set_in):
        per_set_length = set_in.size(0) // config.world_size
        return set_in[per_set_length * config.rank: per_set_length * (config.rank + 1)]
    train_data = split_set(train_data).to(device)
    search_data = split_set(search_data).to(device)
    val_data = split_set(val_data).to(device)
    test_data = split_set(test_data).to(device)

    if config.dist_privacy:
        logger.info("PRIVACY ENGINE ON")

    # build model
    ntokens = len(corpus.dictionary)
    if args.continue_train:
        model = torch.load(os.path.join(args.save, model_name))
    else:
        model = model_search.RNNModelSearch(ntokens, args.emsize, args.nhid, args.nhidlast,
                        args.dropout, args.dropouth, args.dropoutx, args.dropouti, args.dropoute)
    # make model distributed
    if config.gpu is not None:
        torch.cuda.set_device(config.gpu)
        # model = model.to(device)
        model.cuda(config.gpu)
        # When using a single GPU per process and per DistributedDataParallel, we need to divide
        # the batch size ourselves based on the total number of GPUs we have
        # config.batch_size = int(config.batch_size / ngpus_per_node)
        config.workers = int((config.workers + ngpus_per_node - 1) / ngpus_per_node)
        # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[config.rank])
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[config.gpu])
        # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=None, output_device=None)
    else:
        model.cuda()
        # DistributedDataParallel will divide and allocate batch_size to all
        # available GPUs if device_ids are not set
        model = torch.nn.parallel.DistributedDataParallel(model)

    architect = Architect(model.module, args)

    total_params = sum(x.data.nelement() for x in model.module.parameters())
    logger.info('Model total parameters: {}'.format(total_params))

    # Loop over epochs.
    lr = args.lr
    best_val_loss = []
    stored_loss = 100000000

    if args.continue_train:
        optimizer_state = torch.load(os.path.join(args.save, optimizer_name))
        if 't0' in optimizer_state['param_groups'][0]:
            optimizer = torch.optim.ASGD(model.module.parameters(), lr=args.lr, t0=0, lambd=0., weight_decay=args.wdecay)
        else:
            optimizer = torch.optim.SGD(model.module.parameters(), lr=args.lr, weight_decay=args.wdecay)
        optimizer.load_state_dict(optimizer_state)
    else:
        optimizer = torch.optim.SGD(model.module.parameters(), lr=args.lr, weight_decay=args.wdecay)

    for epoch in range(1, args.epochs+1):
        epoch_start_time = time.time()
        # train()
        train(model, architect, epoch, corpus, train_data, search_data, optimizer,
              device, logger, writer, args)

        val_loss = evaluate(model, corpus, args, val_data, eval_batch_size)
        logger.info('-' * 89)
        logger.info('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                        val_loss, math.exp(val_loss)))
        logger.info('-' * 89)

        writer.add_scalar('val/loss', val_loss, epoch)
        writer.add_scalar('val/ppl', math.exp(val_loss), epoch)

        if val_loss < stored_loss:
            save_checkpoint(model, optimizer, epoch, args.save, dist_name=ck_name)
            logger.info('Saving Normal!')
            stored_loss = val_loss

        best_val_loss.append(val_loss)

    test_loss = evaluate(model, corpus, args, test_data, test_batch_size)
    logger.info('=' * 89)
    logger.info('| End of training & Testing | test loss {:5.2f} | test ppl {:8.2f}'.format(
        test_loss, math.exp(test_loss)))
    logger.info('=' * 89)
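The worker signature above follows the torch.multiprocessing.spawn convention,
where the spawned function receives its local process (GPU) index as the first
argument. Below is a minimal launch sketch, assuming config already carries the
mp_dist, world_size, and gpu fields that worker reads; main() itself is
illustrative and not part of the original listing.

import torch
import torch.multiprocessing as mp


def main(config):
    ngpus_per_node = torch.cuda.device_count()
    if config.mp_dist:
        # world_size becomes the total process count across all nodes;
        # mp.spawn passes the local GPU index as worker()'s first argument.
        config.world_size = ngpus_per_node * config.world_size
        mp.spawn(worker, nprocs=ngpus_per_node, args=(ngpus_per_node, config))
    else:
        # Single process driving one GPU (or all GPUs when config.gpu is None).
        worker(config.gpu, ngpus_per_node, config)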
Example #3
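# Single-process setup: batchify the corpus splits, build (or reload) the RNN
# search model, and log its parameter count and initial genotype.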
train_data = batchify(corpus.train, args.batch_size)
search_data = batchify(corpus.valid, args.batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, test_batch_size)


ntokens = len(corpus.dictionary)
if args.continue_train:
    model = torch.load(os.path.join(args.save, "model.pt"))
else:
    model = model.RNNModelSearch(
        ntokens,
        args.emsize,
        args.nhid,
        args.nhidlast,
        args.dropout,
        args.dropouth,
        args.dropoutx,
        args.dropouti,
        args.dropoute,
    )

size = 0
for p in model.parameters():
    size += p.nelement()
logging.info("param size: {}".format(size))
logging.info("initial genotype:")
logging.info(model.genotype())


if torch.cuda.is_available():
    # The original listing is truncated here; moving the search model onto the
    # GPU is an assumed continuation rather than part of the source.
    model = model.cuda()
Example #4
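# Variant of the same setup with extra task-specific inputs (NER, POS, and
# position embedding dimensions plus a label set); the vocabulary size comes
# from vocab.id2word instead of corpus.dictionary.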
# val_data = batchify(corpus.valid, eval_batch_size, args)
# test_data = batchify(corpus.test, test_batch_size, args)

eval_batch_size = 10
test_batch_size = 1

# ntokens = len(corpus.dictionary)

ntokens = len(vocab.id2word)

if args.continue_train:
    model = torch.load(os.path.join(args.save, 'model.pt'))
else:
    model = model.RNNModelSearch(ntokens, args.emsize, args.nhid,
                                 args.nhidlast, args.dropout, args.dropouth,
                                 args.dropoutx, args.dropouti, args.dropoute,
                                 args.ner_dim,
                                 args.pos_dim, args.token_emb_path,
                                 len(constant.LABEL_TO_ID), args.pe_dim)

size = 0
for p in model.parameters():
    size += p.nelement()
logging.info('param size: {}'.format(size))
logging.info('initial genotype:')
logging.info(model.genotype())

if args.cuda:
    if args.single_gpu:
        parallel_model = model.cuda()
    else:
        parallel_model = nn.DataParallel(model, dim=1).cuda()