def __init__(self):
    self.decoder = SelfAttendedDecoder(vocab=data_utils.vocab, device_id=configs.decoder_device_id)
    # self.decoder.apply(init_weights)
    self.xe_loss = nn.CrossEntropyLoss(ignore_index=data_utils.vocab.padding_id)
    self.optimizer = DenseSparseAdam(self.decoder.parameters(), lr=configs.adam_lr)  # , lr=configs.lr)
    # self.optimizer = optim.ASGD(self.model.parameters(), lr=configs.lr, weight_decay=configs.l2_weight_decay)
    self.lr_scheduler = lr_scheduler.ReduceLROnPlateau(
        self.optimizer, 'min',
        patience=configs.lr_scheduler_patience,
        factor=configs.lr_scheduler_factor, verbose=True
    )
    self.epoch_idx = 0
    self.min_ppl = 1000.
    self.ckpt_path = 'decoder.pretrained.params'
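Every example on this page reaches for DenseSparseAdam for the same reason: at least one module (typically an nn.Embedding built with sparse=True) produces sparse gradients, which plain torch.optim.Adam rejects. The sketch below is not taken from any of these examples; the allennlp.training.optimizers import path is an assumption, and some versions of the optimizer expect named_parameters() tuples rather than a plain parameter list (as in Examples #3 and #6).

# Minimal sketch, assuming an AllenNLP-style DenseSparseAdam import (not from the examples above).
import torch
import torch.nn as nn
from allennlp.training.optimizers import DenseSparseAdam  # assumed import path

embedder = nn.Embedding(num_embeddings=10000, embedding_dim=64, sparse=True)  # sparse gradients
head = nn.Linear(64, 2)                                                       # dense gradients
optimizer = DenseSparseAdam(list(embedder.parameters()) + list(head.parameters()), lr=5e-4)

tokens = torch.randint(0, 10000, (8, 16))        # [batch, seq_len]
logits = head(embedder(tokens).mean(dim=1))      # [batch, 2]
loss = nn.functional.cross_entropy(logits, torch.zeros(8, dtype=torch.long))

optimizer.zero_grad()
loss.backward()   # embedder.weight.grad is a sparse tensor, head gradients are dense
optimizer.step()  # DenseSparseAdam applies Adam-style updates to both kinds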
Example #2
def main():
    data_holder, task2id, id2task, num_feat, num_voc, num_char, tgt_dict, embeddings = Dataloader_onto.multitask_dataloader(
        pkl_path, num_task=num_task, batch_size=BATCH_SIZE)
    para = model_para
    task2label = {"conll2000": "chunk", "wsjpos": "POS", "ontonotes": "NER"}
    #task2label = {"conll2000": "chunk", "wsjpos": "POS", "conll2003": "NER"}
    #logger = Logger('./logs/'+str(args.gpu))
    para["id2task"] = id2task
    para["n_feats"] = num_feat
    para["n_vocs"] = num_voc
    para["n_tasks"] = num_task
    para["out_size"] = [
        len(tgt_dict[task2label[id2task[ids]]]) for ids in range(num_task)
    ]
    para["n_chars"] = num_char
    model = Model_s.build_model_cnn(para)
    model.Word_embeddings.apply_weights(embeddings)

    params = list(filter(lambda p: p.requires_grad, model.parameters()))
    num_params = sum(p.numel() for p in model.parameters())
    print(model)
    print("Num of paras:", num_params)
    print(model.concat_flag)

    def lr_decay(optimizer, epoch, decay_rate=0.05, init_lr=0.015):
        lr = init_lr / (1 + decay_rate * epoch)
        print(" Learning rate is set as:", lr)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        return optimizer

    def exp_lr_decay(optimizer, epoch, decay_rate=0.05, init_lr=0.015):
        lr = init_lr * decay_rate**epoch
        print(" Learning rate is set as:", lr)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        return optimizer

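    # Optimizer selection: "noam" wraps DenseSparseAdam in a Noam warmup schedule,
    # while "sgd" and "adam" fall back to plain torch.optim; every branch receives
    # only the trainable parameters filtered above.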
    if args.optim == "noam":
        model_optim = optim_custorm.NoamOpt(
            para["d_hid"], 1, 1000,
            DenseSparseAdam(params, lr=0.0015, betas=(0.9, 0.98), eps=1e-9))
        args.decay = None
    elif args.optim == "sgd":
        model_optim = optim.SGD(params,
                                lr=0.015,
                                momentum=args.momentum,
                                weight_decay=1e-8)
    elif args.optim == "adam":
        model_optim = optim.Adam(params,
                                 lr=0.0,
                                 betas=(0.9, 0.98),
                                 eps=1e-9,
                                 weight_decay=1e-8)
    if args.mode == "train":
        best_F1 = 0
        if not para["crf"]:
            calculate_loss = nn.NLLLoss()
        else:
            calculate_loss = None
            #calculate_loss = [CRFLoss_vb(len(tgt_dict[task2label[id2task[idx]]])+2, len(tgt_dict[task2label[id2task[idx]]]), len(tgt_dict[task2label[id2task[idx]]])+1) for idx in range(num_task)]
            #if USE_CUDA:
            #    for x in calculate_loss:
            #        x = x.cuda()
        print("Start training...")
        print('-' * 60)
        KLLoss = None  #nn.KLDivLoss()
        start_point = time.time()
        for epoch_idx in range(NUM_EPOCH):
            if args.optim == "sgd":
                if args.decay == "exp":
                    model_optim = exp_lr_decay(model_optim, epoch_idx)
                elif args.decay == "normal":
                    model_optim = lr_decay(model_optim, epoch_idx)
            Pre, Rec, F1, loss_list = run_epoch(model, data_holder,
                                                model_optim, calculate_loss,
                                                KLLoss, para, epoch_idx,
                                                id2task)

            use_time = time.time() - start_point
            print("Time using: %f mins" % (use_time / 60))
            if not best_F1 or best_F1 < F1:
                best_F1 = F1
                Model_s.save_model(model_path, model, para)
                print('*' * 60)
                print(
                    "Save model with average Pre: %f, Rec: %f, F1: %f on dev set."
                    % (Pre, Rec, F1))
                save_idx = epoch_idx
                print('*' * 60)
        print("save model at epoch:", save_idx)

    else:
        para_path = os.path.join(path, 'para.pkl')
        with open(para_path, "rb") as f:
            para_save = pickle.load(f)
        model = Model_s.build_model(para_save)
        model = Model_s.read_model(model_path, model)
        prec_list, rec_list, f1_list = infer(model, data_holder, "test")
Example #3
def get_optimizer(self):
    # return AdamWOptimizer(self.named_parameters(), lr=5e-4, weight_decay=0.0001)
    return DenseSparseAdam(self.named_parameters(), lr=5e-4)

class DecoderTrainer:
    def __init__(self):
        self.decoder = SelfAttendedDecoder(vocab=data_utils.vocab, device_id=configs.decoder_device_id)
        # self.decoder.apply(init_weights)
        self.xe_loss = nn.CrossEntropyLoss(ignore_index=data_utils.vocab.padding_id)
        self.optimizer = DenseSparseAdam(self.decoder.parameters(), lr=configs.adam_lr) # , lr=configs.lr)
        # self.optimizer = optim.ASGD(self.model.parameters(), lr=configs.lr, weight_decay=configs.l2_weight_decay)
        self.lr_scheduler = lr_scheduler.ReduceLROnPlateau(
            self.optimizer, 'min',
            patience=configs.lr_scheduler_patience,
            factor=configs.lr_scheduler_factor, verbose=True
        )
        self.epoch_idx = 0
        self.min_ppl = 1000.
        self.ckpt_path = 'decoder.pretrained.params'

    def train(self):
        # self.load_ckpt()
        start_epoch_idx = self.epoch_idx

        for epoch_idx in range(start_epoch_idx, configs.epoch_num):
            self.epoch_idx = epoch_idx

            log(f'starting epoch {epoch_idx}')
            log('training')

            self.decoder.train()
            avg_epoch_loss = 0.
            batch_num = 0
            next_logging_pct = .5

            start_time = time.time()

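            # Teacher-forced LM pass: the decoder scores every position of each report,
            # and the cross-entropy loss compares those scores against the same report
            # shifted by one token, so each token is predicted from its prefix.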
            for pct, (report_batch, report_len_batch) in data_utils.gen_report_batches('train'):
                batch_num += 1
                self.optimizer.zero_grad()
                max_report_len, batch_size = report_batch.shape

                # [max_report_len - 1, batch_size, vocab_size]
                word_logits_seq_batch = self.decoder.run_lang_model(
                    # [max_report_len, batch_size], [batch_size]
                    report_batch, report_len_batch
                )
                loss = self.xe_loss(
                    # [(max_report_len - 1) * batch_size, vocab_size]
                    word_logits_seq_batch.view(-1, data_utils.vocab.size),
                    # [(max_report_len - 1) * batch_size]
                    report_batch[1:, :].contiguous().view(-1).to(torch.device(configs.decoder_device_id))
                )

                # print(torch.argmax(output_batch[:50, :5, :], dim=-1))

                loss.backward()
                self.optimizer.step()
                avg_epoch_loss += loss.item()

                if pct >= next_logging_pct:
                    log(
                        f'{int(pct)}%, avg_train_loss: {avg_epoch_loss / batch_num}, '
                        f'time: {time.time() - start_time}'
                    )
                    next_logging_pct += 10.

            avg_epoch_loss /= batch_num

            log(
                f'avg_train_loss: {avg_epoch_loss}\n'
                f'avg_train_time: {time.time() - start_time}'
            )

            with torch.no_grad():
                log('validating')

                self.decoder.eval()
                batch_num = 0
                avg_epoch_ppl = 0
                next_logging_pct = 10.

                start_time = time.time()

                for pct, (report_batch, report_len_batch) in data_utils.gen_report_batches('test'):
                    batch_num += 1

                    max_report_len, batch_size = report_batch.shape

                    # [max_report_len - 1, batch_size, vocab_size]
                    word_logits_seq_batch = self.decoder.run_lang_model(
                        # [max_report_len, batch_size], [batch_size]
                        report_batch, report_len_batch
                    )
                    loss = self.xe_loss(
                        # [(max_report_len - 1) * batch_size, vocab_size]
                        word_logits_seq_batch.view(-1, data_utils.vocab.size),
                        # [(max_report_len - 1) * batch_size]
                        report_batch[1:, :].contiguous().view(-1).to(torch.device(configs.decoder_device_id))
                    )

                    # print(torch.argmax(output_batch[:50, :5, :], dim=-1))

                    avg_epoch_ppl += math.exp(loss.item())

                    if pct >= next_logging_pct:
                        log(
                            f'{int(pct)}%, avg_dev_ppl: {avg_epoch_ppl / batch_num}, '
                            f'time: {time.time() - start_time}'
                        )
                        next_logging_pct += 10.

                avg_epoch_ppl /= batch_num
                self.lr_scheduler.step(avg_epoch_ppl)

                log(
                    f'avg_dev_time: {time.time() - start_time}\n'
                    f'avg_dev_ppl: {avg_epoch_ppl}'
                )

                if avg_epoch_ppl < self.min_ppl:
                    self.min_ppl = avg_epoch_ppl
                    self.save_ckpt()

    def get_ckpt(self):
        return {
            'epoch_idx': self.epoch_idx,
            'min_ppl': self.min_ppl,
            'decoder': self.decoder.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'lr_scheduler': self.lr_scheduler.state_dict()
        }

    def set_ckpt(self, ckpt_dict):
        self.epoch_idx = ckpt_dict['epoch_idx'] + 1
        self.min_ppl = ckpt_dict['min_ppl']

        if configs.uses_gumbel_softmax:
            ckpt_dict['decoder']['embedder.weight'] = \
                ckpt_dict['decoder']['embedder.weight'].to(torch.device(configs.decoder_device_id))
        else:
            ckpt_dict['decoder']['embedder.weight'] = \
                ckpt_dict['decoder']['embedder.weight'].cpu()

        self.decoder.load_state_dict(ckpt_dict['decoder'])

        if configs.uses_new_optimizer:
            self.optimizer.load_state_dict(ckpt_dict['optimizer'])
            self.lr_scheduler.load_state_dict(ckpt_dict['lr_scheduler'])
        del ckpt_dict
        torch.cuda.empty_cache()

    ckpt = property(get_ckpt, set_ckpt)

    def save_ckpt(self):
        torch.save(self.ckpt, f=self.ckpt_path)
        print(f'saved checkpoint to {self.ckpt_path}')

    def load_ckpt(self):
        self.ckpt = torch.load(self.ckpt_path)
        print(f'loaded checkpoint from {self.ckpt_path}')
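A resume-from-checkpoint flow for this trainer is a short sketch using only the methods defined above (no new API):

# Hypothetical resume flow built from DecoderTrainer's own methods.
trainer = DecoderTrainer()
trainer.load_ckpt()   # restores decoder weights, optimizer/scheduler state, epoch_idx and min_ppl via the ckpt property
trainer.train()       # continues from the epoch after the one stored in the checkpoint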
Example #5
def train():
    cmd = argparse.ArgumentParser(sys.argv[0], conflict_handler='resolve')
    cmd.add_argument('--seed', default=1, type=int, help='The random seed.')
    cmd.add_argument('--gpu',
                     default=-1,
                     type=int,
                     help='Use id of gpu, -1 if cpu.')
    cmd.add_argument('--format',
                     default='plain',
                     choices=('plain', 'pickle'),
                     help='the input format.')
    cmd.add_argument('--train_path',
                     required=True,
                     help='The path to the training file.')
    cmd.add_argument('--vocab_path',
                     required=True,
                     help='The path to the vocabulary.')
    cmd.add_argument('--valid_path', help='The path to the development file.')
    cmd.add_argument('--test_path', help='The path to the testing file.')
    cmd.add_argument('--config_path',
                     required=True,
                     help='the path to the config file.')
    cmd.add_argument("--model", required=True, help="path to save model")
    cmd.add_argument("--batch_size",
                     "--batch",
                     type=int,
                     default=32,
                     help='the batch size.')
    cmd.add_argument("--max_epoch",
                     type=int,
                     default=100,
                     help='the maximum number of iteration.')
    cmd.add_argument('--max_sent_len',
                     type=int,
                     default=20,
                     help='maximum sentence length.')
    cmd.add_argument('--bucket',
                     action='store_true',
                     default=False,
                     help='do bucket batching.')
    cmd.add_argument('--save_classify_layer',
                     default=False,
                     action='store_true',
                     help="whether to save the classify layer")
    cmd.add_argument(
        '--always_save',
        default=False,
        action='store_true',
        help="always saving the model, appending global epoch id.")
    cmd.add_argument('--valid_size',
                     type=int,
                     default=0,
                     help="size of validation dataset when there's no valid.")
    cmd.add_argument('--eval_steps',
                     type=int,
                     help='evaluate every xx batches.')
    cmd.add_argument('--report_steps',
                     type=int,
                     default=32,
                     help='report every xx batches.')

    opt = cmd.parse_args(sys.argv[2:])

    with open(opt.config_path, 'r') as fin:
        conf = json.load(fin)

    # Dump configurations
    logger.info('hostname: {}'.format(socket.gethostname()))
    logger.info('cmd opt: {}'.format(opt))
    logger.info('model configuration: {}'.format(conf))

    # Set seed.
    torch.manual_seed(opt.seed)
    random.seed(opt.seed)
    np.random.seed(opt.seed)
    if opt.gpu >= 0:
        torch.cuda.set_device(opt.gpu)
        if opt.seed > 0:
            torch.cuda.manual_seed(opt.seed)

    use_cuda = opt.gpu >= 0 and torch.cuda.is_available()
    use_fp16 = False
    if conf['optimizer'].get('fp16', False):
        use_fp16 = True
        if not use_cuda:
            logger.warning(
                'WARNING: fp16 requires --gpu, ignoring fp16 option')
            conf['optimizer']['fp16'] = False
            use_fp16 = False
        else:
            try:
                from apex.fp16_utils import FP16_Optimizer
            except ImportError:
                logger.warning('WARNING: apex not installed, ignoring fp16 option')
                conf['optimizer']['fp16'] = False
                use_fp16 = False

    c = conf['token_embedder']
    token_embedder_max_chars = c.get('max_characters_per_token', None)

    # Load training data.
    if opt.format == 'plain':
        raw_training_data = read_corpus(opt.train_path, opt.max_sent_len,
                                        token_embedder_max_chars)
    else:
        raw_training_data = pickle.load(open(opt.train_path, 'rb'))
    logger.info('training instance: {}, training tokens: {}.'.format(
        len(raw_training_data), count_tokens(raw_training_data)))

    # Load valid data if path is provided, else use 10% of training data as valid data
    if opt.valid_path is not None:
        if opt.format == 'plain':
            raw_valid_data = read_corpus(opt.valid_path, opt.max_sent_len,
                                         token_embedder_max_chars)
        else:
            raw_valid_data = pickle.load(open(opt.valid_path, 'rb'))
        logger.info('valid instance: {}, valid tokens: {}.'.format(
            len(raw_valid_data), count_tokens(raw_valid_data)))
    elif opt.valid_size > 0:
        raw_training_data, raw_valid_data = split_train_and_valid(
            raw_training_data, opt.valid_size)
        logger.info(
            'training instance: {}, training tokens after division: {}.'.
            format(len(raw_training_data), count_tokens(raw_training_data)))
        logger.info('valid instance: {}, valid tokens: {}.'.format(
            len(raw_valid_data), count_tokens(raw_valid_data)))
    else:
        raw_valid_data = None

    # Load test data if path is provided.
    if opt.test_path is not None:
        if opt.format == 'plain':
            raw_test_data = read_corpus(opt.test_path, opt.max_sent_len,
                                        token_embedder_max_chars)
        else:
            raw_test_data = pickle.load(open(opt.test_path, 'rb'))
        logger.info('testing instance: {}, testing tokens: {}.'.format(
            len(raw_test_data), count_tokens(raw_test_data)))
    else:
        raw_test_data = None

    # Initialize the vocab batch.
    vocab_batch = VocabBatch(
        lower=not conf['classifier'].get('vocab_cased', True),
        normalize_digits=conf['classifier'].get('vocab_digit_normalized',
                                                False),
        use_cuda=use_cuda)
    vocab_batch.create_dict_from_file(opt.vocab_path)

    # Word
    if c.get('word_dim', 0) > 0:
        word_batch = WordBatch(min_cut=c.get('word_min_cut', 0),
                               lower=not c.get('word_cased', True),
                               add_sentence_boundary=c.get(
                                   'add_sentence_boundary_ids', False),
                               use_cuda=use_cuda)
        word_batch.create_dict_from_dataset(raw_training_data)
    else:
        word_batch = None

    # SubToken
    if c.get('char_dim', 0) > 0 or c.get('wordpiece_dim', 0) > 0:
        min_char = max([w
                        for w, _ in c['filters']]) if c['name'] == 'cnn' else 1
        if c.get('char_dim', 0) > 0:
            char_batch = CharacterBatch(
                min_char=min_char,
                lower=not c.get('char_cased', True),
                add_sentence_boundary=c.get('add_sentence_boundary_ids',
                                            False),
                add_word_boundary=c.get('add_word_boundary_ids', False),
                use_cuda=use_cuda)
        else:
            char_batch = WordPieceBatch(
                min_char=min_char,
                vocab_file=c['wordpiece_vocab'],
                lower=not c.get('word_cased', True),
                add_sentence_boundary=c.get('add_sentence_boundary_ids',
                                            False),
                add_word_boundary=c.get('add_word_boundary_ids', False),
                use_cuda=use_cuda)
        char_batch.create_dict_from_dataset(raw_training_data)
    else:
        char_batch = None

    logger.info('vocab size: {0}'.format(len(vocab_batch.mapping)))
    n_classes = len(vocab_batch.mapping)

    model = Model(conf, word_batch, char_batch, n_classes)

    logger.info(str(model))
    if use_cuda:
        model = model.cuda()
        if use_fp16:
            model = model.half()

    # Create training batch
    if opt.bucket:
        training_batcher = BucketBatcher(raw_training_data, word_batch,
                                         char_batch, vocab_batch,
                                         opt.batch_size)
    else:
        training_batcher = Batcher(raw_training_data,
                                   word_batch,
                                   char_batch,
                                   vocab_batch,
                                   opt.batch_size,
                                   keep_full=False,
                                   sorting=True,
                                   shuffle=True)

    # Set up evaluation steps.
    if opt.eval_steps is None:
        opt.eval_steps = training_batcher.num_batches()
    logger.info('Evaluate every {0} batches.'.format(opt.eval_steps))

    # If there is valid, create valid batch.
    if raw_valid_data is not None:
        valid_batcher = Batcher(raw_valid_data,
                                word_batch,
                                char_batch,
                                vocab_batch,
                                opt.batch_size,
                                keep_full=True,
                                sorting=True,
                                shuffle=False)
    else:
        valid_batcher = None

    # If there is test, create test batch.
    if raw_test_data is not None:
        test_batcher = Batcher(raw_test_data,
                               word_batch,
                               char_batch,
                               vocab_batch,
                               opt.batch_size,
                               keep_full=True,
                               sorting=True,
                               shuffle=False)
    else:
        test_batcher = None

    # Save metadata (dictionaries and the config) alongside the model.
    try:
        os.makedirs(opt.model)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise

    if char_batch is not None:
        with codecs.open(os.path.join(opt.model, 'char.dic'),
                         'w',
                         encoding='utf-8') as fpo:
            for ch, i in char_batch.mapping.items():
                print('{0}\t{1}'.format(ch, i), file=fpo)

    if word_batch is not None:
        with codecs.open(os.path.join(opt.model, 'word.dic'),
                         'w',
                         encoding='utf-8') as fpo:
            for w, i in word_batch.mapping.items():
                print('{0}\t{1}'.format(w, i), file=fpo)

    with codecs.open(os.path.join(opt.model, 'vocab.dic'),
                     'w',
                     encoding='utf-8') as fpo:
        for w, i in vocab_batch.mapping.items():
            print('{0}\t{1}'.format(w, i), file=fpo)

    new_config_path = os.path.join(opt.model,
                                   os.path.basename(opt.config_path))
    shutil.copy(opt.config_path, new_config_path)
    opt.config_path = new_config_path
    json.dump(
        vars(opt),
        codecs.open(os.path.join(opt.model, 'config.json'),
                    'w',
                    encoding='utf-8'))

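    # Build the optimizer from the 'optimizer' section of the config; both 'adam'
    # and 'dense_sparse_adam' map to DenseSparseAdam so sparse embedding gradients
    # are handled, and an unknown name raises a ValueError.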
    c = conf['optimizer']

    optimizer_name = c['type'].lower()
    params = filter(lambda param: param.requires_grad, model.parameters())
    if optimizer_name == 'adamax':
        optimizer = torch.optim.Adamax(params,
                                       lr=c.get('lr', 2e-3),
                                       betas=c.get('betas', (0.9, 0.999)),
                                       eps=c.get('eps', 1e-8))
    elif optimizer_name == 'sgd':
        optimizer = torch.optim.SGD(params,
                                    lr=c.get('lr', 0.01),
                                    momentum=c.get('momentum', 0),
                                    nesterov=c.get('nesterov', False))
    elif optimizer_name == 'dense_sparse_adam' or optimizer_name == 'adam':
        optimizer = DenseSparseAdam(params,
                                    lr=c.get('lr', 1e-3),
                                    betas=c.get('betas', (0.9, 0.999)),
                                    eps=c.get('eps', 1e-8))
    else:
        raise ValueError('Unknown optimizer name: {0}'.format(optimizer_name))

    if use_fp16:
        optimizer = FP16_Optimizer(optimizer,
                                   static_loss_scale=1.,
                                   dynamic_loss_scale=True,
                                   dynamic_loss_args={'init_scale': 2**16})

    scheduler_name = c.get('scheduler', 'noam')
    if scheduler_name == 'cosine':
        scheduler = CosineWithRestarts(optimizer,
                                       c['max_step'],
                                       eta_min=c.get('eta_min', 0.0))
    elif scheduler_name == 'dev_perf':
        scheduler = LearningRateScheduler.by_name('reduce_on_plateau')(
            optimizer,
            factor=c.get('decay_rate', 0.5),
            patience=c.get('patience', 5),
            min_lr=c.get('lr_min', 1e-6))
    elif scheduler_name == 'noam':
        scheduler = NoamLR(optimizer,
                           model_size=c.get('model_size', 512),
                           warmup_steps=c.get('warmup_step', 6000))
    else:
        scheduler = None

    best_train, best_valid, test_result = 1e8, 1e8, 1e8
    for epoch in range(opt.max_epoch):
        best_train, best_valid, test_result, improved = train_model(
            epoch, conf, opt, model, optimizer, scheduler, training_batcher,
            valid_batcher, test_batcher, best_train, best_valid, test_result)

    if raw_valid_data is None:
        logger.info("best train ppl: {:.6f}.".format(best_train))
    elif raw_test_data is None:
        logger.info("best train ppl: {:.6f}, best valid ppl: {:.6f}.".format(
            best_train, best_valid))
    else:
        logger.info(
            "best train ppl: {:.6f}, best valid ppl: {:.6f}, test ppl: {:.6f}."
            .format(best_train, best_valid, test_result))
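This example reads all optimizer and scheduler hyper-parameters from the JSON config; the sketch below lists the main keys the code above looks up. The dictionary shape and values are placeholders, not taken from a real config shipped with the project.

# Hypothetical 'optimizer' section of the JSON config consumed above (values are placeholders).
conf = {
    "optimizer": {
        "type": "dense_sparse_adam",   # or "adamax" / "sgd" / "adam"
        "lr": 1e-3,
        "betas": [0.9, 0.999],
        "eps": 1e-8,
        "fp16": False,                 # True requires a GPU and apex
        "scheduler": "noam",           # or "cosine" / "dev_perf"; other values disable scheduling
        "model_size": 512,             # used by the noam scheduler
        "warmup_step": 6000,           # used by the noam scheduler
    }
}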
Example #6
def get_optimizer(self):
    return DenseSparseAdam(self.named_parameters(), lr=5e-4)
    # return AdadeltaOptimizer(self.named_parameters())