Example #1
def test_registry_has_builtin_learning_rate_schedulers(self):
    all_schedulers = {
        "step": torch.optim.lr_scheduler.StepLR,
        "multi_step": torch.optim.lr_scheduler.MultiStepLR,
        "exponential": torch.optim.lr_scheduler.ExponentialLR,
        "reduce_on_plateau": torch.optim.lr_scheduler.ReduceLROnPlateau,
    }
    for key, value in all_schedulers.items():
        assert LearningRateScheduler.by_name(key) == value
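
The test above only checks that each registered name resolves to the expected torch.optim.lr_scheduler class. Below is a minimal sketch of how such a name-to-class registry is typically consumed; the plain dict stands in for LearningRateScheduler.by_name, and the model, optimizer, and scheduler arguments are illustrative assumptions, not part of the library under test.

import torch

# Stand-in registry mirroring the mapping asserted in the test above.
SCHEDULERS = {
    "step": torch.optim.lr_scheduler.StepLR,
    "multi_step": torch.optim.lr_scheduler.MultiStepLR,
    "exponential": torch.optim.lr_scheduler.ExponentialLR,
    "reduce_on_plateau": torch.optim.lr_scheduler.ReduceLROnPlateau,
}

model = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Instantiate a scheduler from its registered name, e.g. a string read from a config file.
scheduler = SCHEDULERS["step"](optimizer, step_size=10, gamma=0.5)

for epoch in range(3):
    optimizer.step()   # training step elided
    scheduler.step()   # advance the learning-rate schedule once per epoch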
Example #2
def train():
    cmd = argparse.ArgumentParser(sys.argv[0], conflict_handler='resolve')
    cmd.add_argument('--seed', default=1, type=int, help='The random seed.')
    cmd.add_argument('--gpu',
                     default=-1,
                     type=int,
                     help='Use id of gpu, -1 if cpu.')
    cmd.add_argument('--format',
                     default='plain',
                     choices=('plain', 'pickle'),
                     help='the input format.')
    cmd.add_argument('--train_path',
                     required=True,
                     help='The path to the training file.')
    cmd.add_argument('--vocab_path',
                     required=True,
                     help='The path to the vocabulary.')
    cmd.add_argument('--valid_path', help='The path to the development file.')
    cmd.add_argument('--test_path', help='The path to the testing file.')
    cmd.add_argument('--config_path',
                     required=True,
                     help='the path to the config file.')
    cmd.add_argument("--model", required=True, help="path to save model")
    cmd.add_argument("--batch_size",
                     "--batch",
                     type=int,
                     default=32,
                     help='the batch size.')
    cmd.add_argument("--max_epoch",
                     type=int,
                     default=100,
                     help='the maximum number of epochs.')
    cmd.add_argument('--max_sent_len',
                     type=int,
                     default=20,
                     help='maximum sentence length.')
    cmd.add_argument('--bucket',
                     action='store_true',
                     default=False,
                     help='do bucket batching.')
    cmd.add_argument('--save_classify_layer',
                     default=False,
                     action='store_true',
                     help="whether to save the classify layer")
    cmd.add_argument(
        '--always_save',
        default=False,
        action='store_true',
        help="always saving the model, appending global epoch id.")
    cmd.add_argument('--valid_size',
                     type=int,
                     default=0,
                     help="size of validation dataset when there's no valid.")
    cmd.add_argument('--eval_steps',
                     type=int,
                     help='evaluate every xx batches.')
    cmd.add_argument('--report_steps',
                     type=int,
                     default=32,
                     help='report every xx batches.')

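    # argv[1] is assumed to hold the sub-command (e.g. 'train') consumed by an
    # outer dispatcher, so option parsing starts at argv[2].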
    opt = cmd.parse_args(sys.argv[2:])

    with open(opt.config_path, 'r') as fin:
        conf = json.load(fin)

    # Dump configurations
    logger.info('hostname: {}'.format(socket.gethostname()))
    logger.info('cmd opt: {}'.format(opt))
    logger.info('model configuration: {}'.format(conf))

    # Set seed.
    torch.manual_seed(opt.seed)
    random.seed(opt.seed)
    np.random.seed(opt.seed)
    if opt.gpu >= 0:
        torch.cuda.set_device(opt.gpu)
        if opt.seed > 0:
            torch.cuda.manual_seed(opt.seed)

    use_cuda = opt.gpu >= 0 and torch.cuda.is_available()
    use_fp16 = False
    if conf['optimizer'].get('fp16', False):
        use_fp16 = True
        if not use_cuda:
            logger.warning(
                'WARNING: fp16 requires --gpu, ignoring fp16 option')
            conf['optimizer']['fp16'] = False
            use_fp16 = False
        else:
            try:
                from apex.fp16_utils import FP16_Optimizer
            except ImportError:
                logger.warning(
                    'WARNING: apex is not installed, ignoring fp16 option')
                conf['optimizer']['fp16'] = False
                use_fp16 = False

    c = conf['token_embedder']
    token_embedder_max_chars = c.get('max_characters_per_token', None)

    # Load training data.
    if opt.format == 'plain':
        raw_training_data = read_corpus(opt.train_path, opt.max_sent_len,
                                        token_embedder_max_chars)
    else:
        raw_training_data = pickle.load(open(opt.train_path, 'rb'))
    logger.info('training instance: {}, training tokens: {}.'.format(
        len(raw_training_data), count_tokens(raw_training_data)))

    # Load valid data if a path is provided; otherwise, if --valid_size > 0,
    # hold that many instances out of the training data as validation data.
    if opt.valid_path is not None:
        if opt.format == 'plain':
            raw_valid_data = read_corpus(opt.valid_path, opt.max_sent_len,
                                         token_embedder_max_chars)
        else:
            raw_valid_data = pickle.load(open(opt.valid_path, 'rb'))
        logger.info('valid instance: {}, valid tokens: {}.'.format(
            len(raw_valid_data), count_tokens(raw_valid_data)))
    elif opt.valid_size > 0:
        raw_training_data, raw_valid_data = split_train_and_valid(
            raw_training_data, opt.valid_size)
        logger.info(
            'training instance: {}, training tokens after division: {}.'.
            format(len(raw_training_data), count_tokens(raw_training_data)))
        logger.info('valid instance: {}, valid tokens: {}.'.format(
            len(raw_valid_data), count_tokens(raw_valid_data)))
    else:
        raw_valid_data = None

    # Load test data if path is provided.
    if opt.test_path is not None:
        if opt.format == 'plain':
            raw_test_data = read_corpus(opt.test_path, opt.max_sent_len,
                                        token_embedder_max_chars)
        else:
            raw_test_data = pickle.load(open(opt.test_path, 'rb'))
        logger.info('testing instance: {}, testing tokens: {}.'.format(
            len(raw_test_data), count_tokens(raw_test_data)))
    else:
        raw_test_data = None

    # Initialize the vocab batcher and build its dictionary from the vocabulary file.
    vocab_batch = VocabBatch(
        lower=not conf['classifier'].get('vocab_cased', True),
        normalize_digits=conf['classifier'].get('vocab_digit_normalized',
                                                False),
        use_cuda=use_cuda)
    vocab_batch.create_dict_from_file(opt.vocab_path)

    # Word-level batcher (only if the token embedder uses word embeddings).
    if c.get('word_dim', 0) > 0:
        word_batch = WordBatch(min_cut=c.get('word_min_cut', 0),
                               lower=not c.get('word_cased', True),
                               add_sentence_boundary=c.get(
                                   'add_sentence_boundary_ids', False),
                               use_cuda=use_cuda)
        word_batch.create_dict_from_dataset(raw_training_data)
    else:
        word_batch = None

    # Sub-token batcher: characters or wordpieces, depending on the token embedder config.
    if c.get('char_dim', 0) > 0 or c.get('wordpiece_dim', 0) > 0:
        min_char = max([w
                        for w, _ in c['filters']]) if c['name'] == 'cnn' else 1
        if c.get('char_dim', 0) > 0:
            char_batch = CharacterBatch(
                min_char=min_char,
                lower=not c.get('char_cased', True),
                add_sentence_boundary=c.get('add_sentence_boundary_ids',
                                            False),
                add_word_boundary=c.get('add_word_boundary_ids', False),
                use_cuda=use_cuda)
        else:
            char_batch = WordPieceBatch(
                min_char=min_char,
                vocab_file=c['wordpiece_vocab'],
                lower=not c.get('word_cased', True),
                add_sentence_boundary=c.get('add_sentence_boundary_ids',
                                            False),
                add_word_boundary=c.get('add_word_boundary_ids', False),
                use_cuda=use_cuda)
        char_batch.create_dict_from_dataset(raw_training_data)
    else:
        char_batch = None

    logger.info('vocab size: {0}'.format(len(vocab_batch.mapping)))
    n_classes = len(vocab_batch.mapping)

    model = Model(conf, word_batch, char_batch, n_classes)

    logger.info(str(model))
    if use_cuda:
        model = model.cuda()
        if use_fp16:
            model = model.half()

    # Create training batch
    if opt.bucket:
        training_batcher = BucketBatcher(raw_training_data, word_batch,
                                         char_batch, vocab_batch,
                                         opt.batch_size)
    else:
        training_batcher = Batcher(raw_training_data,
                                   word_batch,
                                   char_batch,
                                   vocab_batch,
                                   opt.batch_size,
                                   keep_full=False,
                                   sorting=True,
                                   shuffle=True)

    # Set up evaluation steps.
    if opt.eval_steps is None:
        opt.eval_steps = training_batcher.num_batches()
    logger.info('Evaluate every {0} batches.'.format(opt.eval_steps))

    # If there is valid data, create the valid batcher.
    if raw_valid_data is not None:
        valid_batcher = Batcher(raw_valid_data,
                                word_batch,
                                char_batch,
                                vocab_batch,
                                opt.batch_size,
                                keep_full=True,
                                sorting=True,
                                shuffle=False)
    else:
        valid_batcher = None

    # If there is test data, create the test batcher.
    if raw_test_data is not None:
        test_batcher = Batcher(raw_test_data,
                               word_batch,
                               char_batch,
                               vocab_batch,
                               opt.batch_size,
                               keep_full=True,
                               sorting=True,
                               shuffle=False)
    else:
        test_batcher = None

    # Save model metadata: the dictionaries and a copy of the configuration.
    try:
        os.makedirs(opt.model)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise

    if char_batch is not None:
        with codecs.open(os.path.join(opt.model, 'char.dic'),
                         'w',
                         encoding='utf-8') as fpo:
            for ch, i in char_batch.mapping.items():
                print('{0}\t{1}'.format(ch, i), file=fpo)

    if word_batch is not None:
        with codecs.open(os.path.join(opt.model, 'word.dic'),
                         'w',
                         encoding='utf-8') as fpo:
            for w, i in word_batch.mapping.items():
                print('{0}\t{1}'.format(w, i), file=fpo)

    with codecs.open(os.path.join(opt.model, 'vocab.dic'),
                     'w',
                     encoding='utf-8') as fpo:
        for w, i in vocab_batch.mapping.items():
            print('{0}\t{1}'.format(w, i), file=fpo)

    new_config_path = os.path.join(opt.model,
                                   os.path.basename(opt.config_path))
    shutil.copy(opt.config_path, new_config_path)
    opt.config_path = new_config_path
    with codecs.open(os.path.join(opt.model, 'config.json'),
                     'w',
                     encoding='utf-8') as fout:
        json.dump(vars(opt), fout)

    # Reuse `c` for the optimizer section of the config.
    c = conf['optimizer']

    optimizer_name = c['type'].lower()
    params = filter(lambda param: param.requires_grad, model.parameters())
    if optimizer_name == 'adamax':
        optimizer = torch.optim.Adamax(params,
                                       lr=c.get('lr', 2e-3),
                                       betas=c.get('betas', (0.9, 0.999)),
                                       eps=c.get('eps', 1e-8))
    elif optimizer_name == 'sgd':
        optimizer = torch.optim.SGD(params,
                                    lr=c.get('lr', 0.01),
                                    momentum=c.get('momentum', 0),
                                    nesterov=c.get('nesterov', False))
    elif optimizer_name == 'dense_sparse_adam' or optimizer_name == 'adam':
        optimizer = DenseSparseAdam(params,
                                    lr=c.get('lr', 1e-3),
                                    betas=c.get('betas', (0.9, 0.999)),
                                    eps=c.get('eps', 1e-8))
    else:
        raise ValueError('Unknown optimizer name: {0}'.format(optimizer_name))

    if use_fp16:
        optimizer = FP16_Optimizer(optimizer,
                                   static_loss_scale=1.,
                                   dynamic_loss_scale=True,
                                   dynamic_loss_args={'init_scale': 2**16})

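    # Pick a learning-rate schedule: 'cosine' uses warm restarts, 'dev_perf'
    # reduces the LR when the monitored validation metric plateaus, and 'noam'
    # applies Transformer-style warmup followed by decay; any other value
    # disables scheduling.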
    scheduler_name = c.get('scheduler', 'noam')
    if scheduler_name == 'cosine':
        scheduler = CosineWithRestarts(optimizer,
                                       c['max_step'],
                                       eta_min=c.get('eta_min', 0.0))
    elif scheduler_name == 'dev_perf':
        scheduler = LearningRateScheduler.by_name('reduce_on_plateau')(
            optimizer,
            factor=c.get('decay_rate', 0.5),
            patience=c.get('patience', 5),
            min_lr=c.get('lr_min', 1e-6))
    elif scheduler_name == 'noam':
        scheduler = NoamLR(optimizer,
                           model_size=c.get('model_size', 512),
                           warmup_steps=c.get('warmup_step', 6000))
    else:
        scheduler = None

    best_train, best_valid, test_result = 1e8, 1e8, 1e8
    for epoch in range(opt.max_epoch):
        best_train, best_valid, test_result, improved = train_model(
            epoch, conf, opt, model, optimizer, scheduler, training_batcher,
            valid_batcher, test_batcher, best_train, best_valid, test_result)

    if raw_valid_data is None:
        logger.info("best train ppl: {:.6f}.".format(best_train))
    elif raw_test_data is None:
        logger.info("best train ppl: {:.6f}, best valid ppl: {:.6f}.".format(
            best_train, best_valid))
    else:
        logger.info(
            "best train ppl: {:.6f}, best valid ppl: {:.6f}, test ppl: {:.6f}."
            .format(best_train, best_valid, test_result))
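
The train() function expects --config_path to point at a JSON file with token_embedder, classifier, and optimizer sections. Below is a hypothetical minimal config, reconstructed only from the keys this function reads; every value is an illustrative assumption rather than a documented default, and the Model class may require additional sections not shown here.

import json

# Hypothetical config for train(); keys mirror the conf[...] lookups above,
# values are illustrative assumptions, not documented defaults.
conf = {
    "token_embedder": {
        "name": "cnn",
        "word_dim": 100,
        "char_dim": 50,
        "filters": [[1, 32], [2, 32], [3, 64]],   # (width, number) pairs
        "max_characters_per_token": 50,
        "add_sentence_boundary_ids": False,
        "add_word_boundary_ids": False,
    },
    "classifier": {
        "vocab_cased": True,
        "vocab_digit_normalized": False,
    },
    "optimizer": {
        "type": "adam",
        "lr": 1e-3,
        "fp16": False,
        "scheduler": "noam",
        "model_size": 512,
        "warmup_step": 6000,
    },
}

with open("config.json", "w") as fout:
    json.dump(conf, fout, indent=2)

Under the same assumption that argv[1] selects the sub-command, a run might then look like: python main.py train --config_path config.json --train_path train.txt --vocab_path vocab.txt --model output_dir (main.py and the file names are placeholders).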