def get_model(args):
    ntokens = args.data_size
    concat_pools = [args.concat_max, args.concat_min, args.concat_mean]
    if args.model.lower() == 'transformer':
        model = TransformerFeaturizer(False, args)
    else:
        model = RNNFeaturizer(args.model, ntokens, args.emsize, args.nhid, args.nlayers,
                                          0.0, args.all_layers, concat_pools, residuals=args.residuals)
    if args.cuda:
        model.cuda()

    if args.fp16:
        model.half()

    if args.load is not None and args.load != '':
        # load char embedding and recurrent encoder for featurization
        with open(args.load, 'rb') as f:
            sd = torch.load(f)
            if 'sd' in sd:
                sd = sd['sd']
            if 'lm_encoder' in sd:
                sd = sd['lm_encoder']
        try:
            model.load_state_dict(sd)
        except RuntimeError:
            # if the state dict has weight-normalized parameters, apply weight norm
            # to the model before loading and remove it afterwards
            if hasattr(model, 'rnn'):
                apply_weight_norm(model.rnn)
            else:
                apply_weight_norm(model)
            model.load_state_dict(sd)
            remove_weight_norm(model)

    return model
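
# Illustrative sketch, not from the original repo: the try/except fallback above exists
# because a checkpoint saved under weight normalization stores 'weight_g'/'weight_v'
# parameters instead of 'weight'. The same round trip with the stock torch.nn.utils
# helpers on a toy module looks like this:
import torch.nn as nn

_saved = nn.utils.weight_norm(nn.Linear(8, 8))   # parameters become weight_g / weight_v
_sd = _saved.state_dict()

_fresh = nn.Linear(8, 8)
try:
    _fresh.load_state_dict(_sd)                  # fails with a key mismatch (RuntimeError)
except RuntimeError:
    nn.utils.weight_norm(_fresh)                 # match the parameterization first
    _fresh.load_state_dict(_sd)
    nn.utils.remove_weight_norm(_fresh)          # fold back into a plain 'weight'
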
    def apply_weight_norm(self):
        """applies weight norm to all module parameters"""
        # if lstm_only, apply weight norm only to the linear gates of the lstm
        if self.lstm_only:
            apply_weight_norm(self.module.rnn.layers)
        else:
            apply_weight_norm(self.module)
        self.weight_norm = True
Example #3
def get_model(args):

    sd = None
    model_args = args
    if args.load is not None and args.load != '':
        sd = torch.load(args.load)
        if 'args' in sd:
            model_args = sd['args']
        if 'sd' in sd:
            sd = sd['sd']

    ntokens = model_args.data_size
    concat_pools = model_args.concat_max, model_args.concat_min, model_args.concat_mean
    if args.model == 'transformer':
        model = SentimentClassifier(model_args.model, ntokens, None, None,
                                    None, model_args.classifier_hidden_layers,
                                    model_args.classifier_dropout, None,
                                    concat_pools, False, model_args)
    else:
        model = SentimentClassifier(
            model_args.model, ntokens, model_args.emsize, model_args.nhid,
            model_args.nlayers, model_args.classifier_hidden_layers,
            model_args.classifier_dropout, model_args.all_layers, concat_pools,
            False, model_args)
    args.heads_per_class = model_args.heads_per_class
    args.use_softmax = model_args.use_softmax
    try:
        args.classes = list(model_args.classes)
    except (AttributeError, TypeError):
        args.classes = [args.label_key]

    try:
        args.dual_thresh = model_args.dual_thresh and not model_args.joint_binary_train
    except AttributeError:
        args.dual_thresh = False

    if args.cuda:
        model.cuda()

    if args.fp16:
        model.half()

    if sd is not None:
        try:
            model.load_state_dict(sd)
        except RuntimeError:
            # if the state dict has weight-normalized parameters, apply weight norm
            # to the model before loading and remove it afterwards
            if hasattr(model.lm_encoder, 'rnn'):
                apply_weight_norm(model.lm_encoder.rnn)
            else:
                apply_weight_norm(model.lm_encoder)
            model.lm_encoder.load_state_dict(sd)
            remove_weight_norm(model)

    if args.neurons > 0:
        print('WARNING. Setting neurons %s' % str(args.neurons))
        model.set_neurons(args.neurons)
    return model
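
# Illustrative sketch, not from the original repo: both loaders above accept either a
# bare state dict or a checkpoint dict that wraps it under 'sd' (optionally with the
# training 'args' alongside). A compatible checkpoint can be written and unwrapped
# like this ('ckpt.pt' and the saved fields are hypothetical):
import torch
import torch.nn as nn

_net = nn.Linear(4, 2)
torch.save({'sd': _net.state_dict(), 'args': {'emsize': 4, 'nhid': 2}}, 'ckpt.pt')

_ckpt = torch.load('ckpt.pt', map_location='cpu')
_model_args = _ckpt['args'] if 'args' in _ckpt else None
_state = _ckpt['sd'] if 'sd' in _ckpt else _ckpt
_net.load_state_dict(_state)
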
Example #4
    def apply_weight_norm(self):
        """applies weight norm to all module parameters"""
        # if lstm_only, apply weight norm only to the linear gates of the lstm
        if self.lstm_only:
            for m in self.module.rnn.layers:
                apply_weight_norm(m, hook_child=False)
        else:
            apply_weight_norm(self.module, hook_child=False)
        self.weight_norm = True
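
# Illustrative sketch, not from the original repo: the lstm_only branch above applies
# weight norm to each recurrent layer separately instead of to the whole wrapped
# module. With the stock torch.nn.utils helper, the per-layer form looks like this
# (nn.Linear layers stand in for the repo's rnn.layers):
import torch.nn as nn

_layers = nn.ModuleList([nn.Linear(16, 16) for _ in range(2)])
for _m in _layers:
    nn.utils.weight_norm(_m)    # reparameterize each layer's 'weight' in place
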
def get_model_and_optim(args, train_data):
    if args.use_softmax:
        args.report_no_thresholding = True
    ntokens = args.data_size
    concat_pools = args.concat_max, args.concat_min, args.concat_mean
    if args.model == 'transformer':
        model = M.SentimentClassifier(
            model_type=args.model,
            ntoken=ntokens,
            ninp=None,
            nhid=None,
            nlayers=None,
            classifier_hidden_layers=args.classifier_hidden_layers,
            dropout=args.classifier_dropout,
            all_layers=None,
            concat_pools=concat_pools,
            get_lm_out=args.aux_lm_loss,
            args=args,
        )
    else:
        model = M.SentimentClassifier(
            model_type=args.model,
            ntoken=ntokens,
            ninp=args.emsize,
            nhid=args.nhid,
            nlayers=args.nlayers,
            classifier_hidden_layers=args.classifier_hidden_layers,
            dropout=args.classifier_dropout,
            all_layers=args.all_layers,
            concat_pools=concat_pools,
            get_lm_out=args.aux_lm_loss,
            args=args,
        )
    if args.cuda:
        model.cuda()

    if args.fp16:
        model.half()
    # load char embedding and recurrent encoder for featurization
    if args.load is not None and args.load != '':
        with open(args.load, 'rb') as f:
            sd = torch.load(f, map_location='cpu')
            if 'sd' in sd:
                sd = sd['sd']

        if not args.load_finetuned:
            if 'lm_encoder' in sd:
                sd = sd['lm_encoder']
            try:
                model.lm_encoder.load_state_dict(sd)
            except RuntimeError:
                # if the state dict has weight-normalized parameters, apply weight
                # norm to the model before loading and remove it afterwards
                if hasattr(model.lm_encoder, 'rnn'):
                    apply_weight_norm(model.lm_encoder.rnn)
                else:
                    apply_weight_norm(model.lm_encoder)
                model.lm_encoder.load_state_dict(sd)
                remove_weight_norm(model)
        else:
            model.load_state_dict(sd)

    if args.thresh_test_preds:
        model.set_thresholds(
            pd.read_csv(args.thresh_test_preds,
                        header=None).values.squeeze(), args.double_thresh,
            args.dual_thresh and not args.joint_binary_train)

    optims = {'adam': torch.optim.Adam, 'sgd': torch.optim.SGD}

    optim = optims[args.optim.lower()](model.parameters(), lr=args.lr)
    iters_per_epoch = len(train_data)
    num_iters = iters_per_epoch * args.epochs

    assert not (args.stlr_cut_frac and args.cos_cut_frac)
    if args.stlr_cut_frac is not None:
        LR = SlantedTriangularLR(optim,
                                 max_val=args.lr,
                                 cut_frac=args.stlr_cut_frac,
                                 num_iters=num_iters)
    elif args.cos_cut_frac is not None:
        LR = AnnealingLR(optim,
                         start_lr=args.lr,
                         warmup_iter=int(args.cos_cut_frac * num_iters),
                         num_iters=num_iters,
                         decay_style='cosine')
    elif args.decay_style is not None:
        warmup_iters = int(args.warmup_epochs * iters_per_epoch)
        if args.decay_epochs == -1:
            decay_iters = int(args.epochs * iters_per_epoch)
        else:
            decay_iters = int(args.decay_epochs * iters_per_epoch)
        if args.decay_style in ('constant', 'linear', 'cosine'):
            # TODO: 'constant' and 'linear' decay are not implemented yet; they
            # currently fall through to the generic AnnealingLR schedule
            LR = AnnealingLR(optim,
                             start_lr=args.lr,
                             warmup_iter=warmup_iters,
                             num_iters=decay_iters + warmup_iters,
                             decay_style=args.decay_style)
        else:
            # TODO: 'exponential' decay is not implemented; fall back to a constant LR
            LR = ConstantLR(optim, lr=args.lr)
    else:
        LR = ConstantLR(optim, lr=args.lr)
    return model, optim, LR
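
# Illustrative sketch, not from the original repo: SlantedTriangularLR, AnnealingLR and
# ConstantLR are repo-specific schedulers. A comparable slanted-triangular schedule
# (linear warm-up over cut_frac of training, then linear decay) can be approximated
# with the stock torch.optim.lr_scheduler.LambdaLR; the sizes below are made up:
import torch
import torch.nn as nn

_optim = torch.optim.Adam(nn.Linear(4, 4).parameters(), lr=1e-3)
_num_iters, _cut_frac = 1000, 0.1
_cut = max(1, int(_num_iters * _cut_frac))

def _stlr(step):
    # scale rises linearly to 1.0 over the first _cut steps, then decays linearly to 0
    if step < _cut:
        return step / _cut
    return max(0.0, (_num_iters - step) / (_num_iters - _cut))

_sched = torch.optim.lr_scheduler.LambdaLR(_optim, lr_lambda=_stlr)
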
Example #6
def setup_model_and_optim(args, train_data, tokenizer):
    ntokens = args.data_size
    if args.model.lower() == 'transformer':
        embed_tokens = m.Embedding(
            ntokens,
            args.decoder_embed_dim,
            padding_idx=tokenizer.command_name_map['pad'].Id)
        model = m.TransformerModel(m.DecoderPreprocessor(args, embed_tokens),
                                   m.TransformerDecoder(args, embed_tokens))
    else:
        model = m.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                           args.nlayers, args.dropout, args.tied)
        global rnn_model
        rnn_model = model
    LR = LR_Warmer = None  # guard the return values when train_data is None
    print('* number of parameters: %d' %
          sum([p.nelement() for p in model.parameters()]))
    if args.cuda:
        model.cuda()

    optim = None
    if args.load is not None and args.load != '':
        sd = torch.load(args.load, map_location='cpu')
        if args.load_optim:
            # restore the optimizer state saved alongside the model checkpoint
            optim_sd = torch.load(os.path.join(os.path.dirname(args.load), 'optim.pt'),
                                  map_location='cpu')
            rng = torch.load(os.path.join(os.path.dirname(args.load),
                                          'rng.pt'))
            torch.cuda.set_rng_state(rng[0])
            torch.set_rng_state(rng[1])
        try:
            model.load_state_dict(sd)
        except RuntimeError:
            if hasattr(model, 'rnn'):
                apply_weight_norm(model.rnn, hook_child=False)
            else:
                apply_weight_norm(model, hook_child=False)
            model.load_state_dict(sd)
            remove_weight_norm(model)

    if not args.no_weight_norm:
        if hasattr(model, 'rnn'):
            apply_weight_norm(model.rnn, hook_child=False)
        else:
            apply_weight_norm(model, hook_child=False)

    if optim is None:
        optim_choice = 'Adam' if args.stlr_cut_frac else args.optim  # STLR schedule defaults to Adam
        if args.fp16:
            model = FP16_Module(model)
            optim = getattr(torch.optim, optim_choice)(model.parameters(),
                                                       lr=args.lr)
            optim = FP16_Optimizer(optim,
                                   static_loss_scale=args.loss_scale,
                                   dynamic_loss_scale=args.dynamic_loss_scale)
        else:
            optim = getattr(torch.optim, optim_choice)(model.parameters(),
                                                       lr=args.lr)

    if args.load_optim:
        optim.load_state_dict(optim_sd)

    # add learning rate scheduler
    if train_data is not None:
        if args.constant_decay:
            num_iters = args.constant_decay
        else:
            num_iters = args.train_iters * args.epochs

        init_step = -1
        if args.load_optim:
            #TODO: this no longer makes sense given the new data loaders
            init_step = optim_sd['iter'] - optim_sd['skipped_iter']
            train_data.batch_sampler.start_iter = (optim_sd['iter'] %
                                                   len(train_data)) + 1

        warmup_iter = args.warmup * num_iters

        if args.stlr_cut_frac is not None:
            LR = SlantedTriangularLR(optim,
                                     cut_frac=args.stlr_cut_frac,
                                     num_iters=num_iters)
        else:
            LR = AnnealingLR(optim,
                             start_lr=args.lr,
                             warmup_iter=warmup_iter,
                             num_iters=num_iters,
                             decay_style=args.decay_style)

        if args.warmup != 0:
            LR_Warmer = WarmupLR(optim, warmup_iter, last_iter=init_step)

    # wrap model for distributed training
    if args.world_size > 1:
        model = DDP(model)

    criterion = nn.CrossEntropyLoss(reduction='none')
    return model, optim, LR, LR_Warmer, criterion
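
# Illustrative sketch, not from the original repo: with reduction='none' the criterion
# above returns a per-token loss, which the caller is expected to mask and average
# itself, e.g.:
import torch
import torch.nn as nn

_criterion = nn.CrossEntropyLoss(reduction='none')
_logits = torch.randn(8, 256)                 # (tokens, vocab)
_targets = torch.randint(0, 256, (8,))
_mask = torch.ones(8)                         # 1 for real tokens, 0 for padding
_loss = (_criterion(_logits, _targets) * _mask).sum() / _mask.sum()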