def get_model(args):
    sd = None
    model_args = args
    if args.load is not None and args.load != '':
        # sd = torch.load(args.load, map_location=lambda storage, location: 'cpu')
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        sd = torch.load(args.load, map_location=device)
        if 'args' in sd:
            model_args = sd['args']
        if 'sd' in sd:
            sd = sd['sd']

    ntokens = model_args.data_size
    concat_pools = model_args.concat_max, model_args.concat_min, model_args.concat_mean
    if args.model == 'transformer':
        model = SentimentClassifier(model_args.model, ntokens, None, None, None,
                                    model_args.classifier_hidden_layers, model_args.classifier_dropout,
                                    None, concat_pools, False, model_args)
    else:
        model = SentimentClassifier(model_args.model, ntokens, model_args.emsize, model_args.nhid,
                                    model_args.nlayers, model_args.classifier_hidden_layers,
                                    model_args.classifier_dropout, model_args.all_layers,
                                    concat_pools, False, model_args)
    args.heads_per_class = model_args.heads_per_class
    args.use_softmax = model_args.use_softmax
    try:
        args.classes = list(model_args.classes)
    except:
        args.classes = [args.label_key]

    try:
        args.dual_thresh = model_args.dual_thresh and not model_args.joint_binary_train
    except:
        args.dual_thresh = False

    if args.cuda:
        model.cuda()

    if args.fp16:
        model.half()

    if sd is not None:
        try:
            model.load_state_dict(sd)
        except:
            # if state dict has weight normalized parameters apply and remove weight norm to model while loading sd
            if hasattr(model.lm_encoder, 'rnn'):
                apply_weight_norm(model.lm_encoder.rnn)
            else:
                apply_weight_norm(model.lm_encoder)
            model.lm_encoder.load_state_dict(sd)
            remove_weight_norm(model)

    if args.neurons > 0:
        print('WARNING. Setting neurons %s' % str(args.neurons))
        model.set_neurons(args.neurons)
    return model
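# A minimal sketch of the weight-norm fallback used in get_model above, written
# against the standard torch.nn.utils API rather than the repo's
# apply_weight_norm / remove_weight_norm helpers, and shown for a single
# nn.Linear layer only. It illustrates why the try/except exists: a checkpoint
# saved with weight norm applied stores weight_g / weight_v keys instead of weight.
import torch
import torch.nn as nn
from torch.nn.utils import weight_norm, remove_weight_norm as strip_weight_norm

def load_linear_checkpoint(layer: nn.Linear, state_dict):
    """Load weights whether or not the checkpoint was saved with weight norm applied."""
    try:
        layer.load_state_dict(state_dict)
    except RuntimeError:
        # Re-parameterize the fresh layer so its keys match the checkpoint,
        # load, then fold the normalization back into a plain .weight tensor.
        weight_norm(layer, name='weight')
        layer.load_state_dict(state_dict)
        strip_weight_norm(layer, name='weight')
    return layer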
def get_model(args):
    ntokens = args.data_size
    concat_pools = [args.concat_max, args.concat_min, args.concat_mean]
    if args.model.lower() == 'transformer':
        model = TransformerFeaturizer(False, args)
    else:
        model = RNNFeaturizer(args.model, ntokens, args.emsize, args.nhid, args.nlayers,
                              0.0, args.all_layers, concat_pools, residuals=args.residuals)
    if args.cuda:
        model.cuda()

    if args.fp16:
        model.half()

    if args.load is not None and args.load != '':
        # load char embedding and recurrent encoder for featurization
        with open(args.load, 'rb') as f:
            sd = torch.load(f)
            if 'sd' in sd:
                sd = sd['sd']

        if 'lm_encoder' in sd:
            sd = sd['lm_encoder']

        try:
            model.load_state_dict(sd)
        except:
            # if state dict has weight normalized parameters apply and remove weight norm to model while loading sd
            if hasattr(model, 'rnn'):
                apply_weight_norm(model.rnn)
            else:
                apply_weight_norm(model)
            model.load_state_dict(sd)
            remove_weight_norm(model)
    return model
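# Hypothetical call site for the featurizer get_model above. The attribute names
# mirror exactly what the function reads; the values are placeholders rather than
# the repo's defaults, and 'mlstm_checkpoint.pt' is an assumed filename.
from argparse import Namespace

feat_args = Namespace(
    model='mLSTM', data_size=256, emsize=64, nhid=4096, nlayers=1,
    all_layers=False, concat_max=False, concat_min=False, concat_mean=False,
    residuals=False, cuda=False, fp16=False, load='mlstm_checkpoint.pt')
featurizer = get_model(feat_args)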
# if args.temperature < 1e-3:
#     parser.error("--temperature has to be greater or equal 1e-3")

model = model.RNNModel(args.model, args.data_size, args.emsize, args.nhid, args.nlayers,
                       args.dropout, args.tied)
if args.cuda:
    model.cuda()
if args.fp16:
    model.half()

with open(args.load_model, 'rb') as f:
    sd = torch.load(f)

try:
    model.load_state_dict(sd)
except:
    apply_weight_norm(model.rnn)
    model.load_state_dict(sd)
    remove_weight_norm(model)

def get_neuron_and_polarity(sd, neuron):
    """return a +/- 1 indicating the polarity of the specified neuron in the module"""
    if neuron == -1:
        neuron = None
    if 'classifier' in sd:
        sd = sd['classifier']
        if 'weight' in sd:
            weight = sd['weight']
        else:
            return neuron, 1
    else:
        return neuron, 1
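# The docstring above says the function reports the sign of the chosen neuron's
# classifier coefficient. A hedged sketch of that remaining step, assuming
# `weight` is the 1 x n_neurons classifier weight row extracted above; the helper
# name neuron_polarity is illustrative, not taken from the repo.
import torch

def neuron_polarity(weight, neuron=None):
    if neuron is None:
        # fall back to the neuron with the largest absolute coefficient
        neuron = torch.argmax(weight[0].abs()).item()
    polarity = 1 if weight[0][neuron] >= 0 else -1
    return neuron, polarity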
def setup_model_and_optim(args, train_data, tokenizer):
    ntokens = args.data_size
    if args.model.lower() == 'transformer':
        embed_tokens = m.Embedding(ntokens, args.decoder_embed_dim,
                                   padding_idx=tokenizer.command_name_map['pad'].Id)
        model = m.TransformerModel(m.DecoderPreprocessor(args, embed_tokens),
                                   m.TransformerDecoder(args, embed_tokens))
    else:
        model = m.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers,
                           args.dropout, args.tied)
        global rnn_model
        rnn_model = model
    LR_Warmer = None
    print('* number of parameters: %d' % sum([p.nelement() for p in model.parameters()]))
    if args.cuda:
        model.cuda()

    optim = None
    if args.load is not None and args.load != '':
        sd = torch.load(args.load, map_location='cpu')
        if args.load_optim:
            # optim_sd is needed below for optim.load_state_dict and the iteration offsets
            optim_sd = torch.load(os.path.join(os.path.dirname(args.load), 'optim.pt'), map_location='cpu')
            rng = torch.load(os.path.join(os.path.dirname(args.load), 'rng.pt'))
            torch.cuda.set_rng_state(rng[0])
            torch.set_rng_state(rng[1])
        try:
            model.load_state_dict(sd)
        except:
            # if state dict has weight normalized parameters apply and remove weight norm to model while loading sd
            if hasattr(model, 'rnn'):
                apply_weight_norm(model.rnn, hook_child=False)
            else:
                apply_weight_norm(model, hook_child=False)
            model.load_state_dict(sd)
            remove_weight_norm(model)

    if not args.no_weight_norm:
        if hasattr(model, 'rnn'):
            apply_weight_norm(model.rnn, hook_child=False)
        else:
            apply_weight_norm(model, hook_child=False)

    if optim is None:
        optim_choice = 'Adam' if args.stlr_cut_frac else args.optim
        if args.fp16:
            model = FP16_Module(model)
            optim = eval('torch.optim.' + args.optim)(model.parameters(), lr=args.lr)
            optim = FP16_Optimizer(optim,
                                   static_loss_scale=args.loss_scale,
                                   dynamic_loss_scale=args.dynamic_loss_scale)
        else:
            optim = eval('torch.optim.' + args.optim)(model.parameters(), lr=args.lr)

    if args.load_optim:
        optim.load_state_dict(optim_sd)

    # add linear learning rate scheduler
    if train_data is not None:
        if args.constant_decay:
            num_iters = args.constant_decay
        else:
            num_iters = args.train_iters * args.epochs

        init_step = -1
        if args.load_optim:
            #TODO: this no longer makes sense given the new data loaders
            init_step = optim_sd['iter'] - optim_sd['skipped_iter']
            train_data.batch_sampler.start_iter = (optim_sd['iter'] % len(train_data)) + 1

        warmup_iter = args.warmup * num_iters

        if args.stlr_cut_frac is not None:
            LR = SlantedTriangularLR(optim, cut_frac=args.stlr_cut_frac, num_iters=num_iters)
        else:
            LR = AnnealingLR(optim, start_lr=args.lr, warmup_iter=warmup_iter,
                             num_iters=num_iters, decay_style=args.decay_style)

        if args.warmup != 0:
            LR_Warmer = WarmupLR(optim, warmup_iter, last_iter=init_step)

    # wrap model for distributed training
    if args.world_size > 1:
        model = DDP(model)

    criterion = nn.CrossEntropyLoss(reduction='none')
    return model, optim, LR, LR_Warmer, criterion
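# setup_model_and_optim builds the optimizer with eval('torch.optim.' + args.optim).
# A safer equivalent sketch using getattr, assuming args.optim names a class that
# actually exists in torch.optim (e.g. 'Adam' or 'SGD'); build_optimizer is an
# illustrative helper, not part of the repo.
import torch

def build_optimizer(model, optim_name, lr):
    optim_cls = getattr(torch.optim, optim_name)  # e.g. torch.optim.Adam
    return optim_cls(model.parameters(), lr=lr)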
def get_model_and_optim(args, train_data):
    if args.use_softmax:
        args.report_no_thresholding = True
    ntokens = args.data_size
    concat_pools = args.concat_max, args.concat_min, args.concat_mean
    if args.model == 'transformer':
        model = M.SentimentClassifier(args.model, ntokens, None, None, None,
                                      args.classifier_hidden_layers, args.classifier_dropout,
                                      None, concat_pools, args.aux_lm_loss, args)
    else:
        model = M.SentimentClassifier(args.model, ntokens, args.emsize, args.nhid, args.nlayers,
                                      args.classifier_hidden_layers, args.classifier_dropout,
                                      args.all_layers, concat_pools, args.aux_lm_loss, args)
    if args.cuda:
        model.cuda()

    if args.fp16:
        model.half()

    # load char embedding and recurrent encoder for featurization
    if args.load is not None and args.load != '':
        with open(args.load, 'rb') as f:
            sd = torch.load(f, map_location='cpu')
            if 'sd' in sd:
                sd = sd['sd']

        if not args.load_finetuned:
            if 'lm_encoder' in sd:
                sd = sd['lm_encoder']
            try:
                model.lm_encoder.load_state_dict(sd)
            except:
                # if state dict has weight normalized parameters apply and remove weight norm to model while loading sd
                if hasattr(model.lm_encoder, 'rnn'):
                    apply_weight_norm(model.lm_encoder.rnn)
                else:
                    apply_weight_norm(model.lm_encoder)
                model.lm_encoder.load_state_dict(sd)
                remove_weight_norm(model)
        else:
            model.load_state_dict(sd)

    if args.thresh_test_preds:
        model.set_thresholds(pd.read_csv(args.thresh_test_preds, header=None).values.squeeze(),
                             args.double_thresh,
                             args.dual_thresh and not args.joint_binary_train)

    optims = {'adam': 'Adam', 'sgd': 'SGD'}
    optim = eval('torch.optim.' + optims[args.optim.lower()])(model.parameters(), lr=args.lr)

    iters_per_epoch = len(train_data)
    num_iters = iters_per_epoch * args.epochs

    assert not (args.stlr_cut_frac and args.cos_cut_frac)
    if args.stlr_cut_frac is not None:
        LR = SlantedTriangularLR(optim, max_val=args.lr, cut_frac=args.stlr_cut_frac,
                                 num_iters=num_iters)
    elif args.cos_cut_frac is not None:
        LR = AnnealingLR(optim, start_lr=args.lr, warmup_iter=int(args.cos_cut_frac * num_iters),
                         num_iters=num_iters, decay_style='cosine')
    elif args.decay_style is not None:
        warmup_iters = int(args.warmup_epochs * iters_per_epoch)
        if args.decay_epochs == -1:
            decay_iters = int(args.epochs * iters_per_epoch)
        else:
            decay_iters = int(args.decay_epochs * iters_per_epoch)
        if args.decay_style == 'constant':
            #TODO: implement
            LR = AnnealingLR(optim, start_lr=args.lr, warmup_iter=warmup_iters,
                             num_iters=decay_iters + warmup_iters, decay_style=args.decay_style)
        elif args.decay_style == 'linear':
            #TODO: implement
            LR = AnnealingLR(optim, start_lr=args.lr, warmup_iter=warmup_iters,
                             num_iters=decay_iters + warmup_iters, decay_style=args.decay_style)
        elif args.decay_style == 'cosine':
            LR = AnnealingLR(optim, start_lr=args.lr, warmup_iter=warmup_iters,
                             num_iters=decay_iters + warmup_iters, decay_style=args.decay_style)
        elif args.decay_style == 'exponential':
            #TODO: implement
            LR = ConstantLR(optim, lr=args.lr)
        else:
            LR = ConstantLR(optim, lr=args.lr)
    else:
        LR = ConstantLR(optim, lr=args.lr)
    return model, optim, LR
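# Rough analogue of the warmup-then-cosine branch above, built on the standard
# torch.optim.lr_scheduler.LambdaLR instead of the repo's AnnealingLR class;
# warmup_iters and total_iters correspond to warmup_iters and
# decay_iters + warmup_iters in get_model_and_optim.
import math
import torch

def warmup_cosine_scheduler(optim, warmup_iters, total_iters):
    def lr_lambda(step):
        if step < warmup_iters:
            return step / max(1, warmup_iters)                       # linear warmup
        progress = (step - warmup_iters) / max(1, total_iters - warmup_iters)
        return 0.5 * (1.0 + math.cos(math.pi * min(1.0, progress)))  # cosine decay to 0
    return torch.optim.lr_scheduler.LambdaLR(optim, lr_lambda)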
rnn_model = model
optim = None
if args.load != '':
    sd = torch.load(args.load, map_location='cpu')
    if args.load_optim:
        optim_sd = torch.load(os.path.join(os.path.dirname(args.load), 'optim.pt'), map_location='cpu')
        rng = torch.load(os.path.join(os.path.dirname(args.load), 'rng.pt'))
        torch.cuda.set_rng_state(rng[0])
        torch.set_rng_state(rng[1])
    try:
        model.load_state_dict(sd)
    except:
        apply_weight_norm(model.rnn, hook_child=False)
        model.load_state_dict(sd)
        remove_weight_norm(model.rnn)

if not args.no_weight_norm:
    apply_weight_norm(model, 'rnn', hook_child=False)

# create optimizer and fp16 models
if args.fp16:
    model = FP16_Module(model)
    optim = eval('torch.optim.' + args.optim)(model.parameters(), lr=args.lr)
    optim = FP16_Optimizer(optim,
                           static_loss_scale=args.loss_scale,
                           dynamic_loss_scale=args.dynamic_loss_scale)
else:
    optim = eval('torch.optim.' + args.optim)(model.parameters(), lr=args.lr)
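# Sketch of the same mixed-precision idea using torch.cuda.amp in place of the
# repo's FP16_Module / FP16_Optimizer wrappers; GradScaler handles the dynamic
# loss scaling that dynamic_loss_scale enables above. The helper names
# (make_amp_state, amp_step) and arguments are illustrative, not from the repo.
import torch

def make_amp_state(model, lr):
    optim = torch.optim.Adam(model.parameters(), lr=lr)
    scaler = torch.cuda.amp.GradScaler()  # dynamic loss scaling
    return optim, scaler

def amp_step(model, optim, scaler, criterion, inputs, targets):
    optim.zero_grad()
    with torch.cuda.amp.autocast():
        loss = criterion(model(inputs), targets)
    scaler.scale(loss).backward()   # scale the loss to avoid fp16 gradient underflow
    scaler.step(optim)              # unscales gradients, then steps the optimizer
    scaler.update()                 # adjusts the scale factor for the next iteration
    return loss.item()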