def __init__(self, e0: numbers.Real = 0.01, mom: numbers.Real = 0.9, skip_noisy: bool = False) -> None:
    """Configure momentum SGD over the global DyNet parameter collection.

    Args:
        e0: initial learning rate.
        mom: momentum coefficient.
        skip_noisy: forwarded to the base trainer (skip updates with noisy gradients).
    """
    momentum_trainer = dy.MomentumSGDTrainer(
        ParamManager.global_collection(), e0, mom)
    super().__init__(optimizer=momentum_trainer, skip_noisy=skip_noisy)
def _init_optimizer(self, model, **kwargs):
    """Create the DyNet trainer for ``model`` from keyword configuration.

    Recognized kwargs:
        optim: 'sgd' (default), 'adadelta', 'adam', or 'rmsprop'.
        mom: momentum for the SGD path; 0 or None selects plain SGD.
        eta / lr: learning rate ('eta' wins when both given; default 0.01).
        clip: optional gradient-clipping threshold.
        beta1, beta2, epsilon: Adam hyper-parameters.
    """
    mom = kwargs.get('mom', 0.0)
    # BUG FIX: the original called float() on the raw kwarg, so passing
    # mom=None (which the SGD branch below explicitly tested for with
    # `mom is None`) raised a TypeError before that check could ever run.
    # Coerce only non-None values; treat None as "no momentum".
    mom = 0.0 if mom is None else float(mom)
    optim = kwargs.get('optim', 'sgd')
    clip = kwargs.get('clip')
    self.current_lr = kwargs.get('eta', kwargs.get('lr', 0.01))
    if optim == 'adadelta':
        self.optimizer = dy.AdadeltaTrainer(model.pc)
    elif optim == 'adam':
        self.optimizer = dy.AdamTrainer(model.pc,
                                        alpha=self.current_lr,
                                        beta_1=kwargs.get('beta1', 0.9),
                                        beta_2=kwargs.get('beta2', 0.999),
                                        eps=kwargs.get('epsilon', 1e-8))
    elif optim == 'rmsprop':
        self.optimizer = dy.RMSPropTrainer(model.pc,
                                           learning_rate=self.current_lr)
    else:
        if mom == 0:
            self.optimizer = dy.SimpleSGDTrainer(
                model.pc, learning_rate=self.current_lr)
        else:
            logging.info('Using mom %f', mom)
            self.optimizer = dy.MomentumSGDTrainer(
                model.pc, learning_rate=self.current_lr, mom=mom)
    if clip is not None:
        self.optimizer.set_clip_threshold(clip)
    # Dense updates only: keeps behaviour uniform across lookup parameters.
    self.optimizer.set_sparse_updates(False)
def get_trainer(opt, s2s):
    """Build the DyNet trainer named by ``opt.trainer`` for model ``s2s``.

    Unknown names fall back to plain SGD with a warning on stderr.
    The clip threshold ``opt.gradient_clip`` is applied in every case.
    """
    lr = opt.learning_rate
    decay = opt.learning_rate_decay
    name = opt.trainer
    if name == 'sgd':
        trainer = dy.SimpleSGDTrainer(s2s.pc, e0=lr, edecay=decay)
    elif name == 'clr':
        # Cyclical learning rate oscillating between lr/10 and lr.
        trainer = dy.CyclicalSGDTrainer(s2s.pc, e0_min=lr / 10.0,
                                        e0_max=lr, edecay=decay)
    elif name == 'momentum':
        trainer = dy.MomentumSGDTrainer(s2s.pc, e0=lr, edecay=decay)
    elif name == 'rmsprop':
        trainer = dy.RMSPropTrainer(s2s.pc, e0=lr, edecay=decay)
    elif name == 'adam':
        trainer = dy.AdamTrainer(s2s.pc, lr, edecay=decay)
    else:
        print('Trainer name invalid or not provided, using SGD', file=sys.stderr)
        trainer = dy.SimpleSGDTrainer(s2s.pc, e0=lr, edecay=decay)
    trainer.set_clip_threshold(opt.gradient_clip)
    return trainer
def __init__(self, input_dim, hidden_dim, output_dim, learning_rate=0.001):
    """One-hidden-layer network: weights, hidden bias, and a momentum-SGD trainer."""
    self.mdl = dy.Model()
    # Weights and biases: input->hidden, hidden bias, hidden->output.
    self.W1 = self.mdl.add_parameters((hidden_dim, input_dim))
    self.hbias = self.mdl.add_parameters((hidden_dim,))
    self.W2 = self.mdl.add_parameters((output_dim, hidden_dim))
    self.sgd = dy.MomentumSGDTrainer(self.mdl, learning_rate=learning_rate)
def __init__(self, Cemb, character_idx_map, options):
    """Set up the DyNet model, momentum-SGD trainer, and model parameters.

    Note: older DyNet accepted an 'edecay' argument on MomentumSGDTrainer;
    only lr and momentum are passed here.
    """
    collection = dy.Model()
    # Momentum SGD configured from the options dict.
    self.trainer = dy.MomentumSGDTrainer(
        collection, options['lr'], options['momentum'])
    self.params = self.initParams(collection, Cemb, options)
    self.options = options
    self.model = collection
    self.character_idx_map = character_idx_map
    self.known_words = None
def __init__(self, data, opt):
    """Sentence encoder (uni- or bidirectional LSTM/GRU) with attention and
    a scalar MLP scorer, trained with momentum SGD.

    :param data: provides ``w2i`` (word -> index) and ``ext_embeddings``.
    :param opt: hyper-parameters — embedding_size, hidden_size,
        attention_size, encoder_dir ('single'/'bidirectional'),
        encoder_type ('lstm'/'gru').
    """
    self.opt = opt
    self.model = dy.ParameterCollection()
    # Momentum SGD with DyNet's default learning rate / momentum.
    self.trainer = dy.MomentumSGDTrainer(self.model)
    self.w2i = data.w2i
    self.wdims = opt.embedding_size
    self.ldims = opt.hidden_size
    self.attsize = opt.attention_size
    self.ext_embeddings = data.ext_embeddings
    # Model Parameters
    self.wlookup = self.model.add_lookup_parameters(
        (len(self.w2i), self.wdims))
    self.__load_external_embeddings()
    if self.opt.encoder_dir == "single":
        if self.opt.encoder_type == "lstm":
            self.sentence_rnn = [
                dy.VanillaLSTMBuilder(1, self.wdims, self.ldims, self.model)
            ]
        elif self.opt.encoder_type == "gru":
            self.sentence_rnn = [
                dy.GRUBuilder(1, self.wdims, self.ldims, self.model)
            ]
        # Attention parameters sized for unidirectional states (ldims wide).
        self.attention_w = self.model.add_parameters(
            (self.attsize, self.ldims))
        self.attention_b = self.model.add_parameters(self.attsize)
        self.att_context = self.model.add_parameters(self.attsize)
        # MLP produces a single scalar from a (ldims + 2*ldims)-wide input.
        self.mlp_w = self.model.add_parameters(
            (1, self.ldims + 2 * self.ldims))
        self.mlp_b = self.model.add_parameters(1)
    elif self.opt.encoder_dir == "bidirectional":
        if self.opt.encoder_type == "lstm":
            self.sentence_rnn = [
                dy.VanillaLSTMBuilder(1, self.wdims, self.ldims, self.model),
                dy.VanillaLSTMBuilder(1, self.wdims, self.ldims, self.model),
            ]
        elif self.opt.encoder_type == "gru":
            self.sentence_rnn = [
                dy.GRUBuilder(1, self.wdims, self.ldims, self.model),
                dy.GRUBuilder(1, self.wdims, self.ldims, self.model),
            ]
        # Bidirectional states are 2*ldims wide, so all sizes double.
        self.attention_w = self.model.add_parameters(
            (self.attsize, 2 * self.ldims))
        self.attention_b = self.model.add_parameters(self.attsize)
        self.att_context = self.model.add_parameters(self.attsize)
        self.mlp_w = self.model.add_parameters(
            (1, 2 * self.ldims + 4 * self.ldims))
        self.mlp_b = self.model.add_parameters(1)
    # NOTE(review): no else branch — an unrecognized encoder_dir leaves
    # sentence_rnn / attention / MLP parameters undefined.
def __init__(self, Cemb, character_idx_map, options):
    """Initialize the parameter collection, momentum-SGD trainer, and state.

    The learning-rate-decay ('edecay') argument was removed from DyNet's
    MomentumSGDTrainer after v1.0, so only lr and momentum are supplied.
    """
    pc = dy.Model()
    self.trainer = dy.MomentumSGDTrainer(
        pc, options['lr'], options['momentum'])
    # Build all trainable parameters inside the collection.
    self.params = self.initParams(pc, Cemb, options)
    self.options = options
    self.model = pc
    self.character_idx_map = character_idx_map
    self.known_words = None
def __init__(self, params, vocab, label2tag, pretrained_embeddings=None):
    """Build the aspect-extraction network: word embeddings, a dependency
    tree recursive net, an aspect LSTM, three attention components, a
    multi-weight combination layer, an output layer, and the optimizer.

    :param params: hyper-parameter namespace (dims, dropout, optimizer, ...)
    :param vocab: word vocabulary (sized with len())
    :param label2tag: mapping from label indices to aspect tags
    :param pretrained_embeddings: optional pre-trained word vectors
    """
    self.dim_w = params.dim_w
    self.win = params.win                     # context window width
    self.vocab = vocab
    self.n_words = len(self.vocab)
    self.dim_asp = params.dim_asp
    self.dim_y_asp = params.n_asp_tags        # number of output aspect tags
    self.n_steps = params.n_steps
    self.asp_label2tag = label2tag
    self.dropout_asp = params.dropout_asp
    self.dropout = params.dropout
    self.ds_name = params.ds_name
    self.model_name = params.model_name
    self.attention_type = params.attention_type
    self.pc = dy.ParameterCollection()
    # Word embeddings; inputs are windowed, hence win * dim_w below.
    self.Emb = WDEmb(pc=self.pc, n_words=self.n_words, dim_w=self.dim_w,
                     pretrained_embeddings=pretrained_embeddings)
    self.DEP_RecNN = DTreeBuilder(pc=self.pc, n_in=self.win * self.dim_w,
                                  n_out=self.dim_asp,
                                  dropout_rate=self.dropout_asp)
    self.ASP_RNN = dy.LSTMBuilder(1, self.win * self.dim_w, self.dim_asp,
                                  self.pc)
    # Three attention components; F/B/T suffixes presumably stand for
    # forward / backward / tree — TODO confirm against the BiAttention class.
    self.BiAttention_F = BiAttention(pc=self.pc, n_in=self.dim_asp,
                                     n_out=self.dim_asp,
                                     dropout_rate=self.dropout_asp)
    self.BiAttention_B = BiAttention(pc=self.pc, n_in=self.dim_asp,
                                     n_out=self.dim_asp,
                                     dropout_rate=self.dropout_asp)
    self.BiAttention_T = BiAttention(pc=self.pc, n_in=self.dim_asp,
                                     n_out=self.dim_asp,
                                     dropout_rate=self.dropout_asp)
    self.MultiWeightLayer = MultiWeightLayer(pc=self.pc, n_in=self.dim_asp,
                                             n_out=self.dim_asp,
                                             dropout_rate=self.dropout_asp)
    # Final linear projection to the aspect tag space.
    self.ASP_FC = Linear(pc=self.pc, n_in=self.dim_asp,
                         n_out=self.dim_y_asp)
    self.layers = [self.ASP_FC, self.DEP_RecNN, self.BiAttention_F,
                   self.BiAttention_B, self.BiAttention_T,
                   self.MultiWeightLayer]
    # Optimizer selection; note Adam is configured with betas (0.9, 0.9).
    if params.optimizer == 'sgd':
        self.optimizer = dy.SimpleSGDTrainer(self.pc, params.sgd_lr)
    elif params.optimizer == 'momentum':
        self.optimizer = dy.MomentumSGDTrainer(self.pc, 0.01, 0.9)
    elif params.optimizer == 'adam':
        self.optimizer = dy.AdamTrainer(self.pc, 0.001, 0.9, 0.9)
    elif params.optimizer == 'adagrad':
        self.optimizer = dy.AdagradTrainer(self.pc)
    elif params.optimizer == 'adadelta':
        self.optimizer = dy.AdadeltaTrainer(self.pc)
    else:
        raise Exception("Invalid optimizer!!")
def set_trainer(self, optimization):
    """Select the DyNet trainer named by ``optimization``.

    Supported: 'MomentumSGD', 'CyclicalSGD', 'Adam', 'RMSProp';
    any other value falls back to SimpleSGD.
    """
    # BUG FIX: the original used four independent `if` statements, so the
    # trailing `else` paired only with the RMSProp check — every choice
    # except 'RMSProp' fell through to the else and was overwritten by
    # SimpleSGD. An elif chain keeps each selection intact.
    if optimization == 'MomentumSGD':
        self.trainer = dy.MomentumSGDTrainer(
            self.model, learning_rate=self.hp.learning_rate)
    elif optimization == 'CyclicalSGD':
        self.trainer = dy.CyclicalSGDTrainer(
            self.model,
            learning_rate_max=self.hp.learning_rate_max,
            learning_rate_min=self.hp.learning_rate_min)
    elif optimization == 'Adam':
        self.trainer = dy.AdamTrainer(self.model)
    elif optimization == 'RMSProp':
        self.trainer = dy.RMSPropTrainer(self.model)
    else:  # 'SimpleSGD'
        self.trainer = dy.SimpleSGDTrainer(
            self.model, learning_rate=self.hp.learning_rate)
def __init__(self, model, optim='sgd', clip=5, mom=0.9, **kwargs):
    """Wrap ``model`` with a DyNet trainer chosen by ``optim``.

    The learning rate comes from kwargs 'eta' (or 'lr'; default 0.01).
    Gradient clipping at ``clip`` is always enabled. The default path
    is momentum SGD with momentum ``mom``.
    """
    super(ClassifyTrainerDynet, self).__init__()
    self.model = model
    eta = kwargs.get('eta', kwargs.get('lr', 0.01))
    print("Using eta [{:.4f}]".format(eta))
    print("Using optim [{}]".format(optim))
    self.labels = model.labels
    if optim == 'adadelta':
        chosen = dy.AdadeltaTrainer(model.pc)
    elif optim == 'adam':
        chosen = dy.AdamTrainer(model.pc)
    elif optim == 'rmsprop':
        chosen = dy.RMSPropTrainer(model.pc, learning_rate=eta)
    else:
        print("using mom {:.3f}".format(mom))
        chosen = dy.MomentumSGDTrainer(model.pc, learning_rate=eta, mom=mom)
    chosen.set_clip_threshold(clip)
    self.optimizer = chosen
def __init__(self, input_dim, hidden_dim, output_dim, learning_rate=0.001):
    """Simple-RNN model: RNN encoder, normally-initialized output projection,
    momentum-SGD trainer, and L2 regularization weight."""
    self._input_dim = input_dim
    self._hidden_dim = hidden_dim
    self._output_dim = output_dim
    self._learning_rate = learning_rate
    self._l2_param = 0.0006  # L2 regularization coefficient
    self._model = dy.ParameterCollection()
    self._rnn = dy.SimpleRNNBuilder(
        self.LAYERS, self._input_dim, self._hidden_dim, self._model)
    # Output projection hidden -> output.
    self._W = self._model.add_parameters(
        (self._output_dim, self._hidden_dim), init=dy.NormalInitializer())
    self._trainer = dy.MomentumSGDTrainer(
        self._model, learning_rate=self._learning_rate)
    self._init_layers()
def optimizer(model, optim='sgd', eta=0.01, clip=None, mom=0.9, **kwargs):
    """Return a configured DyNet trainer for ``model.pc``.

    A 'lr' kwarg overrides ``eta``. For the default 'sgd' choice, a zero
    or None momentum yields plain SGD, otherwise momentum SGD. Clipping is
    applied when ``clip`` is given; sparse updates are always disabled.
    """
    eta = kwargs.get('lr', eta)
    print('Using eta [{:.4f}]'.format(eta))
    print('Using optim [{}]'.format(optim))
    if optim == 'adadelta':
        trainer = dy.AdadeltaTrainer(model.pc)
    elif optim == 'adam':
        trainer = dy.AdamTrainer(model.pc)
    elif optim == 'rmsprop':
        trainer = dy.RMSPropTrainer(model.pc, learning_rate=eta)
    elif mom == 0 or mom is None:
        trainer = dy.SimpleSGDTrainer(model.pc, learning_rate=eta)
    else:
        print('Using mom {:.3f}'.format(mom))
        trainer = dy.MomentumSGDTrainer(model.pc, learning_rate=eta, mom=mom)
    if clip is not None:
        trainer.set_clip_threshold(clip)
    trainer.set_sparse_updates(False)
    return trainer
def train_model_with_config():
    """Train a beam-search model driven entirely by module-level globals.

    NOTE(review): Python 2 code (print statements, dict.iteritems). Relies
    on globals: cfg, m, train_data, dev_data, test_data, num_train_tokens,
    num_dev_tokens, tb_fs, tb_io, np, random, pprint, the loss_* functions,
    beam_accuracy, train_beam_graph, and cosine_get_lr.
    Resumes from out_folder/checkpoint.json if present (optimizer state is
    not restored) and writes results.json once at the end.
    """
    import research_toolbox.tb_logging as tb_lg
    # Optimizer selection from config.
    if cfg["optimizer_type"] == "sgd":
        trainer = dy.SimpleSGDTrainer(m, cfg["step_size_start"])
    elif cfg["optimizer_type"] == "adam":
        trainer = dy.AdamTrainer(m, cfg["step_size_start"])
    elif cfg["optimizer_type"] == "sgd_mom":
        trainer = dy.MomentumSGDTrainer(m, cfg["step_size_start"])
    else:
        raise ValueError
    trainer.set_sparse_updates(0)

    # restarting from a checkpoint if it exists.
    # optimizer state is not kept.
    ckpt_filepath = cfg["out_folder"] + "/checkpoint.json"
    if tb_fs.file_exists(ckpt_filepath):
        log_d = tb_io.read_jsonfile(ckpt_filepath)
        current_epoch = len(log_d["dev_acc"])
        best_dev_acc = np.max(log_d["dev_acc"])
        m.populate(cfg["out_folder"] + '/model.ckpt')
    else:
        current_epoch = 0
        best_dev_acc = 0.0
        log_d = {
            'dev_acc': [],
            'avg_loss': [],
            'train_tks/sec': [],
            'eval_tks/sec': [],
            'secs_per_epoch': [],
            "lr": []
        }
        if cfg["debug"] or cfg["compute_train_acc"]:
            log_d["train_acc"] = []

    # Loss selection from config.
    if cfg["loss_type"] == "log_neighbors":
        loss_fn = loss_log_neighbors
    elif cfg["loss_type"] == "log_beam":
        loss_fn = loss_log_beam
    elif cfg["loss_type"] == "cost_sensitive_margin_last":
        loss_fn = loss_cost_sensitive_margin_last
    elif cfg["loss_type"] == "margin_last":
        loss_fn = loss_margin_last
    elif cfg["loss_type"] == "perceptron_first":
        loss_fn = loss_perceptron_first
    elif cfg["loss_type"] == "perceptron_last":
        loss_fn = loss_perceptron_last
    elif cfg["loss_type"] == "upper_bound":
        loss_fn = loss_upper_bound
    else:
        raise ValueError

    # Bind beam size / trajectory type once for reuse in the loop.
    cfg_accuracy = lambda data: beam_accuracy(data, cfg["beam_size"])
    cfg_train_graph = lambda e: train_beam_graph(e, cfg["beam_size"], cfg[
        "traj_type"], loss_fn)

    for epoch in range(current_epoch, cfg["num_epochs"]):
        # Per-epoch learning-rate schedule (fixed or cosine).
        if cfg["step_size_schedule_type"] == 'fixed':
            lr = cfg["step_size_start"]
        elif cfg["step_size_schedule_type"] == 'cosine':
            lr = cosine_get_lr(cfg["step_size_start"], cfg["step_size_end"],
                               cfg["num_epochs"], epoch)
        else:
            raise ValueError
        log_d['lr'].append(lr)
        trainer.learning_rate = lr

        acc_loss = 0.0
        random.shuffle(train_data)
        epoch_timer = tb_lg.TimeTracker()
        train_timer = tb_lg.TimeTracker()
        # One backward/update per training example.
        for i, e in enumerate(train_data):
            if i % cfg["print_every_num_examples"] == 0 and i > 0:
                print "Epoch %d - Example %d/%d" % (epoch, i, len(train_data))
            loss = cfg_train_graph(e)
            acc_loss += loss.value()
            loss.backward()
            trainer.update()
        log_d["avg_loss"].append(acc_loss / len(train_data))
        log_d["train_tks/sec"].append(num_train_tokens /
                                      train_timer.time_since_start())

        eval_timer = tb_lg.TimeTracker()
        # log_d['train_acc'].append(accuracy(train_data))
        log_d['dev_acc'].append(cfg_accuracy(dev_data))
        # log_d['test_acc'].append(accuracy(test_data))
        log_d['eval_tks/sec'].append((
            #len(train_data) +
            num_dev_tokens
            # + num_test_tokens
        ) / eval_timer.time_since_start())
        log_d["secs_per_epoch"].append(epoch_timer.time_since_start())
        if cfg["debug"] or cfg["compute_train_acc"]:
            train_acc = cfg_accuracy(train_data)
            print "train_acc: ", train_acc
            log_d["train_acc"].append(train_acc)
        pprint({k: vs[-1] for k, vs in log_d.iteritems()})

        # Keep the best-dev model separately; always checkpoint the latest.
        if best_dev_acc < log_d["dev_acc"][-1]:
            best_dev_acc = log_d["dev_acc"][-1]
            m.save(cfg["out_folder"] + '/best_model.ckpt')
        tb_io.write_jsonfile(log_d, cfg["out_folder"] + "/checkpoint.json")
        m.save(cfg["out_folder"] + '/model.ckpt')

    # Final test evaluation with the best dev model (run only once).
    results_filepath = cfg["out_folder"] + "/results.json"
    if not tb_fs.file_exists(results_filepath):
        m.populate(cfg["out_folder"] + '/best_model.ckpt')
        log_d['test_acc'] = cfg_accuracy(test_data)
        tb_io.write_jsonfile(log_d, cfg["out_folder"] + "/results.json")
model = PyTorchModel() optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) rescale_lr = lambda epoch: 1 / (1 + LEARNING_DECAY_RATE * epoch) scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=rescale_lr) else: model = DyNetModel() optimizer = None if args.opt == 'sgd': optimizer = dy.SimpleSGDTrainer(model.model, learning_rate=LEARNING_RATE) elif args.opt == 'mom': optimizer = dy.MomentumSGDTrainer(model.model, learning_rate=LEARNING_RATE, mom=MOMENTUM) elif args.opt == 'csgd': pass ### optimizer = dy.CyclicalSGDTrainer(model.model, learning_rate=LEARNING_RATE, mom=MOMENTUM) lrmin, lrmax, step, gamma (decay) optimizer.set_clip_threshold(args.clip) prev_best = None if args.train: step = 0 for epoch in range(EPOCHS): random.shuffle(train) # Update learning rate if PYTORCH: scheduler.step()
# the BiLSTM for all the chars, take input of embed dim, and output of the hidden_dim minus the embed_dim because we will concatenate # with output from a separate bilstm of just the word bilstm = BILSTMTransducer(BILSTM_LAYERS, EMBED_DIM, HIDDEN_DIM, model) # a prev-pos lstm. The mlp's will take this as input as well prev_pos_lstm = dy.LSTMBuilder(BILSTM_LAYERS, EMBED_DIM, EMBED_DIM, model) # now the class mlp, it will take input of 2*HIDDEN_DIM (A concatenate of the before and after the word) + EMBED_DIM from the prev-pos # output of 2, unknown\talmud class_mlp = MLP(model, "classmlp", 2 * HIDDEN_DIM + EMBED_DIM, HIDDEN_DIM, 2) # pos mlp, same input but output the size of pos_vocab pos_mlp = MLP(model, 'posmlp', 2 * HIDDEN_DIM + EMBED_DIM, HIDDEN_DIM, pos_vocab.size()) # the trainer trainer = dy.MomentumSGDTrainer(model) print "LOADING" # if we are loading in a model if filename_to_load: model.load(filename_to_load) print "DONE" if train_test: run_network_on_validation(START_EPOCH - 1) pos_conf_matrix.clear() # train! for epoch in range(START_EPOCH, 20): last_loss, last_pos_prec, last_class_prec, last_rough_pos_prec = 0.0, 0.0, 0.0, 0.0 total_loss, total_pos_prec, total_class_prec, total_rough_pos_prec = 0.0, 0.0, 0.0, 0.0
def train_model(model, encoder, decoder, params, train_inputs, train_outputs,
                dev_inputs, dev_outputs, y2int, int2y, epochs, optimization,
                results_file_path, plot, batch_size, eval_after):
    """Batched seq2seq training loop with periodic checkpoint evaluation.

    NOTE(review): Python 2 code (print statements, xrange, list-returning
    zip). Uses module-level ``arguments`` (docopt-style), ``dn`` (DyNet),
    plus read_from_log / compute_batch_loss / checkpoint_eval /
    log_to_file / save_model / save_best_model / common.plot_to_file.
    Returns (model, params, last_epoch, best_train_epoch).
    """
    print 'training...'
    np.random.seed(17)
    random.seed(17)

    # sort training sentences by length in descending order
    train_data = zip(train_inputs, train_outputs)
    train_data.sort(key=lambda t: -len(t[0]))
    train_order = [
        x * batch_size for x in range(len(train_data) / batch_size + 1)
    ]

    # sort dev sentences by length in descending order
    dev_batch_size = 1
    dev_data = zip(dev_inputs, dev_outputs)
    dev_data.sort(key=lambda t: -len(t[0]))
    dev_order = [
        x * dev_batch_size for x in range(len(dev_data) / dev_batch_size + 1)
    ]

    # Trainer selection; unknown names fall back to plain SGD.
    if optimization == 'ADAM':
        trainer = dn.AdamTrainer(
            model
        )  # lam=REGULARIZATION, alpha=LEARNING_RATE, beta_1=0.9, beta_2=0.999, eps=1e-8)
    elif optimization == 'MOMENTUM':
        trainer = dn.MomentumSGDTrainer(model)
    elif optimization == 'SGD':
        trainer = dn.SimpleSGDTrainer(model)
    elif optimization == 'ADAGRAD':
        trainer = dn.AdagradTrainer(model)
    elif optimization == 'ADADELTA':
        trainer = dn.AdadeltaTrainer(model)
    else:
        trainer = dn.SimpleSGDTrainer(model)
    trainer.set_clip_threshold(float(arguments['--grad-clip']))

    seen_examples_count = 0
    total_loss = 0
    best_dev_epoch = 0
    best_train_epoch = 0
    patience = 0
    train_len = len(train_outputs)
    dev_len = len(dev_inputs)
    avg_train_loss = -1
    train_loss_patience = 0
    train_loss_patience_threshold = 99999999
    max_patience = int(arguments['--max-patience'])
    log_path = results_file_path + '_log.txt'
    # Resume bookkeeping (epoch counter, curves) from a previous log if any.
    start_epoch, checkpoints_x, train_loss_y, dev_loss_y, dev_accuracy_y = read_from_log(
        log_path)

    if len(train_loss_y) > 0:
        total_batches = checkpoints_x[-1]
        best_avg_train_loss = max(train_loss_y)
        best_dev_accuracy = max(dev_accuracy_y)
        best_dev_loss = max(dev_loss_y)
    else:
        total_batches = 0
        best_avg_train_loss = 999999
        best_dev_loss = 999999
        best_dev_accuracy = 0

    # progress bar init
    # noinspection PyArgumentList
    widgets = [progressbar.Bar('>'), ' ', progressbar.ETA()]
    train_progress_bar = progressbar.ProgressBar(widgets=widgets,
                                                 maxval=epochs).start()

    for e in xrange(start_epoch, epochs):
        # shuffle the batch start indices in each epoch
        random.shuffle(train_order)
        batches_per_epoch = len(train_order)
        start = time.time()

        # go through batches
        for i, batch_start_index in enumerate(train_order, start=1):
            total_batches += 1
            # get batch examples
            batch_inputs = [
                x[0] for x in train_data[batch_start_index:batch_start_index +
                                         batch_size]
            ]
            batch_outputs = [
                x[1] for x in train_data[batch_start_index:batch_start_index +
                                         batch_size]
            ]
            actual_batch_size = len(batch_inputs)

            # skip empty batches
            if actual_batch_size == 0 or len(batch_inputs[0]) == 0:
                continue

            # compute batch loss
            loss = compute_batch_loss(encoder, decoder, batch_inputs,
                                      batch_outputs, y2int)

            # forward pass
            total_loss += loss.scalar_value()
            loss.backward()

            # update parameters
            trainer.update()
            seen_examples_count += actual_batch_size

            # avg loss per sample
            avg_train_loss = total_loss / float(i * batch_size + e * train_len)

            # start patience counts only after 20 batches
            if avg_train_loss < best_avg_train_loss and total_batches > 20:
                best_avg_train_loss = avg_train_loss
                train_loss_patience = 0
            else:
                train_loss_patience += 1
                if train_loss_patience > train_loss_patience_threshold:
                    print 'train loss patience exceeded: {}'.format(
                        train_loss_patience)
                    return model, params, e, best_train_epoch

            if total_batches % 100 == 0 and total_batches > 0:
                print 'epoch {}: {} batches out of {} ({} examples out of {}) total: {} batches, {} examples. avg \
loss per example: {}'.format(e, i, batches_per_epoch, i * batch_size,
                             train_len, total_batches,
                             total_batches * batch_size, avg_train_loss)

                # print sentences per second
                end = time.time()
                elapsed_seconds = end - start
                print '{} sentences per second'.format(seen_examples_count /
                                                       elapsed_seconds)
                seen_examples_count = 0
                start = time.time()

            # checkpoint
            if total_batches % eval_after == 0:
                print 'starting checkpoint evaluation'
                dev_bleu, dev_loss = checkpoint_eval(
                    encoder, decoder, params, dev_batch_size, dev_data,
                    dev_inputs, dev_len, dev_order, dev_outputs, int2y, y2int,
                    results_file_path=results_file_path)

                log_to_file(log_path, e, total_batches, avg_train_loss,
                            dev_loss, dev_bleu)
                save_model(model, results_file_path, total_batches,
                           models_to_save=int(arguments['--models-to-save']))
                if dev_bleu >= best_dev_accuracy:
                    best_dev_accuracy = dev_bleu
                    best_dev_epoch = e
                    # save best model to disk
                    save_best_model(model, results_file_path)
                    print 'saved new best model'
                    patience = 0
                else:
                    patience += 1

                if dev_loss < best_dev_loss:
                    best_dev_loss = dev_loss

                print 'epoch: {0} train loss: {1:.4f} dev loss: {2:.4f} dev bleu: {3:.4f} \
best dev bleu {4:.4f} (epoch {5}) patience = {6}'.format(
                    e, avg_train_loss, dev_loss, dev_bleu, best_dev_accuracy,
                    best_dev_epoch, patience)

                if patience == max_patience:
                    print 'out of patience after {0} checkpoints'.format(
                        str(e))
                    train_progress_bar.finish()
                    if plot:
                        plt.cla()
                        print 'checkpoint patience exceeded'
                    return model, params, e, best_train_epoch

                # plotting results from checkpoint evaluation
                if plot:
                    train_loss_y.append(avg_train_loss)
                    checkpoints_x.append(total_batches)
                    dev_accuracy_y.append(dev_bleu)
                    dev_loss_y.append(dev_loss)
                    y_vals = [('train_loss', train_loss_y),
                              ('dev loss', dev_loss_y),
                              ('dev_bleu', dev_accuracy_y)]
                    common.plot_to_file(y_vals, x_name='total batches',
                                        x_vals=checkpoints_x,
                                        file_path=results_file_path +
                                        '_learning_curve.png')

        # update progress bar after completing epoch
        train_progress_bar.update(e)

    # update progress bar after completing training
    train_progress_bar.finish()
    if plot:
        # clear plot when done
        plt.cla()
    print 'finished training. average loss: {} best epoch on dev: {} best epoch on train: {}'.format(
        str(avg_train_loss), best_dev_epoch, best_train_epoch)
    return model, params, e, best_train_epoch
def train(self, training_instances, dev_instances=None, num_epochs=60,
          batch_size=20, learning_rate=0.01, learning_rate_decay=0.9,
          dropout=0.2, clip_norm=None, save_path=None, logger=None,
          debug=False):
    """Train the tagger with momentum SGD, decaying the learning rate each
    epoch and saving the model whenever the dev F1 improves.

    If no dev set is given, the first 10% of the training data is used.
    ``debug`` truncates both sets and trains only 2 epochs.
    """
    if dev_instances is None:
        dev_instances = training_instances[:int(len(training_instances) * 0.1)]
    if type(training_instances) is TSVCorpus:
        training_instances = training_instances.sentences
    if type(dev_instances) is TSVCorpus:
        dev_instances = dev_instances.sentences
    if debug:
        training_instances = training_instances[:200]
        dev_instances = dev_instances[:100]
        num_epochs = 2
    # Momentum SGD with fixed momentum 0.9.
    trainer = dy.MomentumSGDTrainer(self.model, learning_rate, 0.9)
    # BUG FIX: clip_norm defaults to None, and `None > 0` is a TypeError on
    # Python 3 — guard against None before the numeric comparison.
    if clip_norm is not None and clip_norm > 0:
        trainer.set_clip_threshold(clip_norm)
    if logger:
        # logger.info("Training Algorithm: {}".format(type(trainer)))
        logger.info("# training instances: {}".format(len(training_instances)))
        logger.info("# dev instances: {}".format(len(dev_instances)))
    training_total_tokens = 0
    best_f1 = 0.
    for epoch in range(num_epochs):
        if logger:
            logger.info("Epoch {} out of {}".format(epoch + 1, num_epochs))
        random.shuffle(training_instances)
        train_loss = 0.0
        train_total_instance = 0  # size of trained instances
        if dropout > 0:
            self.set_dropout(dropout)
        nbatches = (len(training_instances) + batch_size - 1) // batch_size
        bar = utils.Progbar(target=nbatches)
        for batch_id, batch in enumerate(
                utils.minibatches(training_instances, batch_size)):
            for instance in batch:
                train_total_instance += 1
                loss_expr = self.neg_log_loss(instance.words, instance.tags)
                # Forward pass
                loss = loss_expr.scalar_value()
                # Backward pass
                loss_expr.backward()
                # Bail if loss is NaN
                if math.isnan(loss):
                    assert False, "NaN occured"
                train_loss += loss
                training_total_tokens += len(instance.words)
                trainer.update()
            if batch_size == 1 and batch_id % 10 != 0 \
                    and batch_id + 1 != train_total_instance:
                # online learning, don't print too often
                continue
            bar.update(batch_id + 1,
                       exact=[("train loss", train_loss / train_total_instance)])
        # Decay the learning rate once per epoch.
        trainer.learning_rate *= learning_rate_decay
        f1 = self.evaluate(dev_instances)[-1]
        if f1 > best_f1:
            best_f1 = f1
            if logger:
                logger.info('%.2f%% - new best dev score' % f1)
            if save_path:
                self.save(save_path)
        else:
            if logger:
                logger.info('%.2f%%' % f1)
# --- Top-level script: CLI flags, embedding/vocab setup, train or load ---
# NOTE(review): relies on names defined earlier in the file (parser, group,
# Meta, read, get_cc, Tagger, eval helper, train_tagger, dy, np, random,
# pickle, Word2Vec).
parser.add_argument('--evec', type=int)
group.add_argument('--save-model', dest='save_model')
group.add_argument('--load-model', dest='load_model')
args = parser.parse_args()

# Seed both RNGs for reproducibility.
np.random.seed(args.seed)
random.seed(args.seed)

meta = Meta()
if args.dev:
    dev = read(args.dev)
if not args.load_model:
    train = read(args.train)
    # NOTE(review): load_word2vec_format on Word2Vec is the pre-1.0 gensim
    # API; `syn0` is likewise deprecated in newer gensim.
    wvm = Word2Vec.load_word2vec_format(args.embd, binary=args.evec)
    meta.w_dim = wvm.syn0.shape[1]
    meta.n_words = wvm.syn0.shape[0] + meta.add_words
    get_cc(train)
    # Word -> index map, offset by add_words (reserved slots).
    meta.w2i = {}
    for w in wvm.vocab:
        meta.w2i[w] = wvm.vocab[w].index + meta.add_words
    if args.save_model:
        pickle.dump(meta, open('%s.meta' % args.save_model, 'wb'))
if args.load_model:
    tagger = Tagger(model=args.load_model)
    # NOTE(review): calls a project-local `eval` helper that shadows the
    # builtin — presumably evaluates on the dev set.
    eval(dev)
else:
    tagger = Tagger(meta=meta)
    trainer = dy.MomentumSGDTrainer(tagger.model)
    train_tagger(train)
def __init__(self, exp_global=Ref(Path("exp_global")), e0=0.01, mom=0.9):
    """Momentum-SGD trainer over the experiment-global DyNet parameters.

    Args:
        exp_global: reference to the experiment-global object that holds
            the DyNet parameter collection.
        e0: initial learning rate.
        mom: momentum coefficient.
    """
    collection = exp_global.dynet_param_collection.param_col
    self.optimizer = dy.MomentumSGDTrainer(collection, e0, mom)
def __init__(self, word_count, tag_count, word_dims, tag_dims, lstm_units,
             hidden_units, struct_out, label_out, droprate=0, struct_spans=4,
             label_spans=3, optimizer=1):
    """Span-based parser network: word+tag embeddings feed two stacked
    LSTM pairs (forward/backward), with separate hidden->output heads for
    structural and label decisions.

    ``optimizer`` selects the DyNet trainer:
        1=SimpleSGD (default), 2=MomentumSGD, 3=Adagrad, 4=RMSProp, 5=Adam.
    """
    self.word_count = word_count
    self.tag_count = tag_count
    self.word_dims = word_dims
    self.tag_dims = tag_dims
    self.lstm_units = lstm_units
    self.hidden_units = hidden_units
    self.struct_out = struct_out
    self.label_out = label_out

    self.droprate = droprate

    self.model = dynet.Model()

    if optimizer == 1:
        self.trainer = dynet.SimpleSGDTrainer(self.model)
    elif optimizer == 2:
        self.trainer = dynet.MomentumSGDTrainer(self.model)
    elif optimizer == 3:
        self.trainer = dynet.AdagradTrainer(self.model, learning_rate=0.01,
                                            eps=0.001)
    elif optimizer == 4:
        self.trainer = dynet.RMSPropTrainer(self.model)
    elif optimizer == 5:
        self.trainer = dynet.AdamTrainer(self.model)
    # NOTE(review): any other optimizer value leaves self.trainer unset.

    random.seed(1)

    self.activation = dynet.rectify

    self.word_embed = self.model.add_lookup_parameters(
        (word_count, word_dims), )
    self.tag_embed = self.model.add_lookup_parameters(
        (tag_count, tag_dims), )

    # Layer 1 consumes word+tag embeddings; layer 2 consumes the
    # 2*lstm_units-wide concatenation of layer-1 states.
    self.fwd_lstm1 = LSTM(word_dims + tag_dims, lstm_units, self.model)
    self.back_lstm1 = LSTM(word_dims + tag_dims, lstm_units, self.model)

    self.fwd_lstm2 = LSTM(2 * lstm_units, lstm_units, self.model)
    self.back_lstm2 = LSTM(2 * lstm_units, lstm_units, self.model)

    # Structural head: hidden layer over 4*struct_spans*lstm_units features.
    self.struct_hidden_W = self.model.add_parameters(
        (hidden_units, 4 * struct_spans * lstm_units),
        dynet.UniformInitializer(0.01),
    )
    self.struct_hidden_b = self.model.add_parameters(
        (hidden_units, ),
        dynet.ConstInitializer(0),
    )
    self.struct_output_W = self.model.add_parameters(
        (struct_out, hidden_units),
        dynet.ConstInitializer(0),
    )
    self.struct_output_b = self.model.add_parameters(
        (struct_out, ),
        dynet.ConstInitializer(0),
    )

    # Label head: same shape pattern over 4*label_spans*lstm_units features.
    self.label_hidden_W = self.model.add_parameters(
        (hidden_units, 4 * label_spans * lstm_units),
        dynet.UniformInitializer(0.01),
    )
    self.label_hidden_b = self.model.add_parameters(
        (hidden_units, ),
        dynet.ConstInitializer(0),
    )
    self.label_output_W = self.model.add_parameters(
        (label_out, hidden_units),
        dynet.ConstInitializer(0),
    )
    self.label_output_b = self.model.add_parameters(
        (label_out, ),
        dynet.ConstInitializer(0),
    )
# --- Top-level script: build the adversarial network and dispatch training ---
# NOTE(review): relies on names defined earlier (input_vocab, hid_size,
# adv_hid_size, num_adv, dropout, lstm_size, adv_depth, rnn_dropout,
# rnn_type, ro, x_train, x_test, num_epoch, task_type, vec_dropout, logger,
# AdvNN, train_task, train, dy).
out_size = 2
with open(input_vocab, 'r') as f:
    vocab = f.readlines()
vocab = map(lambda s: s.strip(), vocab)
# NOTE(review): on Python 3 `map` returns an iterator and len() below would
# raise TypeError — this appears to target Python 2.
vocab_size = len(vocab)

adv_net = AdvNN(hid_size, hid_size, out_size, hid_size, adv_hid_size,
                out_size, num_adv, vocab_size, dropout, lstm_size, adv_depth,
                rnn_dropout=rnn_dropout, rnn_type=rnn_type)
# Momentum SGD over the network's internal parameter collection.
trainer = dy.MomentumSGDTrainer(adv_net._model)
batch = 32

# Dispatch: single-task vs two-task adversarial training.
if ro == str(-1):
    logger.debug('1 task')
    train_task(adv_net, x_train, x_test, trainer, num_epoch, batch,
               task_type, logger)
else:
    logger.debug('2 tasks')
    train(adv_net, x_train, x_test, trainer, num_epoch, batch, vec_dropout,
          logger)
def train_model(model, char_lookup, feat_lookup, R, bias, encoder_frnn,
                encoder_rrnn, decoder_rnn, train_lemmas, train_feat_dicts,
                train_words, dev_lemmas, dev_feat_dicts, dev_words,
                alphabet_index, inverse_alphabet_index, epochs, optimization,
                results_file_path, train_aligned_pairs, dev_aligned_pairs,
                feat_index, feature_types, plot):
    """Per-example training loop for the morphological inflection model.

    NOTE(review): Python 2 code (print statements, xrange, list-returning
    zip/range); ``pc`` is the pycnn/DyNet module. With EARLY_STOPPING it
    evaluates train (on a 100-example sanity slice) and dev each epoch,
    saves the best model, and stops on perfect accuracy or MAX_PATIENCE.
    Returns (model, last_epoch).
    """
    print 'training...'
    np.random.seed(17)
    random.seed(17)

    # Trainer selection; unknown names fall back to plain SGD.
    if optimization == 'ADAM':
        trainer = pc.AdamTrainer(model, lam=REGULARIZATION,
                                 alpha=LEARNING_RATE, beta_1=0.9,
                                 beta_2=0.999, eps=1e-8)
    elif optimization == 'MOMENTUM':
        trainer = pc.MomentumSGDTrainer(model)
    elif optimization == 'SGD':
        trainer = pc.SimpleSGDTrainer(model)
    elif optimization == 'ADAGRAD':
        trainer = pc.AdagradTrainer(model)
    elif optimization == 'ADADELTA':
        trainer = pc.AdadeltaTrainer(model)
    else:
        trainer = pc.SimpleSGDTrainer(model)

    total_loss = 0
    best_avg_dev_loss = 999
    best_dev_accuracy = -1
    best_train_accuracy = -1
    patience = 0
    train_len = len(train_words)
    sanity_set_size = 100
    epochs_x = []
    train_loss_y = []
    dev_loss_y = []
    train_accuracy_y = []
    dev_accuracy_y = []
    e = -1

    # progress bar init
    widgets = [progressbar.Bar('>'), ' ', progressbar.ETA()]
    train_progress_bar = progressbar.ProgressBar(widgets=widgets,
                                                 maxval=epochs).start()
    avg_loss = -1

    for e in xrange(epochs):
        # randomize the training set
        indices = range(train_len)
        random.shuffle(indices)
        train_set = zip(train_lemmas, train_feat_dicts, train_words,
                        train_aligned_pairs)
        train_set = [train_set[i] for i in indices]

        # compute loss for each example and update
        for i, example in enumerate(train_set):
            lemma, feats, word, alignment = example
            loss = one_word_loss(model, char_lookup, feat_lookup, R, bias,
                                 encoder_frnn, encoder_rrnn, decoder_rnn,
                                 lemma, feats, word, alphabet_index,
                                 alignment, feat_index, feature_types)
            loss_value = loss.value()
            total_loss += loss_value
            loss.backward()
            trainer.update()
            if i > 0:
                avg_loss = total_loss / float(i + e * train_len)
            else:
                avg_loss = total_loss

        if EARLY_STOPPING:
            # get train accuracy
            print 'evaluating on train...'
            train_predictions = predict_sequences(
                model, char_lookup, feat_lookup, R, bias, encoder_frnn,
                encoder_rrnn, decoder_rnn, alphabet_index,
                inverse_alphabet_index, train_lemmas[:sanity_set_size],
                train_feat_dicts[:sanity_set_size], feat_index,
                feature_types)
            train_accuracy = evaluate_model(
                train_predictions, train_lemmas[:sanity_set_size],
                train_feat_dicts[:sanity_set_size],
                train_words[:sanity_set_size], feature_types,
                print_results=False)[1]

            if train_accuracy > best_train_accuracy:
                best_train_accuracy = train_accuracy

            dev_accuracy = 0
            avg_dev_loss = 0

            if len(dev_lemmas) > 0:
                # get dev accuracy
                dev_predictions = predict_sequences(
                    model, char_lookup, feat_lookup, R, bias, encoder_frnn,
                    encoder_rrnn, decoder_rnn, alphabet_index,
                    inverse_alphabet_index, dev_lemmas, dev_feat_dicts,
                    feat_index, feature_types)
                print 'evaluating on dev...'
                # get dev accuracy
                dev_accuracy = evaluate_model(dev_predictions, dev_lemmas,
                                              dev_feat_dicts, dev_words,
                                              feature_types,
                                              print_results=True)[1]

                if dev_accuracy > best_dev_accuracy:
                    best_dev_accuracy = dev_accuracy
                    # save best model to disk
                    save_pycnn_model(model, results_file_path)
                    print 'saved new best model'
                    patience = 0
                else:
                    patience += 1

                # found "perfect" model
                if dev_accuracy == 1:
                    train_progress_bar.finish()
                    if plot:
                        plt.cla()
                    return model, e

                # get dev loss
                total_dev_loss = 0
                for i in xrange(len(dev_lemmas)):
                    total_dev_loss += one_word_loss(
                        model, char_lookup, feat_lookup, R, bias,
                        encoder_frnn, encoder_rrnn, decoder_rnn,
                        dev_lemmas[i], dev_feat_dicts[i], dev_words[i],
                        alphabet_index, dev_aligned_pairs[i], feat_index,
                        feature_types).value()

                avg_dev_loss = total_dev_loss / float(len(dev_lemmas))
                if avg_dev_loss < best_avg_dev_loss:
                    best_avg_dev_loss = avg_dev_loss

                print 'epoch: {0} train loss: {1:.4f} dev loss: {2:.4f} dev accuracy: {3:.4f} train accuracy = {4:.4f} \
best dev accuracy {5:.4f} best train accuracy: {6:.4f} patience = {7}'.format(
                    e, avg_loss, avg_dev_loss, dev_accuracy, train_accuracy,
                    best_dev_accuracy, best_train_accuracy, patience)

                log_to_file(results_file_path + '_log.txt', e, avg_loss,
                            train_accuracy, dev_accuracy)

                if patience == MAX_PATIENCE:
                    print 'out of patience after {0} epochs'.format(str(e))
                    # TODO: would like to return best model but pycnn has a bug with save and load. Maybe copy via code?
                    # return best_model[0]
                    train_progress_bar.finish()
                    if plot:
                        plt.cla()
                    return model, e
            else:
                # if no dev set is present, optimize on train set
                print 'no dev set for early stopping, running all epochs until perfectly fitting or patience was \
reached on the train set'

                if train_accuracy > best_train_accuracy:
                    best_train_accuracy = train_accuracy
                    # save best model to disk
                    save_pycnn_model(model, results_file_path)
                    print 'saved new best model'
                    patience = 0
                else:
                    patience += 1

                print 'epoch: {0} train loss: {1:.4f} train accuracy = {2:.4f} best train accuracy: {3:.4f} \
patience = {4}'.format(e, avg_loss, train_accuracy, best_train_accuracy,
                       patience)

                # found "perfect" model on train set or patience has reached
                if train_accuracy == 1 or patience == MAX_PATIENCE:
                    train_progress_bar.finish()
                    if plot:
                        plt.cla()
                    return model, e

        # update lists for plotting
        train_accuracy_y.append(train_accuracy)
        epochs_x.append(e)
        train_loss_y.append(avg_loss)
        dev_loss_y.append(avg_dev_loss)
        dev_accuracy_y.append(dev_accuracy)

        # finished epoch
        train_progress_bar.update(e)
        if plot:
            with plt.style.context('fivethirtyeight'):
                p1, = plt.plot(epochs_x, dev_loss_y, label='dev loss')
                p2, = plt.plot(epochs_x, train_loss_y, label='train loss')
                p3, = plt.plot(epochs_x, dev_accuracy_y, label='dev acc.')
                p4, = plt.plot(epochs_x, train_accuracy_y, label='train acc.')
                plt.legend(loc='upper left', handles=[p1, p2, p3, p4])
                plt.savefig(results_file_path + '.png')
    train_progress_bar.finish()
    if plot:
        plt.cla()
    print 'finished training. average loss: ' + str(avg_loss)
    return model, e
def __init__(self, params, vocab, embeddings, char_embeddings):
    """Build the LSTM-cascade model for opinion target extraction (OTE) and
    targeted sentiment (TS) tagging.

    NOTE: DyNet parameter objects are created in a fixed order below; the
    order affects auto-generated parameter names used for model save/load,
    so do not reorder the builder/Linear constructions.

    :param params: hyper-parameter namespace (dims, dropout, optimizer, ...)
    :param vocab: word vocabulary (maps word -> id)
    :param embeddings: pretrained word embedding matrix
    :param char_embeddings: pretrained character embedding matrix
    """
    self.params = params
    self.name = 'lstm_cascade'
    # copy dimensions / hyper-parameters from params
    self.dim_char = params.dim_char
    self.dim_w = params.dim_w
    self.dim_char_h = params.dim_char_h
    self.dim_ote_h = params.dim_ote_h
    self.dim_ts_h = params.dim_ts_h
    self.input_win = params.input_win
    self.ds_name = params.ds_name
    # tag vocabulary of opinion target extraction and targeted sentiment
    self.ote_tag_vocab = params.ote_tag_vocab
    self.ts_tag_vocab = params.ts_tag_vocab
    self.dim_ote_y = len(self.ote_tag_vocab)
    self.dim_ts_y = len(self.ts_tag_vocab)
    self.n_epoch = params.n_epoch
    self.dropout_rate = params.dropout
    self.tagging_schema = params.tagging_schema
    self.clip_grad = params.clip_grad
    self.use_char = params.use_char
    # name of word embeddings
    self.emb_name = params.emb_name
    self.embeddings = embeddings
    self.vocab = vocab
    # character vocabulary
    self.char_vocab = params.char_vocab
    #self.td_proportions = params.td_proportions
    self.epsilon = params.epsilon
    #self.tc_proportions = params.tc_proportions
    self.pc = dy.ParameterCollection()
    if self.use_char:
        # character-level embeddings + char BiLSTM widen the word input
        self.char_emb = CharEmb(pc=self.pc,
                                n_chars=len(self.char_vocab),
                                dim_char=self.dim_char,
                                pretrained_embeddings=char_embeddings)
        self.lstm_char = dy.LSTMBuilder(1, self.dim_char, self.dim_char_h, self.pc)
        dim_input = self.input_win * self.dim_w + 2 * self.dim_char_h
    else:
        dim_input = self.input_win * self.dim_w
    # word embedding layer
    self.emb = WDEmb(pc=self.pc, n_words=len(vocab), dim_w=self.dim_w,
                     pretrained_embeddings=embeddings)
    # lstm layers: OTE LSTM reads word input; TS LSTM reads the (bi-directional)
    # OTE hidden states, hence the 2 * dim_ote_h input size
    self.lstm_ote = dy.LSTMBuilder(1, dim_input, self.dim_ote_h, self.pc)
    self.lstm_ts = dy.LSTMBuilder(1, 2 * self.dim_ote_h, self.dim_ts_h, self.pc)
    # fully connected layer
    self.fc_ote = Linear(pc=self.pc, n_in=2 * self.dim_ote_h, n_out=self.dim_ote_y)
    self.fc_ts = Linear(pc=self.pc, n_in=2 * self.dim_ts_h, n_out=self.dim_ts_y)
    # the OTE->TS transition table below is only defined for the BIEOS schema
    assert self.tagging_schema == 'BIEOS'
    transition_path = {
        'B': ['B-POS', 'B-NEG', 'B-NEU'],
        'I': ['I-POS', 'I-NEG', 'I-NEU'],
        'E': ['E-POS', 'E-NEG', 'E-NEU'],
        'S': ['S-POS', 'S-NEG', 'S-NEU'],
        'O': ['O']
    }
    # transition_scores[ote, ts] = uniform probability mass of moving from an
    # OTE tag to each of its compatible TS tags (rows sum to 1)
    self.transition_scores = np.zeros((self.dim_ote_y, self.dim_ts_y))
    for t in transition_path:
        next_tags = transition_path[t]
        n_next_tag = len(next_tags)
        ote_id = self.ote_tag_vocab[t]
        for nt in next_tags:
            ts_id = self.ts_tag_vocab[nt]
            self.transition_scores[ote_id][ts_id] = 1.0 / n_next_tag
    print(self.transition_scores)
    # transpose so the matrix maps TS rows to OTE columns downstream
    self.transition_scores = np.array(self.transition_scores, dtype='float32').transpose()
    # opinion target-opinion words co-occurrence modeling
    self.stm_lm = Linear(pc=self.pc, n_in=2 * self.dim_ote_h,
                         n_out=2 * self.dim_ote_h, nonlinear='tanh')
    # fully connected layer for opinion-enhanced indicator prediction task
    self.fc_stm = Linear(pc=self.pc, n_in=2 * self.dim_ote_h, n_out=2)
    # gate for maintaining sentiment consistency
    self.W_gate = self.pc.add_parameters(
        (2 * self.dim_ote_h, 2 * self.dim_ote_h), init=dy.UniformInitializer(0.2))
    # determine the optimizer
    if params.optimizer == 'sgd':
        self.optimizer = dy.SimpleSGDTrainer(self.pc, params.sgd_lr)
    elif params.optimizer == 'adam':
        # NOTE(review): positional args are (alpha, beta_1, beta_2); beta_2=0.9
        # deviates from the usual 0.999 default — confirm this is intentional.
        self.optimizer = dy.AdamTrainer(self.pc, 0.001, 0.9, 0.9)
    elif params.optimizer == 'adadelta':
        self.optimizer = dy.AdadeltaTrainer(self.pc)
    elif params.optimizer == 'momentum':
        self.optimizer = dy.MomentumSGDTrainer(self.pc, 0.01, 0.9)
    else:
        raise Exception("Unsupported optimizer type: %s" % params.optimizer)
def train(opt):
    """Train a seq2seq translation model end to end.

    Reads/builds the source and target dictionaries, loads the corpora,
    constructs the model and optimizer from ``opt``, then runs the training
    loop with periodic train-loss reports, validation-loss checks and BLEU
    evaluation (saving the model whenever the dev BLEU improves).

    :param opt: option namespace (paths, dims, trainer name, schedules, ...)
    """
    # Load data =========================================================
    if opt.verbose:
        print('Reading corpora')
    # Read vocabs (load a fixed dictionary if provided, otherwise build one
    # from the training corpus and save it for reuse)
    if opt.dic_src:
        widss, ids2ws = data.load_dic(opt.dic_src)
    else:
        widss, ids2ws = data.read_dic(opt.train_src, max_size=opt.src_vocab_size)
        data.save_dic(opt.exp_name + '_src_dic.txt', widss)
    if opt.dic_dst:
        widst, ids2wt = data.load_dic(opt.dic_dst)
    else:
        widst, ids2wt = data.read_dic(opt.train_dst, max_size=opt.trg_vocab_size)
        data.save_dic(opt.exp_name + '_trg_dic.txt', widst)
    # Read training
    trainings_data = data.read_corpus(opt.train_src, widss)
    trainingt_data = data.read_corpus(opt.train_dst, widst)
    # Read validation
    valids_data = data.read_corpus(opt.valid_src, widss)
    validt_data = data.read_corpus(opt.valid_dst, widst)
    # Create model ======================================================
    if opt.verbose:
        print('Creating model')
        sys.stdout.flush()
    s2s = seq2seq.Seq2SeqModel(opt.emb_dim,
                               opt.hidden_dim,
                               opt.att_dim,
                               widss,
                               widst,
                               model_file=opt.model,
                               bidir=opt.bidir,
                               word_emb=opt.word_emb,
                               dropout=opt.dropout_rate,
                               max_len=opt.max_len)
    if s2s.model_file is not None:
        s2s.load()
    s2s.model_file = opt.exp_name + '_model.txt'
    # Trainer ==========================================================
    # BUG FIX: the 'clr' branch used `if` instead of `elif`, so choosing
    # 'sgd' fell through to the else branch, printed a spurious "Trainer
    # name invalid" warning and rebuilt the trainer a second time. The
    # chain now matches the elif style used by get_trainer().
    if opt.trainer == 'sgd':
        trainer = dy.SimpleSGDTrainer(
            s2s.model, e0=opt.learning_rate, edecay=opt.learning_rate_decay)
    elif opt.trainer == 'clr':
        trainer = dy.CyclicalSGDTrainer(s2s.model,
                                        e0_min=opt.learning_rate / 10,
                                        e0_max=opt.learning_rate,
                                        edecay=opt.learning_rate_decay)
    elif opt.trainer == 'momentum':
        trainer = dy.MomentumSGDTrainer(
            s2s.model, e0=opt.learning_rate, edecay=opt.learning_rate_decay)
    elif opt.trainer == 'rmsprop':
        trainer = dy.RMSPropTrainer(s2s.model, e0=opt.learning_rate,
                                    edecay=opt.learning_rate_decay)
    elif opt.trainer == 'adam':
        trainer = dy.AdamTrainer(s2s.model, opt.learning_rate,
                                 edecay=opt.learning_rate_decay)
    else:
        print('Trainer name invalid or not provided, using SGD', file=sys.stderr)
        trainer = dy.SimpleSGDTrainer(
            s2s.model, e0=opt.learning_rate, edecay=opt.learning_rate_decay)
    if opt.verbose:
        print('Using '+opt.trainer+' optimizer')
    trainer.set_clip_threshold(opt.gradient_clip)
    # Print configuration ===============================================
    if opt.verbose:
        options.print_config(opt, src_dict_size=len(widss), trg_dict_size=len(widst))
        sys.stdout.flush()
    # Creat batch loaders ===============================================
    if opt.verbose:
        print('Creating batch loaders')
        sys.stdout.flush()
    trainbatchloader = data.BatchLoader(trainings_data, trainingt_data, opt.batch_size)
    devbatchloader = data.BatchLoader(valids_data, validt_data, opt.dev_batch_size)
    # Start training ====================================================
    if opt.verbose:
        print('starting training')
        sys.stdout.flush()
    start = time.time()
    train_loss = 0
    processed = 0
    best_bleu = 0
    i = 0
    for epoch in range(opt.num_epochs):
        for x, y in trainbatchloader:
            processed += sum(map(len, y))
            bsize = len(y)
            # Compute loss
            loss = s2s.calculate_loss(x, y)
            # Backward pass and parameter update
            loss.backward()
            trainer.update()
            train_loss += loss.scalar_value() * bsize
            if (i+1) % opt.check_train_error_every == 0:
                # Check average training error from time to time
                logloss = train_loss / processed
                ppl = np.exp(logloss)
                elapsed = time.time()-start
                trainer.status()
                print(" Training_loss=%f, ppl=%f, time=%f s, tokens processed=%d" %
                      (logloss, ppl, elapsed, processed))
                start = time.time()
                train_loss = 0
                processed = 0
                sys.stdout.flush()
            if (i+1) % opt.check_valid_error_every == 0:
                # Check generalization error on the validation set from time to time
                dev_loss = 0
                dev_processed = 0
                dev_start = time.time()
                for x, y in devbatchloader:
                    dev_processed += sum(map(len, y))
                    bsize = len(y)
                    loss = s2s.calculate_loss(x, y, test=True)
                    dev_loss += loss.scalar_value() * bsize
                dev_logloss = dev_loss/dev_processed
                dev_ppl = np.exp(dev_logloss)
                dev_elapsed = time.time()-dev_start
                print("[epoch %d] Dev loss=%f, ppl=%f, time=%f s, tokens processed=%d" %
                      (epoch, dev_logloss, dev_ppl, dev_elapsed, dev_processed))
                sys.stdout.flush()
                start = time.time()
            if (i+1) % opt.valid_bleu_every == 0:
                # Check BLEU score on the validation set from time to time
                print('Start translating validation set, buckle up!')
                sys.stdout.flush()
                bleu_start = time.time()
                with open(opt.valid_out, 'w+') as f:
                    for x in valids_data:
                        y_hat = s2s.translate(x, beam_size=opt.beam_size)
                        # strip the <s> and </s> sentinels before writing
                        translation = [ids2wt[w] for w in y_hat[1:-1]]
                        print(' '.join(translation), file=f)
                bleu, details = evaluation.bleu_score(opt.valid_dst, opt.valid_out)
                bleu_elapsed = time.time()-bleu_start
                print('Finished translating validation set', bleu_elapsed, 'elapsed.')
                print(details)
                # Early stopping : save the latest best model
                if bleu > best_bleu:
                    best_bleu = bleu
                    print('Best BLEU score up to date, saving model to', s2s.model_file)
                    s2s.save()
                sys.stdout.flush()
                start = time.time()
            i = i+1
        trainer.update_epoch()
def __init__(self, params, vocab, embeddings):
    """Build the LSTM-CRF targeted-sentiment tagger.

    NOTE: DyNet parameters are created in a fixed order; reordering the
    constructions below would change auto-generated parameter names used
    for model save/load.

    :param params: parameters
    :param vocab: vocabulary
    :param embeddings: pretrained word embeddings
    """
    self.params = params
    self.name = 'lstm_crf'
    # copy dimensions / hyper-parameters from params
    self.dim_char = params.dim_char
    self.dim_w = params.dim_w
    self.dim_char_h = params.dim_char_h
    self.dim_ote_h = params.dim_ote_h
    self.dim_ts_h = params.dim_ts_h
    self.input_win = params.input_win
    self.ds_name = params.ds_name
    # tag vocabulary of opinion target extraction and targeted sentiment
    self.ote_tag_vocab = params.ote_tag_vocab
    self.ts_tag_vocab = params.ts_tag_vocab
    self.dim_ote_y = len(self.ote_tag_vocab)
    self.dim_ts_y = len(self.ts_tag_vocab)
    self.n_epoch = params.n_epoch
    self.dropout_rate = params.dropout
    self.tagging_schema = params.tagging_schema
    self.clip_grad = params.clip_grad
    self.use_char = params.use_char
    # name of word embeddings
    self.emb_name = params.emb_name
    self.embeddings = embeddings
    self.vocab = vocab
    # character vocabulary
    self.char_vocab = params.char_vocab
    self.pc = dy.ParameterCollection()
    # word embedding layer
    self.emb = WDEmb(pc=self.pc, n_words=len(vocab), dim_w=self.dim_w,
                     pretrained_embeddings=embeddings)
    # input dimension (windowed word embeddings)
    dim_input = self.input_win * self.dim_w
    self.lstm_ts = dy.LSTMBuilder(1, dim_input, self.dim_ts_h, self.pc)
    # hidden layer between LSTM and CRF decoding layer
    self.hidden = Linear(pc=self.pc, n_in=2 * self.dim_ts_h, n_out=self.dim_ts_h,
                         use_bias=True, nonlinear='tanh')
    # map the word representation to the ts label space
    # in the label space, both BEG and END tag are considered
    self.fc_ts = Linear(pc=self.pc, n_in=self.dim_ts_h, n_out=self.dim_ts_y)
    # transition matrix, [i, j] is the transition score from tag i to tag j
    # (+2 rows/cols account for the extra BEG and END tags)
    self.transitions = self.pc.add_lookup_parameters(
        (self.dim_ts_y + 2, self.dim_ts_y + 2))
    # determine the optimizer
    if params.optimizer == 'sgd':
        self.optimizer = dy.SimpleSGDTrainer(self.pc, params.sgd_lr)
    elif params.optimizer == 'adam':
        # NOTE(review): positional args are (alpha, beta_1, beta_2); beta_2=0.9
        # deviates from the usual 0.999 default — confirm this is intentional.
        self.optimizer = dy.AdamTrainer(self.pc, 0.001, 0.9, 0.9)
    elif params.optimizer == 'adadelta':
        self.optimizer = dy.AdadeltaTrainer(self.pc)
    elif params.optimizer == 'momentum':
        self.optimizer = dy.MomentumSGDTrainer(self.pc, 0.01, 0.9)
    else:
        raise Exception("Unsupported optimizer type: %s" % params.optimizer)
def __init__(self, yaml_context, e0=0.01, mom=0.9):
    """Wrap a DyNet momentum-SGD trainer over the context's parameter collection.

    :param yaml_context: context object exposing the global DyNet parameter
        collection via ``dynet_param_collection.param_col``
    :param e0: initial learning rate
    :param mom: momentum coefficient
    """
    param_col = yaml_context.dynet_param_collection.param_col
    self.optimizer = dy.MomentumSGDTrainer(param_col, e0, mom)
def finetune(self, best_scores):
    """Finetune each task's decoder with the shared encoders frozen.

    For every task: reset the trainer (Adam, optionally switching to
    momentum SGD once patience runs out, or immediately when there is
    only one task), reload the best checkpoint, train batch by batch,
    and track the dev score in ``best_scores`` (mutated in place) with
    early stopping.

    :param best_scores: dict task -> best dev score so far; updated in place
    """
    # freeze all encoders
    self.encoders['feat'].set_freeze(True)
    for task in self.args.tasks:
        # self.decoders[task].tree_encoder.set_freeze(True)
        # restart the trainer to clear the momentum from the previous training
        self.trainer = dy.AdamTrainer(self.model)
        # load the best model from the previous finetuning
        self.load_model()
        self.log(f'Start finetuning {task}')
        switch_trainer = (
            len(self.args.tasks) == 1
        )  # directly change to SGD if there is only one task
        switched = False
        waited = 0
        step = 0
        for batch in self.iterate_batch:
            loss = total = correct = 0
            if switch_trainer:
                self.trainer = dy.MomentumSGDTrainer(self.model)
                switch_trainer = False
                switched = True
            # train on a batch of sentences
            t0 = time()
            for sent in tqdm(batch):
                step += 1
                self.encode_sent(sent, True)
                res = self.decoders[task].train_one_step(sent)
                sent.clear_pred()
                loss += res['loss']
                total += res['total']
                correct += res['correct']
                if res['loss_expr']:
                    try:
                        res['loss_expr'].backward()
                        self.trainer.update()
                    # BUG FIX: was a bare `except:`, which also swallowed
                    # KeyboardInterrupt/SystemExit; catch Exception only so
                    # the process can still be interrupted.
                    except Exception:
                        self.log('bad gradient, load previous model')
                        self.load_model()
            train_time = time() - t0
            # evaluate on dev set (capped at 1000 sentences for speed)
            res = self.predict(self.dev_sents[:1000], task)
            score = res['score']
            # (was res[f'time'] — an f-string with no placeholder; plain key)
            self.log(
                f"[step={step}]\ttrain_time={train_time:.1f}s\ttest_time={res['time']:.1f}s"
            )
            # NOTE(review): loss is accumulated per batch but divided by
            # args.eval_every — confirm the intended normalization.
            self.log(f"[step={step}]\t{task}_loss={loss/self.args.eval_every:.2f}, "\
                     f"train_{task}_score={correct}/{total}={100*correct/total:.2f}")
            self.log(f"[step={step}]\tdev_{task}_score={100*score:.2f}")
            if score > best_scores[task]:
                best_scores[task] = score
                self.save_model()
                waited = 0
            else:
                waited += 1
                if waited > self.args.patience:
                    if switched:
                        self.log('out of patience')
                        break
                    else:
                        # first exhaustion of patience: switch Adam -> SGD
                        # and reset the patience counter
                        self.log('switch trainer')
                        switch_trainer = True
                        waited = 0
            if step >= self.args.max_step:
                break
        self.log(f'Finish finetuning {task}')
        # freeze this task's decoder before moving on to the next task
        self.decoders[task].set_freeze(True)
def __init__(self, params, vocab, label2tag, pretrained_embeddings=None):
    """Build the joint aspect/opinion tagging model (dual RNNs + THA + STN).

    NOTE: DyNet parameters are created in a fixed order; reordering the
    constructions below would change auto-generated parameter names used
    for model save/load.

    :param params: hyper-parameter namespace (dims, dropout, rnn/attention type, ...)
    :param vocab: word vocabulary (word -> id)
    :param label2tag: mapping from aspect label id to tag string
    :param pretrained_embeddings: optional pretrained word embedding matrix
    """
    self.dim_w = params.dim_w
    self.win = params.win
    self.vocab = vocab
    self.n_words = len(self.vocab)
    self.dim_asp = params.dim_asp
    self.dim_opi = params.dim_opi
    self.dim_y_asp = params.n_asp_tags
    self.dim_y_opi = params.n_opi_tags
    self.n_steps = params.n_steps
    self.asp_label2tag = label2tag
    # opinion tags are a fixed binary O/T scheme
    self.opi_label2tag = {0: 'O', 1: 'T'}
    self.dropout_asp = params.dropout_asp
    self.dropout_opi = params.dropout_opi
    self.dropout = params.dropout
    self.rnn_type = params.rnn_type
    self.ds_name = params.ds_name
    self.model_name = params.model_name
    self.attention_type = params.attention_type
    self.pc = dy.ParameterCollection()
    self.Emb = WDEmb(pc=self.pc, n_words=self.n_words, dim_w=self.dim_w,
                     pretrained_embeddings=pretrained_embeddings)
    #self.ASP_RNN = LSTM(pc=self.pc, n_in=self.win*self.dim_w, n_out=self.dim_asp, dropout_rate=self.dropout_asp)
    #self.OPI_RNN = LSTM(pc=self.pc, n_in=self.win*self.dim_w, n_out=self.dim_opi, dropout_rate=self.dropout_opi)
    # use dynet RNNBuilder rather than the self-defined RNN classes
    if self.rnn_type == 'LSTM':
        self.ASP_RNN = dy.LSTMBuilder(1, self.win * self.dim_w, self.dim_asp, self.pc)
        self.OPI_RNN = dy.LSTMBuilder(1, self.win * self.dim_w, self.dim_opi, self.pc)
    elif self.rnn_type == 'GRU':
        # NOT TRIED!
        self.ASP_RNN = dy.GRUBuilder(1, self.win * self.dim_w, self.dim_asp, self.pc)
        self.OPI_RNN = dy.GRUBuilder(1, self.win * self.dim_w, self.dim_opi, self.pc)
    else:
        raise Exception("Invalid RNN type!!!")
    # truncated history attention over aspect representations
    self.THA = THA(pc=self.pc, n_steps=self.n_steps, n_in=2*self.dim_asp)
    if self.attention_type == 'bilinear':
        self.STN = ST_bilinear(pc=self.pc, dim_asp=self.dim_asp, dim_opi=self.dim_opi)
    # here dot attention is not applicable since the aspect representation and opinion representation
    # have different dimensions
    # elif self.attention_type == 'dot':
    #     self.STN = ST_dot(pc=self.pc, dim_asp=self.dim_asp, dim_opi=self.dim_opi)
    elif self.attention_type == 'concat':
        self.STN = ST_concat(pc=self.pc, dim_asp=self.dim_asp, dim_opi=self.dim_opi)
    else:
        raise Exception("Invalid attention type!!!")
    # output layers: aspect FC consumes aspect + opinion features concatenated
    self.ASP_FC = Linear(pc=self.pc, n_in=2*self.dim_asp+2*self.dim_opi, n_out=self.dim_y_asp)
    self.OPI_FC = Linear(pc=self.pc, n_in=2*self.dim_opi, n_out=self.dim_y_opi)
    self.layers = [self.ASP_FC, self.OPI_FC, self.THA, self.STN]
    if params.optimizer == 'sgd':
        self.optimizer = dy.SimpleSGDTrainer(self.pc, params.sgd_lr)
    elif params.optimizer == 'momentum':
        self.optimizer = dy.MomentumSGDTrainer(self.pc, 0.01, 0.9)
    elif params.optimizer == 'adam':
        # NOTE(review): positional args are (alpha, beta_1, beta_2); beta_2=0.9
        # deviates from the usual 0.999 default — confirm this is intentional.
        self.optimizer = dy.AdamTrainer(self.pc, 0.001, 0.9, 0.9)
    elif params.optimizer == 'adagrad':
        self.optimizer = dy.AdagradTrainer(self.pc)
    elif params.optimizer == 'adadelta':
        # use default value of adadelta
        self.optimizer = dy.AdadeltaTrainer(self.pc)
    else:
        raise Exception("Invalid optimizer!!")
training_instances, training_vocab, \ dev_instances, dev_vocab, test_instances, tag_set_sizes = processed_dataset.get_all_params() # ===-----------------------------------------------------------------------=== # Build model and trainer # ===-----------------------------------------------------------------------=== model = LSTMTagger(tagset_sizes=tag_set_sizes, num_lstm_layers=options.lstm_layers, hidden_dim=options.hidden_dim, word_level_dim=options.word_level_dim, charset_size=len(c2i), char_embedding_dim=DEFAULT_CHAR_EMBEDDING_SIZE) trainer = dy.MomentumSGDTrainer(model.model, options.learning_rate, 0.9) logging.info("Training Algorithm: {}".format(type(trainer))) logging.info("Number training instances: {}".format(len(training_instances))) logging.info("Number dev instances: {}".format(len(dev_instances))) best_dev_pos = 0.0 old_best_name = None for epoch in range(options.num_epochs): bar = progressbar.ProgressBar() # set up epoch random.shuffle(training_instances) train_loss = 0.0
def __init__(self, e0=0.01, mom=0.9):
    """Create a momentum-SGD trainer over the global parameter collection.

    :param e0: initial learning rate
    :param mom: momentum coefficient
    """
    collection = ParamManager.global_collection()
    self.optimizer = dy.MomentumSGDTrainer(collection, e0, mom)