def _init_optimizer(self, model, **kwargs):
    mom = float(kwargs.get('mom', 0.0))
    optim = kwargs.get('optim', 'sgd')
    clip = kwargs.get('clip')
    self.current_lr = kwargs.get('eta', kwargs.get('lr', 0.01))
    if optim == 'adadelta':
        self.optimizer = dy.AdadeltaTrainer(model.pc)
    elif optim == 'adam':
        self.optimizer = dy.AdamTrainer(model.pc,
                                        alpha=self.current_lr,
                                        beta_1=kwargs.get('beta1', 0.9),
                                        beta_2=kwargs.get('beta2', 0.999),
                                        eps=kwargs.get('epsilon', 1e-8))
    elif optim == 'rmsprop':
        self.optimizer = dy.RMSPropTrainer(model.pc, learning_rate=self.current_lr)
    else:
        if mom == 0 or mom is None:
            self.optimizer = dy.SimpleSGDTrainer(model.pc, learning_rate=self.current_lr)
        else:
            logging.info('Using mom %f', mom)
            self.optimizer = dy.MomentumSGDTrainer(model.pc,
                                                   learning_rate=self.current_lr,
                                                   mom=mom)
    if clip is not None:
        self.optimizer.set_clip_threshold(clip)
    self.optimizer.set_sparse_updates(False)
def __init__(self,
             eps: numbers.Real = 1e-6,
             rho: numbers.Real = 0.95,
             skip_noisy: bool = False) -> None:
    super().__init__(
        optimizer=dy.AdadeltaTrainer(ParamManager.global_collection(), eps, rho),
        skip_noisy=skip_noisy)
def train(self):
    trainer = dy.AdadeltaTrainer(self.model)

    best_acc, repeat = 0.0, 0
    for epoch in range(self.config.epochs):
        dy.renew_cg()
        losses = []
        closs = 0.0
        for i, traininst in enumerate(self.trainset):
            pre_context = [self.EOS] + traininst['pre_context']
            pos_context = traininst['pos_context'] + [self.EOS]
            refex = [w.lower() for w in traininst['refex']] if self.lowercase else traininst['refex']
            refex = [self.EOS] + refex + [self.EOS]
            entity = traininst['entity']
            entity_tokens = entity.replace('\"', '').replace('\'', '').replace(',', '').split('_')

            loss = self.get_loss(pre_context, pos_context, refex, entity, entity_tokens)
            losses.append(loss)

            if len(losses) == self.config.batch:
                loss = dy.esum(losses)
                closs += loss.value()
                loss.backward()
                trainer.update()
                dy.renew_cg()

                print("Epoch: {0} \t Loss: {1} \t Progress: {2}".format(
                    epoch,
                    round(closs / self.config.batch, 2),
                    round(i / len(self.trainset), 2)), end=' \r')
                losses = []
                closs = 0.0

        outputs, num, dem = self.validate()
        acc = round(float(num) / dem, 2)

        print("Dev acc: {0} \t Best acc: {1}".format(str(num / dem), best_acc))

        # Save the model with the best accuracy so far
        if best_acc == 0.0 or acc > best_acc:
            best_acc = acc
            self.logger.save_result(fname='dev_best', results=outputs, beam=self.config.beam)
            self.model.save(self.logger.model_path)
            repeat = 0
        else:
            repeat += 1

        # Stop early if the accuracy has not improved for `early_stop` consecutive epochs
        if repeat == self.config.early_stop:
            break
def train(self):
    trainer = dy.AdadeltaTrainer(self.model)

    log = []
    best_acc, repeat = 0.0, 0
    for epoch in range(self.config.epochs):
        dy.renew_cg()
        losses = []
        closs = 0.0
        for i, traininst in enumerate(self.trainset):
            pre_context = [self.EOS] + traininst['pre_context']
            pos_context = traininst['pos_context'] + [self.EOS]
            refex = [self.EOS] + traininst['refex'] + [self.EOS]
            entity = traininst['entity']

            loss = self.get_loss(pre_context, pos_context, refex, entity)
            losses.append(loss)

            if len(losses) == self.config.batch:
                loss = dy.esum(losses)
                closs += loss.value()
                loss.backward()
                trainer.update()
                dy.renew_cg()

                print("Epoch: {0} \t Loss: {1} \t Progress: {2}".format(
                    epoch,
                    (closs / self.config.batch),
                    round(i / len(self.trainset), 2)), end=' \r')
                losses = []
                closs = 0.0

        outputs, num, dem = self.validate()
        acc = float(num) / dem
        log.append(acc)

        print("Dev acc: {0} \t Best acc: {1}".format(round(acc, 2), best_acc))

        # Save the model with the best accuracy so far
        if best_acc == 0.0 or acc > best_acc:
            best_acc = acc
            fname = 'dev_best.txt'
            self.write(os.path.join(self.path, fname), outputs)
            fname = 'best_model.dy'
            self.model.save(os.path.join(self.path, fname))
            repeat = 0
        else:
            repeat += 1

        # Stop early if the accuracy has not improved for `early_stop` consecutive epochs
        if repeat == self.config.early_stop:
            break

    json.dump(log, open(os.path.join(self.path, 'log.json'), 'w'))
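# --- Illustrative sketch (not part of the original sources) ---
# A minimal, self-contained version of the dy.esum() minibatching pattern used
# by the train() methods above: loss expressions are accumulated, summed into a
# single node, and one backward/update is performed per batch before the
# computation graph is renewed. The toy data and the logistic-regression loss
# are assumptions made purely for illustration.
import dynet as dy

pc = dy.ParameterCollection()
W = pc.add_parameters((1, 3))
trainer = dy.AdadeltaTrainer(pc)

data = [([1.0, 0.0, 0.0], 1.0), ([0.0, 1.0, 0.0], 0.0)] * 4  # toy samples
batch_size = 4

dy.renew_cg()
losses = []
for x, y in data:
    pred = dy.logistic(W * dy.inputVector(x))
    losses.append(dy.binary_log_loss(pred, dy.scalarInput(y)))
    if len(losses) == batch_size:
        batch_loss = dy.esum(losses)   # sum of the individual losses
        batch_loss.value()             # forward pass
        batch_loss.backward()          # backprop through the whole batch
        trainer.update()               # single Adadelta update per batch
        dy.renew_cg()                  # start a fresh graph
        losses = []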
def __init__(self, params, vocab, label2tag, pretrained_embeddings=None): """ :param params: :param vocab: :param label2tag: :param pretrained_embeddings: """ self.dim_w = params.dim_w self.win = params.win self.vocab = vocab self.n_words = len(self.vocab) self.dim_asp = params.dim_asp self.dim_y_asp = params.n_asp_tags self.n_steps = params.n_steps self.asp_label2tag = label2tag self.dropout_asp = params.dropout_asp self.dropout = params.dropout self.ds_name = params.ds_name self.model_name = params.model_name self.attention_type = params.attention_type self.pc = dy.ParameterCollection() self.Emb = WDEmb(pc=self.pc, n_words=self.n_words, dim_w=self.dim_w, pretrained_embeddings=pretrained_embeddings) self.DEP_RecNN = DTreeBuilder(pc=self.pc, n_in=self.win * self.dim_w, n_out=self.dim_asp, dropout_rate=self.dropout_asp) self.ASP_RNN = dy.LSTMBuilder(1, self.win * self.dim_w, self.dim_asp, self.pc) self.BiAttention_F=BiAttention(pc=self.pc, n_in=self.dim_asp, n_out=self.dim_asp, dropout_rate=self.dropout_asp) self.BiAttention_B=BiAttention(pc=self.pc, n_in=self.dim_asp, n_out=self.dim_asp, dropout_rate=self.dropout_asp) self.BiAttention_T=BiAttention(pc=self.pc, n_in=self.dim_asp, n_out=self.dim_asp, dropout_rate=self.dropout_asp) self.MultiWeightLayer=MultiWeightLayer(pc=self.pc, n_in=self.dim_asp, n_out=self.dim_asp, dropout_rate=self.dropout_asp) self.ASP_FC = Linear(pc=self.pc, n_in=self.dim_asp, n_out=self.dim_y_asp) self.layers = [self.ASP_FC,self.DEP_RecNN,self.BiAttention_F,self.BiAttention_B,self.BiAttention_T,self.MultiWeightLayer] if params.optimizer == 'sgd': self.optimizer = dy.SimpleSGDTrainer(self.pc, params.sgd_lr) elif params.optimizer == 'momentum': self.optimizer = dy.MomentumSGDTrainer(self.pc, 0.01, 0.9) elif params.optimizer == 'adam': self.optimizer = dy.AdamTrainer(self.pc, 0.001, 0.9, 0.9) elif params.optimizer == 'adagrad': self.optimizer = dy.AdagradTrainer(self.pc) elif params.optimizer == 'adadelta': self.optimizer = dy.AdadeltaTrainer(self.pc) else: raise Exception("Invalid optimizer!!")
def __init__(
    self,
    bigrams_size,
    unigrams_size,
    bigrams_dims,
    unigrams_dims,
    lstm_units,
    hidden_units,
    label_size,
    span_nums,
    droprate=0,
):
    self.bigrams_size = bigrams_size
    self.bigrams_dims = bigrams_dims
    self.unigrams_dims = unigrams_dims
    self.unigrams_size = unigrams_size
    self.lstm_units = lstm_units
    self.hidden_units = hidden_units
    self.span_nums = span_nums
    self.droprate = droprate
    self.label_size = label_size

    self.model = dynet.Model()
    self.trainer = dynet.AdadeltaTrainer(self.model, eps=1e-7, rho=0.99)
    random.seed(1)

    self.activation = dynet.rectify

    self.bigram_embed = self.model.add_lookup_parameters(
        (self.bigrams_size, self.bigrams_dims),
    )
    self.unigram_embed = self.model.add_lookup_parameters(
        (self.unigrams_size, self.unigrams_dims),
    )
    self.fwd_lstm1 = LSTM(self.bigrams_dims + self.unigrams_dims, self.lstm_units, self.model)
    self.back_lstm1 = LSTM(self.bigrams_dims + self.unigrams_dims, self.lstm_units, self.model)
    self.fwd_lstm2 = LSTM(2 * self.lstm_units, self.lstm_units, self.model)
    self.back_lstm2 = LSTM(2 * self.lstm_units, self.lstm_units, self.model)

    self.p_hidden_W = self.model.add_parameters(
        (self.hidden_units, 2 * self.span_nums * self.lstm_units),
        dynet.UniformInitializer(0.01))
    self.p_hidden_b = self.model.add_parameters(
        (self.hidden_units, ), dynet.ConstInitializer(0))
    self.p_output_W = self.model.add_parameters(
        (self.label_size, self.hidden_units), dynet.ConstInitializer(0))
    self.p_output_b = self.model.add_parameters(
        (self.label_size, ), dynet.ConstInitializer(0))
def entailment(train_file, dev_file, test_file, embed_file, epochs, eps,
               reg_lambda, batch_size, per_log, LSTM_params, training_sample,
               sample_type, improvement):
    curr_time = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    print(curr_time + ": starting process")

    # read the train, dev and test sets; each list item is a sentence tuple
    train, train_words, max_len_train = read_data(train_file)
    dev, dev_words, max_len_dev = read_data(dev_file)
    test, test_words, max_len_test = read_data(test_file)
    P_rows = max([max_len_train, max_len_dev, max_len_test])

    # unify all unique words into one set and delete the independent sets
    all_words = train_words.union(dev_words).union(test_words)
    del train_words
    del dev_words
    del test_words

    # get embeddings
    embed_vec, vocab = get_embeddings(embed_file, all_words, LSTM_params[2])

    # define vocabulary and helper structures
    word2int = {w: i for i, w in enumerate(vocab)}
    label2int = {l: i for i, l in enumerate(["entailment", "neutral", "contradiction"])}
    vocab_size = len(vocab)
    num_labels = 3

    # create a classifier
    m = dy.ParameterCollection()
    trainer = dy.AdadeltaTrainer(m, eps)  # define trainer
    snli_classifier = ReRead_LSTM(vocab_size, num_labels, LSTM_params,
                                  embed_vec, P_rows, m, improvement)  # create classifier

    train_model(train, dev, test, epochs, batch_size, reg_lambda, trainer,
                snli_classifier, word2int, label2int, per_log, training_sample,
                sample_type, improvement)
def __init__(self, model, optim='sgd', clip=5, mom=0.9, **kwargs):
    super(ClassifyTrainerDynet, self).__init__()
    self.model = model
    eta = kwargs.get('eta', kwargs.get('lr', 0.01))
    print("Using eta [{:.4f}]".format(eta))
    print("Using optim [{}]".format(optim))
    self.labels = model.labels

    if optim == 'adadelta':
        self.optimizer = dy.AdadeltaTrainer(model.pc)
    elif optim == 'adam':
        self.optimizer = dy.AdamTrainer(model.pc)
    elif optim == 'rmsprop':
        self.optimizer = dy.RMSPropTrainer(model.pc, learning_rate=eta)
    else:
        print("using mom {:.3f}".format(mom))
        self.optimizer = dy.MomentumSGDTrainer(model.pc, learning_rate=eta, mom=mom)
    self.optimizer.set_clip_threshold(clip)
def optimizer(model, optim='sgd', eta=0.01, clip=None, mom=0.9, **kwargs):
    if 'lr' in kwargs:
        eta = kwargs['lr']
    print('Using eta [{:.4f}]'.format(eta))
    print('Using optim [{}]'.format(optim))
    if optim == 'adadelta':
        opt = dy.AdadeltaTrainer(model.pc)
    elif optim == 'adam':
        opt = dy.AdamTrainer(model.pc)
    elif optim == 'rmsprop':
        opt = dy.RMSPropTrainer(model.pc, learning_rate=eta)
    else:
        if mom == 0 or mom is None:
            opt = dy.SimpleSGDTrainer(model.pc, learning_rate=eta)
        else:
            print('Using mom {:.3f}'.format(mom))
            opt = dy.MomentumSGDTrainer(model.pc, learning_rate=eta, mom=mom)
    if clip is not None:
        opt.set_clip_threshold(clip)
    opt.set_sparse_updates(False)
    return opt
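# --- Illustrative usage sketch (not part of the original sources) ---
# A minimal, hedged example of how a factory like `optimizer()` above might be
# driven. `ToyModel` and its `pc`/`W` attributes are hypothetical stand-ins;
# only standard DyNet calls (ParameterCollection, AdadeltaTrainer, renew_cg,
# backward, update) are assumed.
import dynet as dy

class ToyModel(object):
    def __init__(self):
        self.pc = dy.ParameterCollection()
        self.W = self.pc.add_parameters((1, 4))

model = ToyModel()
opt = optimizer(model, optim='adadelta', clip=5.0)  # Adadelta branch ignores `eta`

dy.renew_cg()
x = dy.inputVector([1.0, 2.0, 3.0, 4.0])
loss = dy.squared_norm(model.W * x)  # dummy scalar loss
loss.value()                         # forward pass
loss.backward()
opt.update()                         # one Adadelta step on all parameters in model.pc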
def __init__(self, params, vocab, label2tag, pretrained_embeddings=None): """ :param params: :param vocab: :param label2tag: :param pretrained_embeddings: """ self.dim_w = params.dim_w self.win = params.win self.vocab = vocab self.n_words = len(self.vocab) self.dim_asp = params.dim_asp self.dim_opi = params.dim_opi self.dim_y_asp = params.n_asp_tags self.dim_y_opi = params.n_opi_tags self.n_steps = params.n_steps self.asp_label2tag = label2tag self.opi_label2tag = {0: 'O', 1: 'T'} self.dropout_asp = params.dropout_asp self.dropout_opi = params.dropout_opi self.dropout = params.dropout self.rnn_type = params.rnn_type self.ds_name = params.ds_name self.model_name = params.model_name self.attention_type = params.attention_type self.pc = dy.ParameterCollection() self.Emb = WDEmb(pc=self.pc, n_words=self.n_words, dim_w=self.dim_w, pretrained_embeddings=pretrained_embeddings) #self.ASP_RNN = LSTM(pc=self.pc, n_in=self.win*self.dim_w, n_out=self.dim_asp, dropout_rate=self.dropout_asp) #self.OPI_RNN = LSTM(pc=self.pc, n_in=self.win*self.dim_w, n_out=self.dim_opi, dropout_rate=self.dropout_opi) # use dynet RNNBuilder rather than the self-defined RNN classes if self.rnn_type == 'LSTM': self.ASP_RNN = dy.LSTMBuilder(1, self.win * self.dim_w, self.dim_asp, self.pc) self.OPI_RNN = dy.LSTMBuilder(1, self.win * self.dim_w, self.dim_opi, self.pc) elif self.rnn_type == 'GRU': # NOT TRIED! self.ASP_RNN = dy.GRUBuilder(1, self.win * self.dim_w, self.dim_asp, self.pc) self.OPI_RNN = dy.GRUBuilder(1, self.win * self.dim_w, self.dim_opi, self.pc) else: raise Exception("Invalid RNN type!!!") self.THA = THA(pc=self.pc, n_steps=self.n_steps, n_in=2*self.dim_asp) if self.attention_type == 'bilinear': self.STN = ST_bilinear(pc=self.pc, dim_asp=self.dim_asp, dim_opi=self.dim_opi) # here dot attention is not applicable since the aspect representation and opinion representation # have different dimensions # elif self.attention_type == 'dot': # self.STN = ST_dot(pc=self.pc, dim_asp=self.dim_asp, dim_opi=self.dim_opi) elif self.attention_type == 'concat': self.STN = ST_concat(pc=self.pc, dim_asp=self.dim_asp, dim_opi=self.dim_opi) else: raise Exception("Invalid attention type!!!") self.ASP_FC = Linear(pc=self.pc, n_in=2*self.dim_asp+2*self.dim_opi, n_out=self.dim_y_asp) self.OPI_FC = Linear(pc=self.pc, n_in=2*self.dim_opi, n_out=self.dim_y_opi) self.layers = [self.ASP_FC, self.OPI_FC, self.THA, self.STN] if params.optimizer == 'sgd': self.optimizer = dy.SimpleSGDTrainer(self.pc, params.sgd_lr) elif params.optimizer == 'momentum': self.optimizer = dy.MomentumSGDTrainer(self.pc, 0.01, 0.9) elif params.optimizer == 'adam': self.optimizer = dy.AdamTrainer(self.pc, 0.001, 0.9, 0.9) elif params.optimizer == 'adagrad': self.optimizer = dy.AdagradTrainer(self.pc) elif params.optimizer == 'adadelta': # use default value of adadelta self.optimizer = dy.AdadeltaTrainer(self.pc) else: raise Exception("Invalid optimizer!!")
def __init__(self, params, vocab, embeddings): """ :param params: parameters :param vocab: vocabulary :param embeddings: pretrained word embeddings """ self.params = params self.name = 'lstm_crf' self.dim_char = params.dim_char self.dim_w = params.dim_w self.dim_char_h = params.dim_char_h self.dim_ote_h = params.dim_ote_h self.dim_ts_h = params.dim_ts_h self.input_win = params.input_win self.ds_name = params.ds_name # tag vocabulary of opinion target extraction and targeted sentiment self.ote_tag_vocab = params.ote_tag_vocab self.ts_tag_vocab = params.ts_tag_vocab self.dim_ote_y = len(self.ote_tag_vocab) self.dim_ts_y = len(self.ts_tag_vocab) self.n_epoch = params.n_epoch self.dropout_rate = params.dropout self.tagging_schema = params.tagging_schema self.clip_grad = params.clip_grad self.use_char = params.use_char # name of word embeddings self.emb_name = params.emb_name self.embeddings = embeddings self.vocab = vocab # character vocabulary self.char_vocab = params.char_vocab self.pc = dy.ParameterCollection() # word embedding layer self.emb = WDEmb(pc=self.pc, n_words=len(vocab), dim_w=self.dim_w, pretrained_embeddings=embeddings) # input dimension dim_input = self.input_win * self.dim_w self.lstm_ts = dy.LSTMBuilder(1, dim_input, self.dim_ts_h, self.pc) # hidden layer between LSTM and CRF decoding layer self.hidden = Linear(pc=self.pc, n_in=2 * self.dim_ts_h, n_out=self.dim_ts_h, use_bias=True, nonlinear='tanh') # map the word representation to the ts label space # in the label space, both BEG and END tag are considered self.fc_ts = Linear(pc=self.pc, n_in=self.dim_ts_h, n_out=self.dim_ts_y) # transition matrix, [i, j] is the transition score from tag i to tag j self.transitions = self.pc.add_lookup_parameters( (self.dim_ts_y + 2, self.dim_ts_y + 2)) # determine the optimizer if params.optimizer == 'sgd': self.optimizer = dy.SimpleSGDTrainer(self.pc, params.sgd_lr) elif params.optimizer == 'adam': self.optimizer = dy.AdamTrainer(self.pc, 0.001, 0.9, 0.9) elif params.optimizer == 'adadelta': self.optimizer = dy.AdadeltaTrainer(self.pc) elif params.optimizer == 'momentum': self.optimizer = dy.MomentumSGDTrainer(self.pc, 0.01, 0.9) else: raise Exception("Unsupported optimizer type: %s" % params.optimizer)
def __init__(self, params, vocab, embeddings, char_embeddings): """ :param params: :param vocab: :param embeddings: :param char_embeddings: """ self.params = params self.name = 'lstm_cascade' self.dim_char = params.dim_char self.dim_w = params.dim_w self.dim_char_h = params.dim_char_h self.dim_ote_h = params.dim_ote_h self.dim_ts_h = params.dim_ts_h self.input_win = params.input_win self.ds_name = params.ds_name # tag vocabulary of opinion target extraction and targeted sentiment self.ote_tag_vocab = params.ote_tag_vocab self.ts_tag_vocab = params.ts_tag_vocab self.dim_ote_y = len(self.ote_tag_vocab) self.dim_ts_y = len(self.ts_tag_vocab) self.n_epoch = params.n_epoch self.dropout_rate = params.dropout self.tagging_schema = params.tagging_schema self.clip_grad = params.clip_grad self.use_char = params.use_char # name of word embeddings self.emb_name = params.emb_name self.embeddings = embeddings self.vocab = vocab # character vocabulary self.char_vocab = params.char_vocab #self.td_proportions = params.td_proportions self.epsilon = params.epsilon #self.tc_proportions = params.tc_proportions self.pc = dy.ParameterCollection() if self.use_char: self.char_emb = CharEmb(pc=self.pc, n_chars=len(self.char_vocab), dim_char=self.dim_char, pretrained_embeddings=char_embeddings) self.lstm_char = dy.LSTMBuilder(1, self.dim_char, self.dim_char_h, self.pc) dim_input = self.input_win * self.dim_w + 2 * self.dim_char_h else: dim_input = self.input_win * self.dim_w # word embedding layer self.emb = WDEmb(pc=self.pc, n_words=len(vocab), dim_w=self.dim_w, pretrained_embeddings=embeddings) # lstm layers self.lstm_ote = dy.LSTMBuilder(1, dim_input, self.dim_ote_h, self.pc) self.lstm_ts = dy.LSTMBuilder(1, 2 * self.dim_ote_h, self.dim_ts_h, self.pc) # fully connected layer self.fc_ote = Linear(pc=self.pc, n_in=2 * self.dim_ote_h, n_out=self.dim_ote_y) self.fc_ts = Linear(pc=self.pc, n_in=2 * self.dim_ts_h, n_out=self.dim_ts_y) assert self.tagging_schema == 'BIEOS' transition_path = { 'B': ['B-POS', 'B-NEG', 'B-NEU'], 'I': ['I-POS', 'I-NEG', 'I-NEU'], 'E': ['E-POS', 'E-NEG', 'E-NEU'], 'S': ['S-POS', 'S-NEG', 'S-NEU'], 'O': ['O'] } self.transition_scores = np.zeros((self.dim_ote_y, self.dim_ts_y)) for t in transition_path: next_tags = transition_path[t] n_next_tag = len(next_tags) ote_id = self.ote_tag_vocab[t] for nt in next_tags: ts_id = self.ts_tag_vocab[nt] self.transition_scores[ote_id][ts_id] = 1.0 / n_next_tag print(self.transition_scores) self.transition_scores = np.array(self.transition_scores, dtype='float32').transpose() # opinion target-opinion words co-occurrence modeling self.stm_lm = Linear(pc=self.pc, n_in=2 * self.dim_ote_h, n_out=2 * self.dim_ote_h, nonlinear='tanh') # fully connected layer for opinion-enhanced indicator prediction task self.fc_stm = Linear(pc=self.pc, n_in=2 * self.dim_ote_h, n_out=2) # gate for maintaining sentiment consistency self.W_gate = self.pc.add_parameters( (2 * self.dim_ote_h, 2 * self.dim_ote_h), init=dy.UniformInitializer(0.2)) # determine the optimizer if params.optimizer == 'sgd': self.optimizer = dy.SimpleSGDTrainer(self.pc, params.sgd_lr) elif params.optimizer == 'adam': self.optimizer = dy.AdamTrainer(self.pc, 0.001, 0.9, 0.9) elif params.optimizer == 'adadelta': self.optimizer = dy.AdadeltaTrainer(self.pc) elif params.optimizer == 'momentum': self.optimizer = dy.MomentumSGDTrainer(self.pc, 0.01, 0.9) else: raise Exception("Unsupported optimizer type: %s" % params.optimizer)
def __init__(self, eps=1e-6, rho=0.95):
    self.optimizer = dy.AdadeltaTrainer(ParamManager.global_collection(), eps, rho)
def main(args: argparse.Namespace): dargs = args.__dict__ for key, value in dargs.items(): logging.info("%s: %s", str(key).ljust(15), value) os.makedirs(args.output) if args.nfd: logging.info("Will perform training on NFD-normalized data.") else: logging.info("Will perform training on unnormalized data.") vocabulary_ = vocabulary.Vocabularies() training_data = [] with utils.OpenNormalize(args.train, args.nfd) as f: for line in f: input_, target = line.rstrip().split("\t", 1) encoded_input = vocabulary_.encode_input(input_) vocabulary_.encode_actions(target) sample = utils.Sample(input_, target, encoded_input) training_data.append(sample) logging.info("%d actions: %s", len(vocabulary_.actions), vocabulary_.actions) logging.info("%d chars: %s", len(vocabulary_.characters), vocabulary_.characters) vocabulary_path = os.path.join(args.output, "vocabulary.pkl") vocabulary_.persist(vocabulary_path) logging.info("Wrote vocabulary to %s.", vocabulary_path) development_data = [] with utils.OpenNormalize(args.dev, args.nfd) as f: for line in f: input_, target = line.rstrip().split("\t", 1) encoded_input = vocabulary_.encode_unseen_input(input_) sample = utils.Sample(input_, target, encoded_input) development_data.append(sample) if args.test is not None: test_data = [] with utils.OpenNormalize(args.test, args.nfd) as f: for line in f: input_, *optional_target = line.rstrip().split("\t", 1) target = optional_target[0] if optional_target else None encoded_input = vocabulary_.encode_unseen_input(input_) sample = utils.Sample(input_, target, encoded_input) test_data.append(sample) sed_parameters_path = os.path.join(args.output, "sed.pkl") sed_aligner = sed.StochasticEditDistance.fit_from_data( training_data, em_iterations=args.sed_em_iterations, output_path=sed_parameters_path, ) expert = optimal_expert_substitutions.OptimalSubstitutionExpert( sed_aligner) model = dy.Model() transducer_ = transducer.Transducer(model, vocabulary_, expert, **dargs) widgets = [progressbar.Bar(">"), " ", progressbar.ETA()] train_progress_bar = progressbar.ProgressBar(widgets=widgets, maxval=args.epochs).start() train_log_path = os.path.join(args.output, "train.log") best_model_path = os.path.join(args.output, "best.model") with open(train_log_path, "w") as w: w.write("epoch\tavg_loss\ttrain_accuracy\tdev_accuracy\n") trainer = dy.AdadeltaTrainer(model) train_subset = training_data[:100] rollin_schedule = inverse_sigmoid_schedule(args.k) max_patience = args.patience batch_size = args.batch_size logging.info( "Training for a maximum of %d with a maximum patience of %d.", args.epochs, max_patience, ) logging.info( "Number of train batches: %d.", math.ceil(len(training_data) / batch_size), ) best_train_accuracy = 0 best_dev_accuracy = 0 best_epoch = 0 patience = 0 for epoch in range(args.epochs): logging.info("Training...") with utils.Timer(): train_loss = 0.0 random.shuffle(training_data) batches = [ training_data[i:i + batch_size] for i in range(0, len(training_data), batch_size) ] rollin = rollin_schedule(epoch) j = 0 for j, batch in enumerate(batches): losses = [] dy.renew_cg() for sample in batch: output = transducer_.transduce( input_=sample.input, encoded_input=sample.encoded_input, target=sample.target, rollin=rollin, external_cg=True, ) losses.extend(output.losses) batch_loss = -dy.average(losses) train_loss += batch_loss.scalar_value() batch_loss.backward() trainer.update() if j > 0 and j % 100 == 0: logging.info("\t\t...%d batches", j) logging.info("\t\t...%d batches", j + 1) avg_loss = train_loss / len(batches) 
logging.info("Average train loss: %.4f.", avg_loss) logging.info("Evaluating on training data subset...") with utils.Timer(): train_accuracy = decode(transducer_, train_subset).accuracy if train_accuracy > best_train_accuracy: best_train_accuracy = train_accuracy patience += 1 logging.info("Evaluating on development data...") with utils.Timer(): decoding_output = decode(transducer_, development_data) dev_accuracy = decoding_output.accuracy avg_dev_loss = decoding_output.loss if dev_accuracy > best_dev_accuracy: best_dev_accuracy = dev_accuracy best_epoch = epoch patience = 0 logging.info("Found best dev accuracy %.4f.", best_dev_accuracy) model.save(best_model_path) logging.info("Saved new best model to %s.", best_model_path) logging.info( f"Epoch {epoch} / {args.epochs - 1}: train loss: {avg_loss:.4f} " f"dev loss: {avg_dev_loss:.4f} train acc: {train_accuracy:.4f} " f"dev acc: {dev_accuracy:.4f} best train acc: {best_train_accuracy:.4f} " f"best dev acc: {best_dev_accuracy:.4f} best epoch: {best_epoch} " f"patience: {patience} / {max_patience - 1}") log_line = f"{epoch}\t{avg_loss:.4f}\t{train_accuracy:.4f}\t{dev_accuracy:.4f}\n" with open(train_log_path, "a") as a: a.write(log_line) if patience == max_patience: logging.info("Out of patience after %d epochs.", epoch + 1) train_progress_bar.finish() break train_progress_bar.update(epoch) logging.info("Finished training.") if not os.path.exists(best_model_path): sys.exit(0) model = dy.Model() transducer_ = transducer.Transducer(model, vocabulary_, expert, **dargs) model.populate(best_model_path) evaluations = [(development_data, "dev")] if args.test is not None: evaluations.append((test_data, "test")) for data, dataset_name in evaluations: logging.info( "Evaluating best model on %s data using beam search " "(beam width %d)...", dataset_name, args.beam_width, ) with utils.Timer(): greedy_decoding = decode(transducer_, data) utils.write_results( greedy_decoding.accuracy, greedy_decoding.predictions, args.output, args.nfd, dataset_name, dargs=dargs, ) with utils.Timer(): beam_decoding = decode(transducer_, data, args.beam_width) utils.write_results( beam_decoding.accuracy, beam_decoding.predictions, args.output, args.nfd, dataset_name, args.beam_width, dargs=dargs, )
def __init__( self, word_count, tag_count, word_dims, tag_dims, lstm_units, hidden_units, struct_out, label_out, droprate=0, struct_spans=4, label_spans=3, ): self.word_count = word_count self.tag_count = tag_count self.word_dims = word_dims self.tag_dims = tag_dims self.lstm_units = lstm_units self.hidden_units = hidden_units self.struct_out = struct_out self.label_out = label_out self.droprate = droprate self.model = dynet.Model() self.trainer = dynet.AdadeltaTrainer(self.model, eps=1e-7, rho=0.99) random.seed(1) self.activation = dynet.rectify self.word_embed = self.model.add_lookup_parameters( (word_count, word_dims), ) self.tag_embed = self.model.add_lookup_parameters( (tag_count, tag_dims), ) self.fwd_lstm1 = LSTM(word_dims + tag_dims, lstm_units, self.model) self.back_lstm1 = LSTM(word_dims + tag_dims, lstm_units, self.model) self.fwd_lstm2 = LSTM(2 * lstm_units, lstm_units, self.model) self.back_lstm2 = LSTM(2 * lstm_units, lstm_units, self.model) self.struct_hidden_W = self.model.add_parameters( (hidden_units, 4 * struct_spans * lstm_units), dynet.UniformInitializer(0.01), ) self.struct_hidden_b = self.model.add_parameters( (hidden_units, ), dynet.ConstInitializer(0), ) self.struct_output_W = self.model.add_parameters( (struct_out, hidden_units), dynet.ConstInitializer(0), ) self.struct_output_b = self.model.add_parameters( (struct_out, ), dynet.ConstInitializer(0), ) self.label_hidden_W = self.model.add_parameters( (hidden_units, 4 * label_spans * lstm_units), dynet.UniformInitializer(0.01), ) self.label_hidden_b = self.model.add_parameters( (hidden_units, ), dynet.ConstInitializer(0), ) self.label_output_W = self.model.add_parameters( (label_out, hidden_units), dynet.ConstInitializer(0), ) self.label_output_b = self.model.add_parameters( (label_out, ), dynet.ConstInitializer(0), )
def __init__(self, yaml_context, eps=1e-6, rho=0.95):
    self.optimizer = dy.AdadeltaTrainer(
        yaml_context.dynet_param_collection.param_col, eps, rho)
def char_train(network, train_set, val_set, test_set, test_set2, train_set_word, val_set_word, test_set_word, test_set2_word, epochs, batch_size, args, tag_to_ix): def get_val_set_loss(network, val_set, val_set_word, val_author_vecs, pretrain, num_basis): loss = [] vae_loss = [0] l2_loss = [0] for i, (input_sentence, output_sentence) in enumerate(val_set): if args.use_vae: l, a, v, l2 = network.get_full_loss(input_sentence, val_set_word[i][0], output_sentence, val_author_vecs[i], pretrain) loss.append(l.value()) vae_loss.append(v.value()) l2_loss.append(l2.value()) else: loss.append( network.get_loss(input_sentence, val_set_word[i][0], output_sentence, val_author_vecs[i], pretrain).value()) dy.renew_cg() return sum(loss) / len(val_set), sum(vae_loss) / len(val_set), sum( l2_loss) / len(val_set) def get_val_set_acc(network, val_set, val_set_word, val_author_vecs, val_author_ids, pretrain, num_basis): evals = [] if args.use_vae: for i, (input_sentence, output_sentence) in enumerate(val_set): evals.append( network.full_evaluate_acc(input_sentence, val_set_word[i][0], output_sentence, val_author_vecs[i], val_author_ids[i], pretrain)) dy.renew_cg() else: for i, (input_sentence, output_sentence) in enumerate(val_set): evals.append( network.evaluate_acc(input_sentence, val_set_word[i][0], output_sentence, val_author_vecs[i], val_author_ids[i], pretrain)) dy.renew_cg() dy.renew_cg() correct = [c for c, t, d, w, cc, e in evals] total = [t for c, t, d, w, cc, e in evals] mean = 0 confidence = 0 oov = [d for c, t, d, w, cc, e in evals] wrong = [w for c, t, d, w, cc, e in evals] correct2 = [cc for c, t, d, w, cc, e in evals] auth_correct = [ c for i, (c, t, d, w, cc, e) in enumerate(evals) if val_author_vecs[i] is not None ] auth_total = [ t for i, (c, t, d, w, cc, e) in enumerate(evals) if val_author_vecs[i] is not None ] non_auth_correct = [ c for i, (c, t, d, w, cc, e) in enumerate(evals) if val_author_vecs[i] is None ] non_auth_total = [ t for i, (c, t, d, w, cc, e) in enumerate(evals) if val_author_vecs[i] is None ] eids = [e for c, t, d, w, cc, e in evals] #unique_eid = set(eids) len_eid = num_basis counts = [] for i in range(len_eid): counts.append(sum([e == i for e in eids])) counts2 = [] for i in range(len_eid): counts2.append( sum([ e == i for j, e in enumerate(eids) if val_author_vecs[j] is not None ])) if sum(non_auth_total) == 0: non_auth_total = [1] return 100.0 * sum(correct) / sum(total), mean, confidence, sum( oov), sum(wrong), sum(correct2), 100.0 * sum(auth_correct) / sum( auth_total), 100.0 * sum(non_auth_correct) / sum( non_auth_total), counts, counts2 #original_set = train_set #train_set = train_set*epochs if args.optimizer == 'adadelta': trainer = dy.AdadeltaTrainer(network.model) trainer.set_clip_threshold(5) elif args.optimizer == 'adam': trainer = dy.AdamTrainer(network.model, alpha=args.lr) trainer.set_clip_threshold(5) elif args.optimizer == 'sgd-momentum': trainer = dy.MomentumSGDTrainer(network.model, learning_rate=args.lr) else: logging.critical('This Optimizer is not valid or not allowed') losses = [] iterations = [] kk = args.pretrain_epochs if args.use_all_networks: args.network = 'follow' train_author_vecs1, dev_author_vecs1, test_author_vecs1, test2_author_vecs1, train_author_ids, dev_author_ids, test_author_ids, test2_author_ids = extract_authorvecs( args) args.network = 'mention' train_author_vecs2, dev_author_vecs2, test_author_vecs2, test2_author_vecs2, _, _, _, _ = extract_authorvecs( args) args.network = 'retweet' train_author_vecs3, dev_author_vecs3, 
test_author_vecs3, test2_author_vecs3, _, _, _, _ = extract_authorvecs( args) train_author_vecs = [] for i, j, k in zip(train_author_vecs1, train_author_vecs2, train_author_vecs3): train_author_vecs.append((i, j, k)) dev_author_vecs = [] for i, j, k in zip(dev_author_vecs1, dev_author_vecs2, dev_author_vecs3): dev_author_vecs.append((i, j, k)) test_author_vecs = [] for i, j, k in zip(test_author_vecs1, test_author_vecs2, test_author_vecs3): test_author_vecs.append((i, j, k)) test2_author_vecs = [] for i, j, k in zip(test2_author_vecs1, test2_author_vecs2, test2_author_vecs3): test2_author_vecs.append((i, j, k)) else: train_author_vecs, dev_author_vecs, test_author_vecs, test2_author_vecs, train_author_ids, dev_author_ids, test_author_ids, test2_author_ids = extract_authorvecs( args) logging.info('obtained all author vectors ' + str(len(train_author_vecs)) + ' ' + str(len(dev_author_vecs)) + ' ' + str(len(test_author_vecs)) + ' ' + str(len(test2_author_vecs))) batch_loss_vec = [] dy.renew_cg() is_best = 0 best_val = 0 count = 0 count_train = -1 #early_stopping = 0 for epoch in range(epochs): #if early_stopping>args.early_epochs: # break all_inds = [] num_train = int(len(train_set) / args.batch_size + 1) * args.batch_size #prev_time=time.time() for ii in range(num_train): count_train += 1 if count_train == len(train_set): count_train = 0 count += 1 inputs, outputs = train_set[count_train] inputs_word, _ = train_set_word[count_train] ''' data_point = {'inputs':inputs, 'inputs_word':inputs_word, 'outputs':outputs, 'train_author_vecs':train_author_vecs[i]} pickle.dump(data_point,open( "data_pickle/"+str(i)+".p", "wb" )) data_point = pickle.load( open( "data_pickle/"+str(i)+".p", "rb" ) ) inputs = data_point['inputs'] inputs_word = data_point['inputs_word'] outputs = data_point['outputs'] train_author_vec = data_point['train_author_vecs'] ''' #prev_time2 = time.time() #if train_author_vecs[count_train] !=None: vae_loss = 0 if args.use_vae: loss, ind, vae_loss, l2_loss = network.get_full_loss( inputs, inputs_word, outputs, train_author_vecs[count_train], epoch < kk, True) else: loss, ind = network.get_loss(inputs, inputs_word, outputs, train_author_vecs[count_train], epoch < kk, True) #curr_time2 = time.time() #print ('time for one instance: ', curr_time2 - prev_time2) all_inds.append(ind) #print (loss) #a = input() batch_loss_vec.append(loss) if count % batch_size == 0: batch_loss = dy.esum(batch_loss_vec) / batch_size batch_loss.forward() batch_loss.backward() trainer.update() batch_loss_vec = [] dy.renew_cg() count = 0 #logging.info('finished minibatch: %d/%d',ii,num_train) #print ('until here-----') #curr_time = time.time() #print ('time for one epoch training: ', curr_time - prev_time) counts = [] for i in range(args.num_basis): a = [v == i for v in all_inds] counts.append(sum(a)) logging.info('distribution of the data points' + str(counts)) #if ((i+1))%len(original_set) == 0: if args.plots: val_loss = get_val_set_loss(network, val_set, val_set_word, dev_author_vecs, epoch < kk, args.num_basis) losses.append(val_loss) iterations.append(epoch) #dy.renew_cg() #if ((i+1))%len(original_set)==0: train_loss = 0 if args.slow: train_loss, train_vae_loss, train_l2_loss = get_val_set_loss( network, train_set, train_set_word, train_author_vecs, epoch < kk, args.num_basis) if args.write_errors: f = open(args.log_errors_file, 'a') f.write('\n--------- epoch no: --------- ') f.write(str(epoch) + '\n') f.close() f = open(args.log_errors_file, 'a') f.write('\n--------- oct27.train errors: --------- \n') 
f.close() #prev_time = time.time() trainacc, train_acc, train_confidence, oov_train, wrong_train, correct_train, auth_acc1, non_auth_acc1, eids1, counts21 = get_val_set_acc( network, train_set, train_set_word, train_author_vecs, train_author_ids, epoch < kk, args.num_basis) #curr_time = time.time() #print ('time for acc train: ', curr_time - prev_time) if args.write_errors: f = open(args.log_errors_file, 'a') f.write('\n--------- oct27.dev errors: ---------\n') f.close() val_loss, val_vae_loss, val_l2_loss = 0, 0, 0 val_acc, oov_val, wrong_val, correct_val = 0, 0, 0, 0 if args.slow: pass #val_loss,val_vae_loss = get_val_set_loss(network, val_set, val_set_word, dev_author_vecs,epoch<kk, args.num_basis) #prev_time = time.time() valacc, val_acc, val_confidence, oov_val, wrong_val, correct_val, auth_acc2, non_auth_acc2, eids2, counts22 = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 #valacc, val_acc, val_confidence, oov_val, wrong_val, correct_val, auth_acc2, non_auth_acc2, eids2, counts22 = get_val_set_acc(network, val_set, val_set_word, dev_author_vecs, dev_author_ids, epoch<kk, args.num_basis) #curr_time = time.time() #print ('time for acc val: ', curr_time - prev_time) if args.write_errors: f = open(args.log_errors_file, 'a') f.write('\n--------- oct27.test errors: --------- \n') f.close() test_loss = 0 if args.slow: test_loss, test_vae_loss, test_l2_loss = get_val_set_loss( network, test_set, test_set_word, test_author_vecs, epoch < kk, args.num_basis) #prev_time = time.time() testacc, test_acc, test_confidence, oov_test, wrong_test, correct_test, auth_acc3, non_auth_acc3, eids3, counts23 = get_val_set_acc( network, test_set, test_set_word, test_author_vecs, test_author_ids, epoch < kk, args.num_basis) #curr_time = time.time() #print ('time for acc test: ', curr_time - prev_time) if args.write_errors: f = open(args.log_errors_file, 'a') f.write('\n--------- daily547.test errors: --------- \n') f.close() test_loss2 = 0 if args.slow: test_loss2, test_vae_loss2, test2_l2_loss = get_val_set_loss( network, test_set2, test_set2_word, test2_author_vecs, epoch < kk, args.num_basis) #prev_time = time.time() testacc2, test_acc2, test2_confidence, oov_test2, wrong_test2, correct_test2, auth_acc4, non_auth_acc4, eids4, counts24 = get_val_set_acc( network, test_set2, test_set2_word, test2_author_vecs, test2_author_ids, epoch < kk, args.num_basis) #curr_time = time.time() #print ('time for acc test2: ', curr_time - prev_time) #test_loss2 = get_val_set_loss(network, test_set2, test_set2_word, test_author_vecs, epoch<kk) #test_acc2, oov_test2, wrong_test2, correct_test2, auth_acc4, non_auth_acc4, eids4 = get_val_set_acc(network, test_set2, test_set2_word, test_author_vecs,epoch<kk) #prev_time = time.time() logging.info('epoch %d done', epoch) logging.info( 'train loss: %f, train vae loss: %f, train l2 loss: %f, train acc: %f', train_loss, train_vae_loss, train_l2_loss, trainacc) logging.info( 'val loss: %f, val vae loss: %f, val l2 loss: %f, val acc: %f', val_loss, val_vae_loss, val_l2_loss, valacc) logging.info( 'test loss: %f, test vae loss: %f, test l2 loss: %f, test acc: %f', test_loss, test_vae_loss, test_l2_loss, testacc) logging.info( 'test2 loss: %f, tes2 vae loss: %f, tes2 l2 loss: %f, test2 acc: %f', test_loss2, test_vae_loss2, test2_l2_loss, testacc2) logging.info( ' oov_train: %d/%d, %d, oov_val: %d/%d, %d, oov_test: %d/%d, %d, oov_test2: %d/%d, %d', oov_train, wrong_train, correct_train, oov_val, wrong_val, correct_val, oov_test, wrong_test, correct_test, oov_test2, wrong_test2, correct_test2) logging.info( 
'train: author_acc: %f, non_author_acc: %f, ' + str(eids1) + ' ' + str(counts21), auth_acc1, non_auth_acc1) logging.info( 'dev: author_acc: %f, non_author_acc: %f, ' + str(eids2) + ' ' + str(counts22), auth_acc2, non_auth_acc2) logging.info( 'test: author_acc: %f, non_author_acc: %f, ' + str(eids3) + ' ' + str(counts23), auth_acc3, non_auth_acc3) logging.info( 'test2: author_acc: %f, non_author_acc: %f, ' + str(eids4) + ' ' + str(counts24), auth_acc4, non_auth_acc4) if args.plots: test_acc, test_confidence, confusion_matrix, auth_acc, non_auth_acc, eids = get_val_set_acc2( network, test_set, test_set_word, test_author_vecs, epoch < kk, args.num_basis) df_cm = pd.DataFrame(confusion_matrix, index=[i for i in tag_to_ix.keys()], columns=[i for i in tag_to_ix.keys()]) fig = plt.figure(figsize=(10, 7)) sn.heatmap(df_cm, annot=True) fig.savefig('figs/conf_matrix_' + str(epoch) + '.png') #a = input() if args.combine_train_dev: valacc = testacc elif args.combine_train_dev_test: valacc = testacc2 else: valacc = valacc m = network.model if epoch == 0: best_acc = valacc best_epoch = 0 #best_val = val_loss #if args.combine_train_dev: # best_acc = testacc #else: # best_acc = valacc if args.save_model: m.save(args.save_model) logging.info('saving best model') else: #if args.combine_train_dev: # valacc = testacc # #if best_acc < valacc: # early_stopping = 0 # if args.combine_train_dev: # best_acc = testacc # else: # best_acc = valacc if best_acc <= valacc: best_acc = valacc best_epoch = epoch if args.save_model: m.save(args.save_model) logging.info('re-saving best model') #else: # early_stopping+=1 logging.info('best model is at epoch no: %d', best_epoch) logging.info('\nbest model details are at epoch no: %d', best_epoch) #curr_time = time.time() #print ('time for rest junk: ', curr_time - prev_time) ''' if count%batch_size!=0: batch_loss = dy.esum(batch_loss_vec)/len(batch_loss_vec) batch_loss.forward() batch_loss.backward() trainer.update() batch_loss_vec=[] dy.renew_cg() ''' if args.plots: fig = plt.figure() plt.plot(iterations, losses) axes = plt.gca() axes.set_xlim([0, epochs]) axes.set_ylim([0, 10000]) fig.savefig('figs/loss_plot.png')
def train(network, train_data, dev_data, test_data, args):
    def get_val_set_acc(network, dev_data):
        evals = [
            network.evaluate(input_sentences, labels)
            for i, (input_sentences, labels) in enumerate(dev_data)
        ]
        dy.renew_cg()
        loss = [l for l, p, c, t in evals]
        correct = [c for l, p, c, t in evals]
        total = [t for l, p, c, t in evals]
        return 100.0 * sum(correct) / sum(total), sum(loss) / len(dev_data)

    if args.optimizer == 'adadelta':
        trainer = dy.AdadeltaTrainer(network.model)
        trainer.set_clip_threshold(5)
    elif args.optimizer == 'adam':
        trainer = dy.AdamTrainer(network.model, alpha=args.lr)
        trainer.set_clip_threshold(5)
    elif args.optimizer == 'sgd-momentum':
        trainer = dy.MomentumSGDTrainer(network.model, learning_rate=args.lr)
    else:
        logging.critical('This Optimizer is not valid or not allowed')

    losses = []
    iterations = []
    batch_loss_vec = []
    dy.renew_cg()
    is_best = 0
    best_val = 0
    count = 0
    count_train = -1

    for epoch in range(args.epochs):
        num_train = int(len(train_data) / args.batch_size + 1) * args.batch_size
        for ii in range(num_train):
            count_train += 1
            if count_train == len(train_data):
                count_train = 0
            count += 1
            inputs, outputs = train_data[count_train]
            loss, pred_labels, correct, total = network.get_loss(inputs, outputs)
            batch_loss_vec.append(loss)
            if count % args.batch_size == 0:
                batch_loss = dy.esum(batch_loss_vec) / args.batch_size
                batch_loss.forward()
                batch_loss.backward()
                trainer.update()
                batch_loss_vec = []
                dy.renew_cg()

        dev_acc, dev_loss = get_val_set_acc(network, dev_data)
        losses.append(dev_loss)
        iterations.append(epoch)
        test_acc, test_loss = get_val_set_acc(network, test_data)
        logging.info(
            'epoch %d done, dev loss: %f, dev acc: %f, test loss: %f, test acc: %f',
            epoch, dev_loss, dev_acc, test_loss, test_acc)

        m = network.model
        if epoch == 0:
            best_val = dev_loss
            if args.save_model:
                m.save(args.save_model)
                logging.info('saving best model')
        else:
            if dev_loss < best_val:
                best_val = dev_loss
                if args.save_model:
                    m.save(args.save_model)
                    logging.info('re-saving best model')

    if count % args.batch_size != 0:
        batch_loss = dy.esum(batch_loss_vec) / len(batch_loss_vec)
        batch_loss.forward()
        batch_loss.backward()
        trainer.update()
        batch_loss_vec = []
        dy.renew_cg()

    if args.plots:
        fig = plt.figure()
        plt.plot(iterations, losses)
        axes = plt.gca()
        axes.set_xlim([0, args.epochs])
        axes.set_ylim([0, 10000])
        fig.savefig('figs/loss_plot.png')
else:
    args.rnn = dynet.SimpleRNNBuilder

BEGIN_TOKEN = '<s>'
END_TOKEN = '<e>'

# define model and obtain vocabulary
# (reload vocab files if loading a saved model, or create new vocab files for a new model)
model = dynet.Model()
if not args.trainer or args.trainer == "simple_sgd":
    trainer = dynet.SimpleSGDTrainer(model)
elif args.trainer == "momentum_sgd":
    trainer = dynet.MomentumSGDTrainer(model)
elif args.trainer == "adadelta":
    trainer = dynet.AdadeltaTrainer(model)
elif args.trainer == "adagrad":
    trainer = dynet.AdagradTrainer(model)
elif args.trainer == "adam":
    trainer = dynet.AdamTrainer(model)
else:
    raise Exception("Trainer not recognized! Please use one of {simple_sgd, momentum_sgd, adadelta, adagrad, adam}")
trainer.set_clip_threshold(-1.0)
trainer.set_sparse_updates(True)

# load corpus
print "Loading corpus..."
train_data = list(util.get_reader(args.reader_mode)(args.train,
                                                    mode=args.reader_mode,
                                                    begin=BEGIN_TOKEN,
                                                    end=END_TOKEN))
if args.valid:
def train_model(model, embeddings_lookup, hidden_W, hidden_bias, MLP_W, MLP_bias, encoder_lstm, train_sents, train_labels, dev_sents, dev_labels, word2int): print 'training...' aggregated_loss = 0 trainer = dy.AdadeltaTrainer(model) train_len = len(train_sents) patience = 10 best_dev = 0 avg_loss = 0 for e in xrange(EPOCHS): start = time.time() print 'starting epoch {}'.format(e) # randomize the training set indices = range(train_len) random.shuffle(indices) train_set = zip(train_sents, train_labels) shuffled_train_set = [train_set[i] for i in indices] # compute loss for each example and update for i, example in enumerate(shuffled_train_set): sent, label = example loss = one_sent_loss(model, embeddings_lookup, hidden_W, hidden_bias, MLP_W, MLP_bias, encoder_lstm, sent, label, word2int) loss_value = loss.value() aggregated_loss += loss_value loss.backward() trainer.update() if i > 0: avg_loss = aggregated_loss / float(i + e * train_len) else: avg_loss = aggregated_loss if i % 10000 == 0: print 'epoch: {} avg. loss: {} went through {} examples'.format( e, avg_loss, i) # evaluate on dev after each epoch: dev_score = evaluate_model(model, embeddings_lookup, hidden_W, hidden_bias, MLP_W, MLP_bias, encoder_lstm, dev_sents, dev_labels, word2int) if dev_score < best_dev: patience += 1 else: patience = 0 best_dev = dev_score model.save('best_model.txt') print 'epoch: {} avg. loss: {} dev acc.: {} best dev acc.:{}'.format( e, avg_loss, dev_score, best_dev) end = time.time() print 'epoch took {} seconds'.format(end - start) if patience > 10: return
def train_model(model, encoder, decoder, params, train_inputs, train_outputs, dev_inputs, dev_outputs, y2int, int2y, epochs, optimization, results_file_path, plot, batch_size, eval_after): print 'training...' np.random.seed(17) random.seed(17) # sort training sentences by length in descending order train_data = zip(train_inputs, train_outputs) train_data.sort(key=lambda t: -len(t[0])) train_order = [ x * batch_size for x in range(len(train_data) / batch_size + 1) ] # sort dev sentences by length in descending order dev_batch_size = 1 dev_data = zip(dev_inputs, dev_outputs) dev_data.sort(key=lambda t: -len(t[0])) dev_order = [ x * dev_batch_size for x in range(len(dev_data) / dev_batch_size + 1) ] if optimization == 'ADAM': trainer = dn.AdamTrainer( model ) # lam=REGULARIZATION, alpha=LEARNING_RATE, beta_1=0.9, beta_2=0.999, eps=1e-8) elif optimization == 'MOMENTUM': trainer = dn.MomentumSGDTrainer(model) elif optimization == 'SGD': trainer = dn.SimpleSGDTrainer(model) elif optimization == 'ADAGRAD': trainer = dn.AdagradTrainer(model) elif optimization == 'ADADELTA': trainer = dn.AdadeltaTrainer(model) else: trainer = dn.SimpleSGDTrainer(model) trainer.set_clip_threshold(float(arguments['--grad-clip'])) seen_examples_count = 0 total_loss = 0 best_dev_epoch = 0 best_train_epoch = 0 patience = 0 train_len = len(train_outputs) dev_len = len(dev_inputs) avg_train_loss = -1 train_loss_patience = 0 train_loss_patience_threshold = 99999999 max_patience = int(arguments['--max-patience']) log_path = results_file_path + '_log.txt' start_epoch, checkpoints_x, train_loss_y, dev_loss_y, dev_accuracy_y = read_from_log( log_path) if len(train_loss_y) > 0: total_batches = checkpoints_x[-1] best_avg_train_loss = max(train_loss_y) best_dev_accuracy = max(dev_accuracy_y) best_dev_loss = max(dev_loss_y) else: total_batches = 0 best_avg_train_loss = 999999 best_dev_loss = 999999 best_dev_accuracy = 0 # progress bar init # noinspection PyArgumentList widgets = [progressbar.Bar('>'), ' ', progressbar.ETA()] train_progress_bar = progressbar.ProgressBar(widgets=widgets, maxval=epochs).start() for e in xrange(start_epoch, epochs): # shuffle the batch start indices in each epoch random.shuffle(train_order) batches_per_epoch = len(train_order) start = time.time() # go through batches for i, batch_start_index in enumerate(train_order, start=1): total_batches += 1 # get batch examples batch_inputs = [ x[0] for x in train_data[batch_start_index:batch_start_index + batch_size] ] batch_outputs = [ x[1] for x in train_data[batch_start_index:batch_start_index + batch_size] ] actual_batch_size = len(batch_inputs) # skip empty batches if actual_batch_size == 0 or len(batch_inputs[0]) == 0: continue # compute batch loss loss = compute_batch_loss(encoder, decoder, batch_inputs, batch_outputs, y2int) # forward pass total_loss += loss.scalar_value() loss.backward() # update parameters trainer.update() seen_examples_count += actual_batch_size # avg loss per sample avg_train_loss = total_loss / float(i * batch_size + e * train_len) # start patience counts only after 20 batches if avg_train_loss < best_avg_train_loss and total_batches > 20: best_avg_train_loss = avg_train_loss train_loss_patience = 0 else: train_loss_patience += 1 if train_loss_patience > train_loss_patience_threshold: print 'train loss patience exceeded: {}'.format( train_loss_patience) return model, params, e, best_train_epoch if total_batches % 100 == 0 and total_batches > 0: print 'epoch {}: {} batches out of {} ({} examples out of {}) total: {} batches, {} 
examples. avg \ loss per example: {}'.format(e, i, batches_per_epoch, i * batch_size, train_len, total_batches, total_batches * batch_size, avg_train_loss) # print sentences per second end = time.time() elapsed_seconds = end - start print '{} sentences per second'.format(seen_examples_count / elapsed_seconds) seen_examples_count = 0 start = time.time() # checkpoint if total_batches % eval_after == 0: print 'starting checkpoint evaluation' dev_bleu, dev_loss = checkpoint_eval( encoder, decoder, params, dev_batch_size, dev_data, dev_inputs, dev_len, dev_order, dev_outputs, int2y, y2int, results_file_path=results_file_path) log_to_file(log_path, e, total_batches, avg_train_loss, dev_loss, dev_bleu) save_model(model, results_file_path, total_batches, models_to_save=int(arguments['--models-to-save'])) if dev_bleu >= best_dev_accuracy: best_dev_accuracy = dev_bleu best_dev_epoch = e # save best model to disk save_best_model(model, results_file_path) print 'saved new best model' patience = 0 else: patience += 1 if dev_loss < best_dev_loss: best_dev_loss = dev_loss print 'epoch: {0} train loss: {1:.4f} dev loss: {2:.4f} dev bleu: {3:.4f} \ best dev bleu {4:.4f} (epoch {5}) patience = {6}'.format( e, avg_train_loss, dev_loss, dev_bleu, best_dev_accuracy, best_dev_epoch, patience) if patience == max_patience: print 'out of patience after {0} checkpoints'.format( str(e)) train_progress_bar.finish() if plot: plt.cla() print 'checkpoint patience exceeded' return model, params, e, best_train_epoch # plotting results from checkpoint evaluation if plot: train_loss_y.append(avg_train_loss) checkpoints_x.append(total_batches) dev_accuracy_y.append(dev_bleu) dev_loss_y.append(dev_loss) y_vals = [('train_loss', train_loss_y), ('dev loss', dev_loss_y), ('dev_bleu', dev_accuracy_y)] common.plot_to_file(y_vals, x_name='total batches', x_vals=checkpoints_x, file_path=results_file_path + '_learning_curve.png') # update progress bar after completing epoch train_progress_bar.update(e) # update progress bar after completing training train_progress_bar.finish() if plot: # clear plot when done plt.cla() print 'finished training. average loss: {} best epoch on dev: {} best epoch on train: {}'.format( str(avg_train_loss), best_dev_epoch, best_train_epoch) return model, params, e, best_train_epoch
meta.w2i = {}
for w in wvm.vocab:
    meta.w2i[w] = wvm.vocab[w].index

if args.save_model:
    pickle.dump(meta, open('%s.meta' % args.save_model, 'wb'))
if args.load_model:
    ontoparser = SubsumptionLearning(model=args.load_model)
else:
    ontoparser = SubsumptionLearning(meta=meta)

trainers = {
    'momsgd': dy.MomentumSGDTrainer(ontoparser.model, edecay=0.25),
    'adam': dy.AdamTrainer(ontoparser.model, edecay=0.25),
    'simsgd': dy.SimpleSGDTrainer(ontoparser.model, edecay=0.25),
    'adagrad': dy.AdagradTrainer(ontoparser.model, edecay=0.25),
    'adadelta': dy.AdadeltaTrainer(ontoparser.model, edecay=0.25)
}
trainer = trainers[args.trainer]

nntraining(train_sents)

if args.dev:
    accuracy = Test(inputGenDev)
    sys.stdout.write("Accuracy: {}%\n".format(accuracy))

if args.isDaemon and args.daemonPort:
    sys.stderr.write('Listening at port %d\n' % args.daemonPort)
    host = "0.0.0.0"        # Listen on all interfaces
    port = args.daemonPort  # Port number
    tcpsock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    tcpsock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
plt.legend(handles=[blue_patch])

with open(os.path.join('..\processed', 'train_ix.pkl'), 'rb') as f:
    train_ix = pickle.load(f)

if USE_UNLABELED:
    with open(os.path.join('..\processed', 'unlab_ix.pkl'), 'rb') as f:
        train_ix.extend(pickle.load(f))

with open(os.path.join('..\processed', 'valid_ix.pkl'), 'rb') as f:
    valid_ix = pickle.load(f)

# initialize dynet parameters and learning algorithm
params = dy.ParameterCollection()
trainer = dy.AdadeltaTrainer(params)
lm = SimpleNLM(params, vocab_size=VOCAB_SIZE, hidden_dim=HIDDEN_DIM)

train_batches = make_batches(train_ix, batch_size=BATCH_SIZE)
valid_batches = make_batches(valid_ix, batch_size=BATCH_SIZE)

n_train_words = sum(len(sent) for _, sent in train_ix)
n_valid_words = sum(len(sent) for _, sent in valid_ix)

for it in range(MAX_EPOCHS):
    tic = clock()

    # iterate over all training batches, accumulate loss.
    total_loss = 0
    for batch in train_batches:
        dy.renew_cg()
def __init__(self, args, vocabLengthSource, vocabLengthActionRule, vocabLengthNodes, vocabLengthTarget): self.flag_copy = True self.vocabLengthSource = vocabLengthSource self.vocabLengthActionRule = vocabLengthActionRule self.vocabLengthNodes = vocabLengthNodes self.vocabLengthTarget = vocabLengthTarget # parameters for the model self.numLayer = args.numLayer self.embeddingSourceSize = args.embeddingSourceSize self.embeddingApplySize = args.embeddingApplySize self.embeddingGenSize = args.embeddingGenSize self.embeddingNodeSize = args.embeddingNodeSize self.hiddenSize = args.hiddenSize self.attSize = args.attSize self.pointerSize = args.pointerSize self.dropout = args.dropout self.embeddingRuletypeSize = 2 self.learningRate= args.learningRate self.model = dy.ParameterCollection() #self.trainer = dy.AdamTrainer(self.model, alpha=self.learningRate) self.trainer = dy.AdadeltaTrainer(self.model) # source lookup self.sourceLookup = self.model.add_lookup_parameters((self.vocabLengthSource, self.embeddingSourceSize)) # action embeddging matrix self.actionRuleLookup = self.model.add_lookup_parameters((self.vocabLengthActionRule, self.embeddingApplySize)) # for node type lookup self.nodeTypeLookup = self.model.add_lookup_parameters((self.vocabLengthNodes, self.embeddingNodeSize)) # gor gen type lookup self.gentokenLookup = self.model.add_lookup_parameters((self.vocabLengthTarget, self.embeddingGenSize)) # adding paramteters to the AST Neural Network self.attentionSource = self.model.add_parameters((self.attSize, self.hiddenSize * 2)) self.attentionTarget = self.model.add_parameters((self.attSize, self.numLayer*self.hiddenSize)) self.attentionParameter = self.model.add_parameters((1, self.attSize)) self.w_selection_gen_softmax = self.model.add_parameters((2, self.hiddenSize)) self.w_out_rule = self.model.add_parameters((self.embeddingApplySize, self.hiddenSize)) # should change whe hidden layers increase self.b_out_rule = self.model.add_parameters((self.embeddingApplySize)) self.w_out_vocab = self.model.add_parameters((self.embeddingApplySize, self.hiddenSize + self.hiddenSize * 2)) # should change whe hidden layers increase self.b_out_vocab = self.model.add_parameters((self.embeddingApplySize)) self.w_pointer_hidden = self.model.add_parameters((self.pointerSize, 2*self.hiddenSize + 2*self.hiddenSize + self.hiddenSize)) self.b_pointer_hidden = self.model.add_parameters((self.pointerSize)) self.w_pointer_out = self.model.add_parameters((1, self.pointerSize)) self.b_pointer_out = self.model.add_parameters((1)) # initializing the encoder and decoder self.forward_encoder = dy.VanillaLSTMBuilder(self.numLayer, self.embeddingSourceSize, self.hiddenSize, self.model) self.backward_encoder = dy.VanillaLSTMBuilder(self.numLayer, self.embeddingSourceSize, self.hiddenSize, self.model) # check this # embedding size + (previous action embedding + context vector + node type mebedding + parnnet feeding ) # parent feeding - hidden states of parent action + embedding of parent action self.inputDecoderSize = self.embeddingApplySize + self.hiddenSize * 2 + self.hiddenSize + self.embeddingApplySize + self.embeddingNodeSize self.decoder = dy.VanillaLSTMBuilder(self.numLayer, self.inputDecoderSize, self.hiddenSize, self.model)
def train_model(model, char_lookup, feat_lookup, R, bias, encoder_frnn, encoder_rrnn, decoder_rnn, train_lemmas,
                train_feat_dicts, train_words, dev_lemmas, dev_feat_dicts, dev_words, alphabet_index,
                inverse_alphabet_index, epochs, optimization, results_file_path, train_aligned_pairs,
                dev_aligned_pairs, feat_index, feature_types, plot):
    print 'training...'
    np.random.seed(17)
    random.seed(17)

    if optimization == 'ADAM':
        trainer = pc.AdamTrainer(model, lam=REGULARIZATION, alpha=LEARNING_RATE, beta_1=0.9, beta_2=0.999, eps=1e-8)
    elif optimization == 'MOMENTUM':
        trainer = pc.MomentumSGDTrainer(model)
    elif optimization == 'SGD':
        trainer = pc.SimpleSGDTrainer(model)
    elif optimization == 'ADAGRAD':
        trainer = pc.AdagradTrainer(model)
    elif optimization == 'ADADELTA':
        trainer = pc.AdadeltaTrainer(model)
    else:
        trainer = pc.SimpleSGDTrainer(model)

    total_loss = 0
    best_avg_dev_loss = 999
    best_dev_accuracy = -1
    best_train_accuracy = -1
    patience = 0
    train_len = len(train_words)
    sanity_set_size = 100
    epochs_x = []
    train_loss_y = []
    dev_loss_y = []
    train_accuracy_y = []
    dev_accuracy_y = []
    e = -1

    # progress bar init
    widgets = [progressbar.Bar('>'), ' ', progressbar.ETA()]
    train_progress_bar = progressbar.ProgressBar(widgets=widgets, maxval=epochs).start()
    avg_loss = -1

    for e in xrange(epochs):
        # randomize the training set
        indices = range(train_len)
        random.shuffle(indices)
        train_set = zip(train_lemmas, train_feat_dicts, train_words, train_aligned_pairs)
        train_set = [train_set[i] for i in indices]

        # compute loss for each example and update
        for i, example in enumerate(train_set):
            lemma, feats, word, alignment = example
            loss = one_word_loss(model, char_lookup, feat_lookup, R, bias, encoder_frnn, encoder_rrnn, decoder_rnn,
                                 lemma, feats, word, alphabet_index, alignment, feat_index, feature_types)
            loss_value = loss.value()
            total_loss += loss_value
            loss.backward()
            trainer.update()
            if i > 0:
                avg_loss = total_loss / float(i + e * train_len)
            else:
                avg_loss = total_loss

        if EARLY_STOPPING:
            # get train accuracy
            print 'evaluating on train...'
            train_predictions = predict_sequences(model, char_lookup, feat_lookup, R, bias, encoder_frnn,
                                                  encoder_rrnn, decoder_rnn, alphabet_index, inverse_alphabet_index,
                                                  train_lemmas[:sanity_set_size], train_feat_dicts[:sanity_set_size],
                                                  feat_index, feature_types)
            train_accuracy = evaluate_model(train_predictions, train_lemmas[:sanity_set_size],
                                            train_feat_dicts[:sanity_set_size], train_words[:sanity_set_size],
                                            feature_types, print_results=False)[1]

            if train_accuracy > best_train_accuracy:
                best_train_accuracy = train_accuracy

            dev_accuracy = 0
            avg_dev_loss = 0

            if len(dev_lemmas) > 0:
                # get dev accuracy
                dev_predictions = predict_sequences(model, char_lookup, feat_lookup, R, bias, encoder_frnn,
                                                    encoder_rrnn, decoder_rnn, alphabet_index,
                                                    inverse_alphabet_index, dev_lemmas, dev_feat_dicts,
                                                    feat_index, feature_types)
                print 'evaluating on dev...'
                dev_accuracy = evaluate_model(dev_predictions, dev_lemmas, dev_feat_dicts, dev_words,
                                              feature_types, print_results=True)[1]

                if dev_accuracy > best_dev_accuracy:
                    best_dev_accuracy = dev_accuracy
                    # save best model to disk
                    save_pycnn_model(model, results_file_path)
                    print 'saved new best model'
                    patience = 0
                else:
                    patience += 1

                # found "perfect" model
                if dev_accuracy == 1:
                    train_progress_bar.finish()
                    if plot:
                        plt.cla()
                    return model, e

                # get dev loss
                total_dev_loss = 0
                for i in xrange(len(dev_lemmas)):
                    total_dev_loss += one_word_loss(model, char_lookup, feat_lookup, R, bias, encoder_frnn,
                                                    encoder_rrnn, decoder_rnn, dev_lemmas[i], dev_feat_dicts[i],
                                                    dev_words[i], alphabet_index, dev_aligned_pairs[i], feat_index,
                                                    feature_types).value()

                avg_dev_loss = total_dev_loss / float(len(dev_lemmas))
                if avg_dev_loss < best_avg_dev_loss:
                    best_avg_dev_loss = avg_dev_loss

                print 'epoch: {0} train loss: {1:.4f} dev loss: {2:.4f} dev accuracy: {3:.4f} ' \
                      'train accuracy = {4:.4f} best dev accuracy {5:.4f} best train accuracy: {6:.4f} ' \
                      'patience = {7}'.format(e, avg_loss, avg_dev_loss, dev_accuracy, train_accuracy,
                                              best_dev_accuracy, best_train_accuracy, patience)

                log_to_file(results_file_path + '_log.txt', e, avg_loss, train_accuracy, dev_accuracy)

                if patience == MAX_PATIENCE:
                    print 'out of patience after {0} epochs'.format(str(e))
                    # TODO: would like to return best model but pycnn has a bug with save and load. Maybe copy via code?
                    # return best_model[0]
                    train_progress_bar.finish()
                    if plot:
                        plt.cla()
                    return model, e
            else:
                # if no dev set is present, optimize on train set
                print 'no dev set for early stopping, running all epochs until perfectly fitting or patience was ' \
                      'reached on the train set'

                if train_accuracy > best_train_accuracy:
                    best_train_accuracy = train_accuracy
                    # save best model to disk
                    save_pycnn_model(model, results_file_path)
                    print 'saved new best model'
                    patience = 0
                else:
                    patience += 1

                print 'epoch: {0} train loss: {1:.4f} train accuracy = {2:.4f} best train accuracy: {3:.4f} ' \
                      'patience = {4}'.format(e, avg_loss, train_accuracy, best_train_accuracy, patience)

                # found "perfect" model on train set or patience has been reached
                if train_accuracy == 1 or patience == MAX_PATIENCE:
                    train_progress_bar.finish()
                    if plot:
                        plt.cla()
                    return model, e

            # update lists for plotting
            train_accuracy_y.append(train_accuracy)
            epochs_x.append(e)
            train_loss_y.append(avg_loss)
            dev_loss_y.append(avg_dev_loss)
            dev_accuracy_y.append(dev_accuracy)

        # finished epoch
        train_progress_bar.update(e)
        if plot:
            with plt.style.context('fivethirtyeight'):
                p1, = plt.plot(epochs_x, dev_loss_y, label='dev loss')
                p2, = plt.plot(epochs_x, train_loss_y, label='train loss')
                p3, = plt.plot(epochs_x, dev_accuracy_y, label='dev acc.')
                p4, = plt.plot(epochs_x, train_accuracy_y, label='train acc.')
                plt.legend(loc='upper left', handles=[p1, p2, p3, p4])
                plt.savefig(results_file_path + '.png')

    train_progress_bar.finish()
    if plot:
        plt.cla()
    print 'finished training. average loss: ' + str(avg_loss)
    return model, e
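# Distilled sketch of the patience-based early stopping used in train_model above
# (not part of the original source). `run_epoch`, `evaluate_dev` and `save_model`
# are hypothetical stand-ins for the training pass, predict_sequences/evaluate_model
# and save_pycnn_model.
def train_with_patience(epochs, max_patience, run_epoch, evaluate_dev, save_model):
    best_dev_accuracy, patience = -1.0, 0
    for e in range(epochs):
        run_epoch()                       # one pass over the shuffled training set
        dev_accuracy = evaluate_dev()
        if dev_accuracy > best_dev_accuracy:
            best_dev_accuracy = dev_accuracy
            save_model()                  # keep only the best-scoring checkpoint
            patience = 0
        else:
            patience += 1                 # no improvement this epoch
        if dev_accuracy == 1 or patience == max_patience:
            break                         # perfect fit or out of patience
    return best_dev_accuracy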
def __init__(self, exp_global=Ref(Path("exp_global")), eps=1e-6, rho=0.95):
    self.optimizer = dy.AdadeltaTrainer(exp_global.dynet_param_collection.param_col, eps, rho)
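# Minimal, self-contained sketch of dy.AdadeltaTrainer (not part of the original
# source): Adadelta takes no learning rate, only the eps and rho hyperparameters
# passed positionally above. The tiny two-class classifier is purely illustrative.
import dynet as dy

pc = dy.ParameterCollection()
W = pc.add_parameters((2, 2))
b = pc.add_parameters((2,))
trainer = dy.AdadeltaTrainer(pc, eps=1e-6, rho=0.95)

data = [([1.0, 0.0], 0), ([0.0, 1.0], 1)] * 10
for x, y in data:
    dy.renew_cg()                                         # new graph per example
    scores = dy.parameter(W) * dy.inputVector(x) + dy.parameter(b)
    loss = dy.pickneglogsoftmax(scores, y)                # scalar cross-entropy loss
    loss.backward()
    trainer.update()                                      # one Adadelta step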
def train(self, fdir):
    trainer = dy.AdadeltaTrainer(self.model)

    best_acc, repeat = 0.0, 0
    batch = 40
    for epoch in range(60):
        dy.renew_cg()
        losses = []
        closs = 0.0
        for i, traininst in enumerate(self.trainset['refex']):
            pre_context = self.trainset['pre_context'][i]
            pos_context = self.trainset['pos_context'][i]
            refex = self.trainset['refex'][i]
            entity = self.trainset['entity'][i]

            loss = self.get_loss(pre_context, pos_context, refex, entity)
            losses.append(loss)

            if len(losses) == batch:
                loss = dy.esum(losses)
                closs += loss.value()
                loss.backward()
                trainer.update()
                dy.renew_cg()

                print("Epoch: {0} \t Loss: {1}".format(epoch, (closs / batch)), end=' \r')
                losses = []
                closs = 0.0

        outputs, num, dem = self.validate()
        acc = round(float(num) / dem, 2)

        print("Dev acc: {0} \t Best acc: {1}".format(str(num / dem), best_acc))

        # Saving the model with best accuracy
        if best_acc == 0.0 or acc > best_acc:
            best_acc = acc

            fresults = os.path.join(fdir, 'results')
            if not os.path.exists(fresults):
                os.mkdir(fresults)
            fname = 'dev_best_' + \
                    str(self.LSTM_NUM_OF_LAYERS) + '_' + \
                    str(self.EMBEDDINGS_SIZE) + '_' + \
                    str(self.STATE_SIZE) + '_' + \
                    str(self.ATTENTION_SIZE) + '_' + \
                    str(self.DROPOUT).split('.')[1] + '_' + \
                    str(self.character) + '_' + \
                    str(self.BEAM)
            self.write(os.path.join(fresults, fname), outputs)

            fmodels = os.path.join(fdir, 'models')
            if not os.path.exists(fmodels):
                os.mkdir(fmodels)
            fname = 'best_' + \
                    str(self.LSTM_NUM_OF_LAYERS) + '_' + \
                    str(self.EMBEDDINGS_SIZE) + '_' + \
                    str(self.STATE_SIZE) + '_' + \
                    str(self.ATTENTION_SIZE) + '_' + \
                    str(self.DROPOUT).split('.')[1] + '_' + \
                    str(self.character) + '_' + \
                    str(self.BEAM)
            self.model.save(os.path.join(fmodels, fname))

            repeat = 0
        else:
            repeat += 1

        # In case the accuracy does not increase in 20 epochs, break the process
        if repeat == 20:
            break

    fmodels = os.path.join(fdir, 'models')
    fname = str(self.LSTM_NUM_OF_LAYERS) + '_' + \
            str(self.EMBEDDINGS_SIZE) + '_' + \
            str(self.STATE_SIZE) + '_' + \
            str(self.ATTENTION_SIZE) + '_' + \
            str(self.DROPOUT).split('.')[1] + '_' + \
            str(self.character) + '_' + \
            str(self.BEAM)
    self.model.save(os.path.join(fmodels, fname))
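# Sketch of reloading a checkpoint written by self.model.save(...) above (not part
# of the original source). DyNet's save() stores parameter values only, so the same
# parameters must be re-created, in the same order, before populate() can restore
# them; `rebuild_model` and `saved_path` are hypothetical placeholders.
fresh = rebuild_model()            # constructs the same dy.ParameterCollection layout
fresh.model.populate(saved_path)   # loads the weights written by self.model.save(saved_path)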
def train(self):
    trainer = dy.AdadeltaTrainer(self.model)

    epoch_timing = []
    early = 0.0
    best_acc = 0.0
    f = open('logging.txt', 'w')
    for epoch in range(self.EPOCH):
        print('\n')
        dy.renew_cg()
        losses = []
        closs = 0
        batch_timing = []
        for i, trainrow in enumerate(self.trainset):
            start = time.time()
            question = trainrow['question']
            answer = trainrow['answer']
            image = self.id2img[trainrow['face_id']]

            loss = self.get_loss(image, question, answer)
            losses.append(loss)

            end = time.time()
            t = (end - start)
            batch_timing.append(t)
            epoch_timing.append(t)

            if len(losses) == self.BATCH:
                loss = dy.esum(losses)
                _loss = loss.value()
                closs += _loss
                loss.backward()
                trainer.update()
                dy.renew_cg()

                # percentage of trainset processed
                percentage = str(round((float(i + 1) / len(self.trainset)) * 100, 2)) + '%'

                # time of epoch processing
                time_epoch = sum(epoch_timing)
                if time_epoch > 3600:
                    time_epoch = str(round(time_epoch / 3600, 2)) + ' h'
                elif time_epoch > 60:
                    time_epoch = str(round(time_epoch / 60, 2)) + ' min'
                else:
                    time_epoch = str(round(time_epoch, 2)) + ' sec'

                print("Epoch: {0} \t\t Loss: {1} \t\t Epoch time: {2} \t\t Trainset: {3}".format(
                    epoch + 1, round(_loss, 2), time_epoch, percentage), end=' \r')

                losses = []
                batch_timing = []

        print("\nEpoch: {0} \t\t Total Loss / Batch: {1}".format(epoch + 1, round(closs / self.BATCH, 2)))

        acc = self.validate()
        print("\nEpoch: {0} \t\t Dev acc: {1} \t\t Best acc: {2}".format(epoch + 1, round(acc, 2), round(best_acc, 2)))
        f.write("Epoch: {0} \t\t Dev acc: {1} \t\t Best acc: {2}\n".format(epoch + 1, round(acc, 2), round(best_acc, 2)))

        if acc > best_acc:
            best_acc = acc
            early = 0
        else:
            early += 1

        if early == 50:
            break

        epoch_timing = []
    f.close()
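# Distilled sketch of the mini-batching pattern shared by the train() methods above
# (not part of the original source): losses are collected as graph expressions,
# summed with dy.esum, and a single backward/update is done per batch before the
# computation graph is renewed. `compute_loss(inst)` is a hypothetical stand-in for
# self.get_loss(...); as in the originals, a trailing partial batch is dropped.
def run_epoch(instances, trainer, compute_loss, batch_size):
    dy.renew_cg()
    losses, epoch_loss = [], 0.0
    for inst in instances:
        losses.append(compute_loss(inst))
        if len(losses) == batch_size:
            batch_loss = dy.esum(losses)      # sum the individual loss expressions
            epoch_loss += batch_loss.value()  # forward pass happens here
            batch_loss.backward()             # one backward pass for the whole batch
            trainer.update()                  # single Adadelta update per batch
            dy.renew_cg()                     # fresh graph for the next batch
            losses = []
    return epoch_loss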