def score(source_data: str,
          target_data: str,
          load_from: str,
          corpus_average: bool,
          normalize: float = 0.6,
          **kwargs):
    """Scores a text using a trained translation model. See argument description in `bin/daikon`."""

    # fix batch size at 1 to get individual scores for sentences
    batch_size = 1

    source_vocab = Vocabulary()
    target_vocab = Vocabulary()
    source_vocab.load(os.path.join(load_from, C.SOURCE_VOCAB_FILENAME))
    target_vocab.load(os.path.join(load_from, C.TARGET_VOCAB_FILENAME))

    reader_ids = list(reader.read_parallel(source_data, target_data, source_vocab, target_vocab, C.SCORE_MAX_LEN))

    encoder_inputs, decoder_targets, decoder_inputs, loss, _, _, _ = define_computation_graph(source_vocab.size, target_vocab.size, batch_size)

    saver = tf.train.Saver()

    with tf.Session() as session:
        # load model
        saver.restore(session, os.path.join(load_from, C.MODEL_FILENAME))

        losses = []
        total_iter = 0
        for x, y, z in reader.iterate(reader_ids, batch_size, shuffle=False):
            feed_dict = {encoder_inputs: x,
                         decoder_inputs: y,
                         decoder_targets: z}
            l = session.run([loss], feed_dict=feed_dict)
            # first session variable
            l = l[0]
            if normalize:
                # normalize by length of target sequence (including EOS token)
                l /= np.power(y.shape[1], normalize)
            losses.append(l)
            total_iter += 1

        if corpus_average:
            total_loss = np.sum(losses)
            perplexity = np.exp(total_loss / total_iter)
            return perplexity
        else:
            return np.exp(losses)
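# Minimal sketch (not part of the original module) illustrating the exponent-based length
# normalization used in score() above: with an exponent below 1, dividing by
# target_len ** alpha only partially compensates for sequence length. The helper name
# `length_penalized` and all numbers below are purely illustrative.
import numpy as np  # mirrors the module-level numpy import assumed by score()


def length_penalized(sentence_loss: float, target_len: int, alpha: float = 0.6) -> float:
    """Divide a summed sentence loss by target_len ** alpha, as score() does per sentence."""
    return sentence_loss / np.power(target_len, alpha)


# two hypothetical sentences with the same per-token loss (2.0) but different lengths
print(length_penalized(6.0, 3))    # 6.0 / 3**0.6  ~= 3.10
print(length_penalized(20.0, 10))  # 20.0 / 10**0.6 ~= 5.02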
def _early_stopping(val_reader_ids, source_vocab, target_vocab, batch_size):
    val_graph_components = define_computation_graph(source_vocab.size, target_vocab.size, batch_size)
    val_encoder_inputs, val_decoder_targets, val_decoder_inputs, val_loss, val_train_step, val_decoder_logits, val_summary = val_graph_components

    patience = 3
    val_total_loss = 0
    val_epoch = 0
    best_val_perplexity = float("inf")
    no_imp_count = 0

    with tf.Session() as session:
        for x, y, z in reader.iterate(val_reader_ids, batch_size, shuffle=True):
            val_feed_dict = {val_encoder_inputs: x,
                             val_decoder_inputs: y,
                             val_decoder_targets: z}
            # only evaluate the loss; no parameter updates on validation data
            l, s = session.run([val_loss, val_summary], feed_dict=val_feed_dict)
            val_total_loss += l
            val_epoch += 1

            current_val_perplexity = np.exp(val_total_loss / val_epoch)
            logger.info("Perplexity on validation data: %.2f", current_val_perplexity)

            if current_val_perplexity < best_val_perplexity:
                logger.info("Lowest perplexity on validation data achieved")
                best_val_perplexity = current_val_perplexity
                no_imp_count = 0
            else:
                global save_model
                save_model = False
                no_imp_count += 1
                if no_imp_count >= patience:
                    logger.info("Stopped improving on validation data for %d epochs: terminating training", no_imp_count)
def score(source_data: str,
          target_data: str,
          load_from: str,
          corpus_average: bool,
          normalize: bool,
          **kwargs):
    """Scores a text using a trained translation model. See argument description in `bin/daikon`."""

    # fix batch size at 1 to get individual scores for sentences
    batch_size = 1

    source_vocab = Vocabulary()
    target_vocab = Vocabulary()
    source_vocab.load(os.path.join(load_from, C.SOURCE_VOCAB_FILENAME))
    target_vocab.load(os.path.join(load_from, C.TARGET_VOCAB_FILENAME))

    reader_ids = list(reader.read_parallel(source_data, target_data, source_vocab, target_vocab, C.SCORE_MAX_LEN))

    encoder_inputs, decoder_targets, decoder_inputs, loss, _, _, _ = define_computation_graph(source_vocab.size, target_vocab.size, batch_size)

    saver = tf.train.Saver()

    with tf.Session() as session:
        # load model
        saver.restore(session, os.path.join(load_from, C.MODEL_FILENAME))

        losses = []
        total_iter = 0
        for x, y, z in reader.iterate(reader_ids, batch_size, shuffle=False):
            feed_dict = {encoder_inputs: x,
                         decoder_inputs: y,
                         decoder_targets: z}
            l = session.run([loss], feed_dict=feed_dict)
            # first session variable
            l = l[0]
            if normalize:
                # normalize by length of target sequence (including EOS token)
                l /= y.shape[1]
            losses.append(l)
            total_iter += 1

        if corpus_average:
            total_loss = np.sum(losses)
            perplexity = np.exp(total_loss / total_iter)
            return perplexity
        else:
            return np.exp(losses)
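# Hedged usage sketch, not in the original source: roughly how the score() variant above
# might be called once a model has been trained. The file paths ("dev.de", "dev.en",
# "model") are hypothetical placeholders; the call assumes the vocabulary files and
# checkpoint written during training exist under `load_from`.
if __name__ == "__main__":
    corpus_ppl = score(source_data="dev.de",
                       target_data="dev.en",
                       load_from="model",
                       corpus_average=True,
                       normalize=False)
    print("corpus perplexity: %.2f" % corpus_ppl)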
def train(source_data: str, target_data: str, epochs: int, batch_size: int,
          source_vocab_max_size: int, target_vocab_max_size: int, save_to: str,
          log_to: str, sample_after_epoch: bool, **kwargs) -> None:
    """Trains a translation model. See argument description in `bin/daikon`."""

    # create folders for model and logs if they don't exist yet
    for folder in [save_to, log_to]:
        if not os.path.exists(folder):
            os.makedirs(folder)

    logger.info("Creating vocabularies.")
    # create vocabulary to map words to ids, for source and target
    source_vocab = create_vocab(source_data, source_vocab_max_size, save_to, C.SOURCE_VOCAB_FILENAME)
    target_vocab = create_vocab(target_data, target_vocab_max_size, save_to, C.TARGET_VOCAB_FILENAME)

    logger.info("Source vocabulary: %s", source_vocab)
    logger.info("Target vocabulary: %s", target_vocab)

    # convert training data to list of word ids
    logger.info("Reading training data.")
    reader_ids = list(reader.read_parallel(source_data, target_data, source_vocab, target_vocab, C.MAX_LEN))

    # define computation graph
    logger.info("Building computation graph.")
    graph_components = define_computation_graph(source_vocab.size, target_vocab.size, batch_size)
    encoder_inputs, decoder_targets, decoder_inputs, loss, train_step, decoder_logits, summary = graph_components

    saver = tf.train.Saver()

    with tf.Session() as session:
        # init
        session.run(tf.global_variables_initializer())

        # write logs (@tensorboard)
        summary_writer = tf.summary.FileWriter(log_to, graph=tf.get_default_graph())

        logger.info("Starting training.")
        tic = time.time()
        num_batches = math.floor(len(reader_ids) / batch_size)

        prev_perplexity = 10000.0
        counter = 0

        # iterate over training data `epochs` times
        for epoch in range(1, epochs + 1):
            total_loss = 0.0
            total_iter = 0
            iter_tic = time.time()

            for x, y, z in reader.iterate(reader_ids, batch_size, shuffle=True):
                feed_dict = {encoder_inputs: x,
                             decoder_inputs: y,
                             decoder_targets: z}
                l, _, s = session.run([loss, train_step, summary], feed_dict=feed_dict)
                summary_writer.add_summary(s, total_iter)
                total_loss += l
                total_iter += 1
                if total_iter % C.LOGGING_INTERVAL == 0 or total_iter == num_batches:
                    iter_taken = time.time() - iter_tic
                    logger.debug("Epoch=%s, iteration=%s/%s, samples/second=%.2f",
                                 epoch, total_iter, num_batches,
                                 batch_size * C.LOGGING_INTERVAL / float(iter_taken))
                    iter_tic = time.time()

            perplexity = np.exp(total_loss / total_iter)
            logger.info("Perplexity on training data after epoch %s: %.2f", epoch, perplexity)

            saver.save(session, os.path.join(save_to, C.MODEL_FILENAME))

            if sample_after_epoch:
                # sample from model after epoch
                thread = threading.Thread(target=_sample_after_epoch,
                                          args=[reader_ids, source_vocab, target_vocab, save_to, epoch])
                thread.start()

            # early stopping: count epochs in which training perplexity got worse
            if perplexity > prev_perplexity:
                counter += 1
            else:
                if counter != 0:
                    counter -= 1
            # remember this epoch's perplexity for the next comparison
            prev_perplexity = perplexity

            if counter == 2:
                taken = time.time() - tic
                m, s = divmod(taken, 60)
                h, m = divmod(m, 60)
                logger.info("Training finished early. Overall time taken to train: %d:%02d:%02d" % (h, m, s))
                return

        taken = time.time() - tic
        m, s = divmod(taken, 60)
        h, m = divmod(m, 60)
        logger.info("Training finished. Overall time taken to train: %d:%02d:%02d" % (h, m, s))
def train(source_data: str,
          target_data: str,
          epochs: int,
          batch_size: int,
          source_vocab_max_size: int,
          target_vocab_max_size: int,
          save_to: str,
          log_to: str,
          sample_after_epoch: bool,
          source_val_data: str = None,
          target_val_data: str = None,
          val_epochs: int = C.VAL_EPOCHS,
          patience: int = C.PATIENCE,
          overwrite: bool = False,
          **kwargs) -> None:
    """Trains a translation model. See argument description in `bin/daikon`."""

    # enable early stopping if validation data files were specified
    early_stopping = source_val_data is not None and target_val_data is not None

    # create folders for model and logs if they don't exist yet
    for folder in [save_to, log_to]:
        if not os.path.exists(folder):
            os.makedirs(folder)

    if early_stopping:
        val_model_dir = os.path.join(save_to, C.VALIDATION_MODEL_DIR)
        if not os.path.exists(val_model_dir):
            os.makedirs(val_model_dir)

    # create a new graph if the overwrite option is enabled or there is no
    # existing model in the save_to directory
    checkpoint_file = os.path.join(save_to, C.MODEL_CHECKPOINT)
    initialize_graph = overwrite or not os.path.exists(checkpoint_file)

    source_vocab = Vocabulary()
    target_vocab = Vocabulary()

    if not initialize_graph:
        # load existing vocabulary that maps words to ids, for source and target
        logger.info("Loading vocabularies.")
        source_vocab.load(os.path.join(save_to, C.SOURCE_VOCAB_FILENAME))
        target_vocab.load(os.path.join(save_to, C.TARGET_VOCAB_FILENAME))
    else:
        # create vocabulary to map words to ids, for source and target
        logger.info("Creating vocabularies.")
        source_vocab = create_vocab(source_data, source_vocab_max_size, save_to, C.SOURCE_VOCAB_FILENAME)
        target_vocab = create_vocab(target_data, target_vocab_max_size, save_to, C.TARGET_VOCAB_FILENAME)

    logger.info("Source vocabulary: %s", source_vocab)
    logger.info("Target vocabulary: %s", target_vocab)

    if early_stopping:
        # create copies of vocabulary files used for checking validation
        # data performance
        source_vocab.save(os.path.join(val_model_dir, C.SOURCE_VOCAB_FILENAME))
        target_vocab.save(os.path.join(val_model_dir, C.TARGET_VOCAB_FILENAME))

    # convert training data to list of word ids
    logger.info("Reading training data.")
    reader_ids = list(reader.read_parallel(source_data, target_data, source_vocab, target_vocab, C.MAX_LEN))

    # define computation graph
    logger.info("Building computation graph.")
    graph_components = define_computation_graph(source_vocab.size, target_vocab.size, batch_size)
    encoder_inputs, decoder_targets, decoder_inputs, loss, train_step, decoder_logits, summary = graph_components

    saver = tf.train.Saver()

    with tf.Session() as session:
        if initialize_graph:
            # init
            session.run(tf.global_variables_initializer())
        else:
            # load/restore model for further training
            saver.restore(session, os.path.join(save_to, C.MODEL_FILENAME))

        # write logs (@tensorboard)
        summary_writer = tf.summary.FileWriter(log_to, graph=tf.get_default_graph())

        logger.info("Starting training.")
        tic = time.time()
        num_batches = math.floor(len(reader_ids) / batch_size)

        if early_stopping:
            # initialize metrics for checking validation data performance
            best_val_loss = float("inf")
            epochs_without_improvement = 0

        # iterate over training data `epochs` times
        for epoch in range(1, epochs + 1):
            total_loss = 0.0
            total_iter = 0
            iter_tic = time.time()

            for x, y, z in reader.iterate(reader_ids, batch_size, shuffle=True):
                feed_dict = {encoder_inputs: x,
                             decoder_inputs: y,
                             decoder_targets: z}
                l, _, s = session.run([loss, train_step, summary], feed_dict=feed_dict)
                summary_writer.add_summary(s, total_iter)
                total_loss += l
                total_iter += 1
                if total_iter % C.LOGGING_INTERVAL == 0 or total_iter == num_batches:
                    iter_taken = time.time() - iter_tic
                    logger.debug("Epoch=%s, iteration=%s/%s, samples/second=%.2f",
                                 epoch, total_iter, num_batches,
                                 batch_size * C.LOGGING_INTERVAL / float(iter_taken))
                    iter_tic = time.time()

            perplexity = np.exp(total_loss / total_iter)
            logger.info("Perplexity on training data after epoch %s: %.2f", epoch, perplexity)

            save_model = True

            if early_stopping and epoch % val_epochs == 0:
                # save a copy of the current model that can be used to check
                # its performance for the validation data
                saver.save(session, os.path.join(val_model_dir, C.MODEL_FILENAME))

                # spin off a thread to call score() for the validation data
                threadPool = ThreadPool(processes=1)
                scoreRes = threadPool.apply_async(score, (source_val_data, target_val_data, val_model_dir, True, False))
                latest_val_loss = scoreRes.get()
                logger.info("Current model perplexity on validation data: %.2f", latest_val_loss)

                if latest_val_loss < best_val_loss:
                    logger.info("Lowest perplexity on validation data achieved")
                    best_val_loss = latest_val_loss
                    epochs_without_improvement = 0
                else:
                    save_model = False
                    epochs_without_improvement += 1
                    if epochs_without_improvement >= patience:
                        logger.info("No improvement in validation data perplexity for %d epochs: terminating training",
                                    epochs_without_improvement)
                        return

            if save_model:
                saver.save(session, os.path.join(save_to, C.MODEL_FILENAME))

            if sample_after_epoch:
                # sample from model after epoch
                thread = threading.Thread(target=_sample_after_epoch,
                                          args=[reader_ids, source_vocab, target_vocab, save_to, epoch])
                thread.start()

        taken = time.time() - tic
        m, s = divmod(taken, 60)
        h, m = divmod(m, 60)
        logger.info("Training finished. Overall time taken to train: %d:%02d:%02d" % (h, m, s))
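# Hedged usage sketch, not in the original source: how the train() variant above might be
# invoked with validation files so that early stopping is enabled. All paths are
# hypothetical placeholders; val_epochs and patience fall back to the defaults assumed
# in C.VAL_EPOCHS and C.PATIENCE. This assumes the early-stopping train() above is the
# definition in scope.
if __name__ == "__main__":
    train(source_data="train.de",
          target_data="train.en",
          epochs=20,
          batch_size=64,
          source_vocab_max_size=50000,
          target_vocab_max_size=50000,
          save_to="model",
          log_to="logs",
          sample_after_epoch=False,
          source_val_data="dev.de",  # supplying both validation files switches early stopping on
          target_val_data="dev.en")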
def train(source_data: str, target_data: str, epochs: int, batch_size: int,
          vocab_max_size: int, save_to: str, log_to: str,
          sample_after_epoch: bool, **kwargs) -> None:
    """Trains a translation model. See argument description in `bin/daikon`."""

    # create folders for model and logs if they don't exist yet
    for folder in [save_to, log_to]:
        if not os.path.exists(folder):
            os.makedirs(folder)

    # create vocabulary to map words to ids, for source and target
    source_vocab = create_vocab(source_data, vocab_max_size, save_to, C.SOURCE_VOCAB_FILENAME)
    target_vocab = create_vocab(target_data, vocab_max_size, save_to, C.TARGET_VOCAB_FILENAME)

    # convert training data to list of word ids
    reader_ids = list(reader.read_parallel(source_data, target_data, source_vocab, target_vocab, C.MAX_LEN))

    # define computation graph
    logging.info("Building computation graph.")
    graph_components = define_computation_graph(source_vocab.size, target_vocab.size, batch_size)
    encoder_inputs, decoder_targets, decoder_inputs, loss, train_step, decoder_logits, summary = graph_components

    saver = tf.train.Saver()

    with tf.Session() as session:
        # init
        session.run(tf.global_variables_initializer())

        # write logs (@tensorboard)
        summary_writer = tf.summary.FileWriter(log_to, graph=tf.get_default_graph())

        logging.info("Starting training.")

        # iterate over training data `epochs` times
        for epoch in range(1, epochs + 1):
            total_loss = 0.0
            total_iter = 0
            for x, y, z in reader.iterate(reader_ids, batch_size, shuffle=True):
                feed_dict = {encoder_inputs: x,
                             decoder_inputs: y,
                             decoder_targets: z}
                l, _, s = session.run([loss, train_step, summary], feed_dict=feed_dict)
                summary_writer.add_summary(s, total_iter)
                total_loss += l
                total_iter += 1
                if total_iter % 100 == 0:
                    logging.debug("Epoch=%s, iteration=%s", epoch, total_iter)

            perplexity = np.exp(total_loss / total_iter)
            logging.info("Perplexity on training data after epoch %s: %.2f", epoch, perplexity)

            saver.save(session, os.path.join(save_to, C.MODEL_FILENAME))

            if sample_after_epoch:
                # sample from model after epoch
                thread = threading.Thread(target=_sample_after_epoch,
                                          args=[reader_ids, source_vocab, target_vocab, save_to, epoch])
                thread.start()

    logging.info("Training finished.")
def train(source_data: str, target_data: str, epochs: int, batch_size: int,
          source_vocab_max_size: int, target_vocab_max_size: int, save_to: str,
          log_to: str, sample_after_epoch: bool, **kwargs) -> None:
    """Trains a translation model. See argument description in `bin/daikon`."""

    # create folders for model and logs if they don't exist yet
    for folder in [save_to, log_to]:
        if not os.path.exists(folder):
            os.makedirs(folder)

    logger.info("Creating vocabularies.")
    # create vocabulary to map words to ids, for source and target
    source_vocab = create_vocab(source_data, source_vocab_max_size, save_to, C.SOURCE_VOCAB_FILENAME)
    target_vocab = create_vocab(target_data, target_vocab_max_size, save_to, C.TARGET_VOCAB_FILENAME)

    logger.info("Source vocabulary: %s", source_vocab)
    logger.info("Target vocabulary: %s", target_vocab)

    # convert training data to list of word ids
    logger.info("Reading training data.")
    reader_ids = list(reader.read_parallel(source_data, target_data, source_vocab, target_vocab, C.MAX_LEN))

    # define computation graph
    logger.info("Building computation graph.")
    graph_components = define_computation_graph(source_vocab.size, target_vocab.size, batch_size)
    encoder_inputs, decoder_targets, decoder_inputs, loss, train_step, decoder_logits, summary = graph_components

    saver = tf.train.Saver()

    with tf.Session() as session:
        # init
        session.run(tf.global_variables_initializer())

        # write logs (@tensorboard)
        summary_writer = tf.summary.FileWriter(log_to, graph=tf.get_default_graph())

        logger.info("Starting training.")
        tic = time.time()
        num_batches = math.floor(len(reader_ids) / batch_size)

        # iterate over training data `epochs` times
        for epoch in range(1, epochs + 1):
            total_loss = 0.0
            total_iter = 0
            iter_tic = time.time()

            for x, y, z in reader.iterate(reader_ids, batch_size, shuffle=True):
                feed_dict = {encoder_inputs: x,
                             decoder_inputs: y,
                             decoder_targets: z}
                l, _, s = session.run([loss, train_step, summary], feed_dict=feed_dict)
                summary_writer.add_summary(s, total_iter)
                total_loss += l
                total_iter += 1
                if total_iter % C.LOGGING_INTERVAL == 0 or total_iter == num_batches:
                    iter_taken = time.time() - iter_tic
                    logger.debug("Epoch=%s, iteration=%s/%s, samples/second=%.2f",
                                 epoch, total_iter, num_batches,
                                 batch_size * C.LOGGING_INTERVAL / float(iter_taken))
                    iter_tic = time.time()

            perplexity = np.exp(total_loss / total_iter)
            logger.info("Perplexity on training data after epoch %s: %.2f", epoch, perplexity)

            saver.save(session, os.path.join(save_to, C.MODEL_FILENAME))

            if sample_after_epoch:
                # sample from model after epoch
                thread = threading.Thread(target=_sample_after_epoch,
                                          args=[reader_ids, source_vocab, target_vocab, save_to, epoch])
                thread.start()

        taken = time.time() - tic
        m, s = divmod(taken, 60)
        h, m = divmod(m, 60)
        logger.info("Training finished. Overall time taken to train: %d:%02d:%02d" % (h, m, s))