def _sample_after_epoch(reader_ids: List[reader.ReaderTuple],
                        source_vocab: Vocabulary,
                        target_vocab: Vocabulary,
                        load_from: str,
                        epoch: int) -> None:
    """
    Samples translations during training. Three sentences are picked at random,
    translated with the current model and logged.
    """
    input_lines, output_lines = zip(*random.sample(reader_ids, 3))

    input_lines = [" ".join(source_vocab.get_words(input_line)) for input_line in input_lines]
    output_lines = [" ".join(target_vocab.get_words(output_line)) for output_line in output_lines]

    translations = translate_lines(load_from=load_from, input_lines=input_lines, train_mode=True)

    logger.debug("Sampled translations after epoch %s.", epoch)
    for input_line, output_line, translation in zip(input_lines, output_lines, translations):
        logger.debug("-" * 30)
        logger.debug("Input:\t\t%s", input_line)
        logger.debug("Predicted output:\t%s", translation)
        logger.debug("Actual output:\t%s", output_line)
    logger.debug("-" * 30)
def translate_line(session: tf.Session,
                   line: str,
                   source_vocab: vocab.Vocabulary,
                   target_vocab: vocab.Vocabulary,
                   encoder_inputs: tf.Tensor,
                   decoder_inputs: tf.Tensor,
                   decoder_targets: tf.Tensor,
                   decoder_logits: tf.Tensor) -> str:
    """Translates one single input string."""
    source_ids = np.array(source_vocab.get_ids(line.split())).reshape(1, -1)

    translated_ids = []  # type: List[int]

    for _ in range(C.TRANSLATION_MAX_LEN):
        # target ids will serve as decoder inputs and decoder targets,
        # but decoder targets will not be used to compute logits
        target_ids = np.array([C.BOS_ID] + translated_ids).reshape(1, -1)

        feed_dict = {encoder_inputs: source_ids,
                     decoder_inputs: target_ids,
                     decoder_targets: target_ids}

        logits_result = session.run([decoder_logits], feed_dict=feed_dict)

        # first session result, first item in batch, target symbol at last position
        next_symbol_logits = logits_result[0][0][-1]
        next_id = np.argmax(next_symbol_logits)

        # # get the id with the highest logit while suppressing the <unk> token:
        # # take the ids of the two highest-scoring items ...
        # ind_candidates = np.argpartition(next_symbol_logits, -2)[-2:]
        # # ... sort the candidates by logit ...
        # ind_candidates = ind_candidates[np.argsort(next_symbol_logits[ind_candidates])]
        # # ... and fall back to the runner-up if the best id is <unk>
        # if ind_candidates[-1] != C.UNK_ID:
        #     next_id = ind_candidates[-1]
        # else:
        #     next_id = ind_candidates[-2]

        if next_id in [C.EOS_ID, C.PAD_ID]:
            break

        translated_ids.append(next_id)

    words = target_vocab.get_words(translated_ids)
    return ' '.join(words)
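# A minimal usage sketch for translate_line above (not the module's actual
# translate_lines function): it restores a trained model and greedily decodes
# a list of input lines, reusing the vocabulary loading and graph construction
# shown in score() and train() below. The helper name translate_lines_sketch
# and the fixed batch size of 1 are assumptions for illustration only.
def translate_lines_sketch(load_from: str, input_lines: List[str]) -> List[str]:
    source_vocab = vocab.Vocabulary()
    target_vocab = vocab.Vocabulary()
    source_vocab.load(os.path.join(load_from, C.SOURCE_VOCAB_FILENAME))
    target_vocab.load(os.path.join(load_from, C.TARGET_VOCAB_FILENAME))

    # decoding works on one sentence at a time, hence batch size 1
    components = define_computation_graph(source_vocab.size, target_vocab.size, 1)
    encoder_inputs, decoder_targets, decoder_inputs, _, _, decoder_logits, _ = components

    saver = tf.train.Saver()
    with tf.Session() as session:
        saver.restore(session, os.path.join(load_from, C.MODEL_FILENAME))
        return [translate_line(session, line, source_vocab, target_vocab,
                               encoder_inputs, decoder_inputs, decoder_targets, decoder_logits)
                for line in input_lines]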
def translate_line(session: tf.Session,
                   line: str,
                   source_vocab: vocab.Vocabulary,
                   target_vocab: vocab.Vocabulary,
                   encoder_inputs: tf.Tensor,
                   decoder_inputs: tf.Tensor,
                   decoder_targets: tf.Tensor,
                   decoder_logits: tf.Tensor) -> str:
    """Translates one single input string."""
    source_ids = np.array(source_vocab.get_ids(line.split())).reshape(1, -1)

    translated_ids = []  # type: List[int]

    for _ in range(C.TRANSLATION_MAX_LEN):
        # target ids will serve as decoder inputs and decoder targets,
        # but decoder targets will not be used to compute logits
        target_ids = np.array([C.BOS_ID] + translated_ids).reshape(1, -1)

        feed_dict = {encoder_inputs: source_ids,
                     decoder_inputs: target_ids,
                     decoder_targets: target_ids}

        logits_result = session.run([decoder_logits], feed_dict=feed_dict)

        # first session result, first item in batch, target symbol at last position
        next_symbol_logits = logits_result[0][0][-1]
        next_id = np.argmax(next_symbol_logits)

        if next_id in [C.EOS_ID, C.PAD_ID]:
            break

        if next_id == C.UNK_ID:
            # <unk> scored highest: fall back to the best-scoring non-<unk> id
            best_id, best_logit = None, float("-inf")
            for candidate_id, logit in enumerate(next_symbol_logits):
                if candidate_id != C.UNK_ID and logit > best_logit:
                    best_id, best_logit = candidate_id, logit
            next_id = best_id

        translated_ids.append(next_id)

    words = target_vocab.get_words(translated_ids)
    return ' '.join(words)
def translate_line(session: tf.Session,
                   line: str,
                   source_vocab: vocab.Vocabulary,
                   target_vocab: vocab.Vocabulary,
                   encoder_inputs: tf.Tensor,
                   decoder_inputs: tf.Tensor,
                   decoder_targets: tf.Tensor,
                   decoder_logits: tf.Tensor) -> str:
    """Translates one single input string."""
    source_ids = np.array(source_vocab.get_ids(line.split())).reshape(1, -1)

    translated_ids = []  # type: List[int]

    for _ in range(C.TRANSLATION_MAX_LEN):
        # target ids will serve as decoder inputs and decoder targets,
        # but decoder targets will not be used to compute logits
        target_ids = np.array([C.BOS_ID] + translated_ids).reshape(1, -1)

        feed_dict = {encoder_inputs: source_ids,
                     decoder_inputs: target_ids,
                     decoder_targets: target_ids}

        logits_result = session.run([decoder_logits], feed_dict=feed_dict)

        # first session result, first item in batch, target symbol at last position
        next_symbol_logits = logits_result[0][0][-1]

        # original token selection with argmax
        next_id = np.argmax(next_symbol_logits)

        # if the argmax is the unknown-word id, take the second-highest id instead
        if next_id == C.UNK_ID:
            next_id = np.argsort(next_symbol_logits)[-2]

        if next_id in [C.EOS_ID, C.PAD_ID]:
            break

        translated_ids.append(next_id)

    words = target_vocab.get_words(translated_ids)
    return ' '.join(words)
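# The translate_line variants above share the same greedy loop and differ only
# in how they keep <unk> out of the output. A self-contained numpy sketch of
# the common idea: take the highest-scoring id, and fall back to the runner-up
# whenever the winner is the unknown-word id. The toy logits and UNK id below
# are made up for illustration.
import numpy as np

def pick_next_id(next_symbol_logits: np.ndarray, unk_id: int) -> int:
    # ids of the two highest-scoring symbols, best last
    top2 = np.argpartition(next_symbol_logits, -2)[-2:]
    top2 = top2[np.argsort(next_symbol_logits[top2])]
    return int(top2[-2]) if top2[-1] == unk_id else int(top2[-1])

toy_logits = np.array([2.5, 0.1, 1.9, 0.7])  # index 0 plays the role of <unk>
assert pick_next_id(toy_logits, unk_id=0) == 2  # best non-<unk> id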
def score(source_data: str,
          target_data: str,
          load_from: str,
          corpus_average: bool,
          normalize: float = 0.6,
          **kwargs):
    """Scores a text using a trained translation model. See argument description in `bin/daikon`."""

    # fix batch size at 1 to get individual scores for sentences
    batch_size = 1

    source_vocab = Vocabulary()
    target_vocab = Vocabulary()
    source_vocab.load(os.path.join(load_from, C.SOURCE_VOCAB_FILENAME))
    target_vocab.load(os.path.join(load_from, C.TARGET_VOCAB_FILENAME))

    reader_ids = list(reader.read_parallel(source_data, target_data, source_vocab, target_vocab, C.SCORE_MAX_LEN))

    encoder_inputs, decoder_targets, decoder_inputs, loss, _, _, _ = define_computation_graph(source_vocab.size, target_vocab.size, batch_size)

    saver = tf.train.Saver()

    with tf.Session() as session:
        # load model
        saver.restore(session, os.path.join(load_from, C.MODEL_FILENAME))

        losses = []
        total_iter = 0
        for x, y, z in reader.iterate(reader_ids, batch_size, shuffle=False):
            feed_dict = {encoder_inputs: x,
                         decoder_inputs: y,
                         decoder_targets: z}
            l = session.run([loss], feed_dict=feed_dict)
            # first session variable
            l = l[0]
            if normalize:
                # normalize by length of target sequence (including EOS token),
                # raised to the power `normalize`
                l /= np.power(y.shape[1], normalize)
            losses.append(l)
            total_iter += 1

        if corpus_average:
            total_loss = np.sum(losses)
            perplexity = np.exp(total_loss / total_iter)
            return perplexity
        else:
            return np.exp(losses)
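# A worked sketch of the length normalization in this score() variant, with
# made-up numbers: assuming `loss` is a per-sentence sum over target tokens,
# dividing it by len(target)**normalize (default exponent 0.6) keeps long
# sentences from scoring worse than short ones simply because they are long.
import numpy as np

def length_normalized_loss(sentence_loss: float, target_len: int, normalize: float = 0.6) -> float:
    return sentence_loss / np.power(target_len, normalize)

short_score = length_normalized_loss(sentence_loss=6.0, target_len=5)    # ~2.28
long_score = length_normalized_loss(sentence_loss=12.0, target_len=20)   # ~1.99
# unnormalized, the longer sentence looks twice as bad (12 vs. 6);
# normalized, it actually scores slightly better per sentence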
def score(source_data: str,
          target_data: str,
          load_from: str,
          corpus_average: bool,
          normalize: bool,
          **kwargs):
    """Scores a text using a trained translation model. See argument description in `bin/daikon`."""

    # fix batch size at 1 to get individual scores for sentences
    batch_size = 1

    source_vocab = Vocabulary()
    target_vocab = Vocabulary()
    source_vocab.load(os.path.join(load_from, C.SOURCE_VOCAB_FILENAME))
    target_vocab.load(os.path.join(load_from, C.TARGET_VOCAB_FILENAME))

    reader_ids = list(reader.read_parallel(source_data, target_data, source_vocab, target_vocab, C.SCORE_MAX_LEN))

    encoder_inputs, decoder_targets, decoder_inputs, loss, _, _, _ = define_computation_graph(source_vocab.size, target_vocab.size, batch_size)

    saver = tf.train.Saver()

    with tf.Session() as session:
        # load model
        saver.restore(session, os.path.join(load_from, C.MODEL_FILENAME))

        losses = []
        total_iter = 0
        for x, y, z in reader.iterate(reader_ids, batch_size, shuffle=False):
            feed_dict = {encoder_inputs: x,
                         decoder_inputs: y,
                         decoder_targets: z}
            l = session.run([loss], feed_dict=feed_dict)
            # first session variable
            l = l[0]
            if normalize:
                # normalize by length of target sequence (including EOS token)
                l /= y.shape[1]
            losses.append(l)
            total_iter += 1

        if corpus_average:
            total_loss = np.sum(losses)
            perplexity = np.exp(total_loss / total_iter)
            return perplexity
        else:
            return np.exp(losses)
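# A small sketch of the two return modes of score() above, with made-up loss
# values: corpus_average=True collapses the per-sentence losses into a single
# perplexity-style number, exp(mean sentence loss); otherwise one exp(loss)
# score is returned per sentence.
import numpy as np

toy_losses = [2.0, 3.0, 4.0]                                      # one loss per sentence
corpus_perplexity = np.exp(np.sum(toy_losses) / len(toy_losses))  # exp(3.0), roughly 20.1
sentence_scores = np.exp(toy_losses)                              # roughly [7.4, 20.1, 54.6]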
def train(source_data: str,
          target_data: str,
          epochs: int,
          batch_size: int,
          source_vocab_max_size: int,
          target_vocab_max_size: int,
          save_to: str,
          log_to: str,
          sample_after_epoch: bool,
          source_val_data: str = None,
          target_val_data: str = None,
          val_epochs: int = C.VAL_EPOCHS,
          patience: int = C.PATIENCE,
          overwrite: bool = False,
          **kwargs) -> None:
    """Trains a translation model. See argument description in `bin/daikon`."""

    # enable early stopping if validation data files were specified
    early_stopping = source_val_data is not None and target_val_data is not None

    # create folders for model and logs if they don't exist yet
    for folder in [save_to, log_to]:
        if not os.path.exists(folder):
            os.makedirs(folder)

    if early_stopping:
        val_model_dir = os.path.join(save_to, C.VALIDATION_MODEL_DIR)
        if not os.path.exists(val_model_dir):
            os.makedirs(val_model_dir)

    # create a new graph if the overwrite option is enabled or there is no
    # existing model in the save_to directory
    checkpoint_file = os.path.join(save_to, C.MODEL_CHECKPOINT)
    initialize_graph = overwrite or not os.path.exists(checkpoint_file)

    source_vocab = Vocabulary()
    target_vocab = Vocabulary()

    if not initialize_graph:
        # load existing vocabulary that maps words to ids, for source and target
        logger.info("Loading vocabularies.")
        source_vocab.load(os.path.join(save_to, C.SOURCE_VOCAB_FILENAME))
        target_vocab.load(os.path.join(save_to, C.TARGET_VOCAB_FILENAME))
    else:
        # create vocabulary to map words to ids, for source and target
        logger.info("Creating vocabularies.")
        source_vocab = create_vocab(source_data, source_vocab_max_size, save_to, C.SOURCE_VOCAB_FILENAME)
        target_vocab = create_vocab(target_data, target_vocab_max_size, save_to, C.TARGET_VOCAB_FILENAME)

    logger.info("Source vocabulary: %s", source_vocab)
    logger.info("Target vocabulary: %s", target_vocab)

    if early_stopping:
        # create copies of vocabulary files used for checking validation
        # data performance
        source_vocab.save(os.path.join(val_model_dir, C.SOURCE_VOCAB_FILENAME))
        target_vocab.save(os.path.join(val_model_dir, C.TARGET_VOCAB_FILENAME))

    # convert training data to list of word ids
    logger.info("Reading training data.")
    reader_ids = list(reader.read_parallel(source_data, target_data, source_vocab, target_vocab, C.MAX_LEN))

    # define computation graph
    logger.info("Building computation graph.")
    graph_components = define_computation_graph(source_vocab.size, target_vocab.size, batch_size)
    encoder_inputs, decoder_targets, decoder_inputs, loss, train_step, decoder_logits, summary = graph_components

    saver = tf.train.Saver()

    with tf.Session() as session:
        if initialize_graph:
            # init
            session.run(tf.global_variables_initializer())
        else:
            # load/restore model for further training
            saver.restore(session, os.path.join(save_to, C.MODEL_FILENAME))

        # write logs (@tensorboard)
        summary_writer = tf.summary.FileWriter(log_to, graph=tf.get_default_graph())

        logger.info("Starting training.")
        tic = time.time()
        num_batches = math.floor(len(reader_ids) / batch_size)

        if early_stopping:
            # initialize metrics for checking validation data performance
            best_val_loss = float("inf")
            epochs_without_improvement = 0

        # iterate over training data `epochs` times
        for epoch in range(1, epochs + 1):
            total_loss = 0.0
            total_iter = 0
            iter_tic = time.time()

            for x, y, z in reader.iterate(reader_ids, batch_size, shuffle=True):
                feed_dict = {encoder_inputs: x,
                             decoder_inputs: y,
                             decoder_targets: z}
                l, _, s = session.run([loss, train_step, summary], feed_dict=feed_dict)
                summary_writer.add_summary(s, total_iter)
                total_loss += l
                total_iter += 1
                if total_iter % C.LOGGING_INTERVAL == 0 or total_iter == num_batches:
                    iter_taken = time.time() - iter_tic
                    logger.debug("Epoch=%s, iteration=%s/%s, samples/second=%.2f",
                                 epoch, total_iter, num_batches,
                                 batch_size * C.LOGGING_INTERVAL / float(iter_taken))
                    iter_tic = time.time()

            perplexity = np.exp(total_loss / total_iter)
            logger.info("Perplexity on training data after epoch %s: %.2f", epoch, perplexity)

            save_model = True
            if early_stopping and epoch % val_epochs == 0:
                # save a copy of the current model that can be used to check
                # its performance for the validation data
                saver.save(session, os.path.join(val_model_dir, C.MODEL_FILENAME))

                # spin off a thread to call score() for the validation data
                thread_pool = ThreadPool(processes=1)
                score_result = thread_pool.apply_async(score, (source_val_data, target_val_data, val_model_dir, True, False))
                latest_val_loss = score_result.get()
                logging.info("Current model perplexity on validation data: %.2f", latest_val_loss)

                if latest_val_loss < best_val_loss:
                    logging.info("Lowest perplexity on validation data achieved")
                    best_val_loss = latest_val_loss
                    epochs_without_improvement = 0
                else:
                    save_model = False
                    epochs_without_improvement += 1
                    if epochs_without_improvement >= patience:
                        logging.info("No improvement in validation data perplexity for %d epochs: terminating training",
                                     epochs_without_improvement)
                        return

            if save_model:
                saver.save(session, os.path.join(save_to, C.MODEL_FILENAME))

            if sample_after_epoch:
                # sample from model after epoch
                thread = threading.Thread(target=_sample_after_epoch,
                                          args=[reader_ids, source_vocab, target_vocab, save_to, epoch])
                thread.start()

        taken = time.time() - tic
        m, s = divmod(taken, 60)
        h, m = divmod(m, 60)

        logger.info("Training finished. Overall time taken to train: %d:%02d:%02d" % (h, m, s))
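# The early-stopping rule used in train() above, isolated as a small sketch so
# the bookkeeping is easy to follow: the best validation perplexity is tracked,
# and training stops after `patience` successive checks without improvement.
# The function name should_stop and the toy values are assumptions for
# illustration only.
def should_stop(val_perplexities: List[float], patience: int) -> bool:
    best = float("inf")
    epochs_without_improvement = 0
    for perplexity in val_perplexities:
        if perplexity < best:
            best = perplexity
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                return True
    return False

assert should_stop([10.0, 8.0, 8.5, 8.4, 8.3], patience=3) is True
assert should_stop([10.0, 8.0, 8.5, 7.9], patience=3) is False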
def translate_line(session: tf.Session,
                   line: str,
                   source_vocab: vocab.Vocabulary,
                   target_vocab: vocab.Vocabulary,
                   encoder_inputs: tf.Tensor,
                   decoder_inputs: tf.Tensor,
                   decoder_targets: tf.Tensor,
                   decoder_logits: tf.Tensor) -> str:
    """Translates one single input string, using beam search."""
    source_ids = np.array(source_vocab.get_ids(line.split())).reshape(1, -1)

    # instead of one list of target ids, keep a dictionary of float: list pairs,
    # where the float is the probability of that partial translation

    # number of beams
    k = 5

    # hypotheses that are still being expanded, and finished translations
    sent_dict = {}
    finished_sent_dict = {}

    for _ in range(C.TRANSLATION_MAX_LEN):
        # target ids will serve as decoder inputs and decoder targets,
        # but decoder targets will not be used to compute logits
        potential_sentences = {}

        if k == 0:
            break

        if len(sent_dict) == 0:
            # first step: expand <BOS> into the k most probable start symbols
            target_ids = np.array([C.BOS_ID]).reshape(1, -1)
            feed_dict = {encoder_inputs: source_ids,
                         decoder_inputs: target_ids,
                         decoder_targets: target_ids}
            logits_result = session.run([decoder_logits], feed_dict=feed_dict)
            next_symbol_logits = softmax(logits_result[0][0][-1])

            # retrieve the k highest-scoring ids: loop argmax and mask out each
            # id once it has been found (deleting entries would shift the
            # remaining indices and corrupt the vocabulary ids)
            potential_next_ids = []
            for __ in range(k):
                next_id = np.argmax(next_symbol_logits)
                next_id_value = next_symbol_logits[next_id]
                potential_next_ids.append((next_id, next_id_value))
                next_symbol_logits[next_id] = -np.inf

            for new_id, new_value in potential_next_ids:
                if new_id not in [C.EOS_ID, C.PAD_ID]:
                    sent_dict[new_value] = (new_id,)
        else:
            # later steps: expand every partial hypothesis by its k best continuations
            for prob, sent in sent_dict.items():
                target_ids = np.array([C.BOS_ID] + list(sent)).reshape(1, -1)
                feed_dict = {encoder_inputs: source_ids,
                             decoder_inputs: target_ids,
                             decoder_targets: target_ids}
                logits_result = session.run([decoder_logits], feed_dict=feed_dict)

                # first session result, first item in batch, target symbol at last position
                next_symbol_logits = softmax(logits_result[0][0][-1])

                # retrieve the k highest-scoring ids, again masking instead of deleting
                potential_next_ids = []
                for __ in range(k):
                    next_id = np.argmax(next_symbol_logits)
                    next_id_value = next_symbol_logits[next_id]
                    potential_next_ids.append((next_id, next_id_value))
                    next_symbol_logits[next_id] = -np.inf

                for new_id, new_value in potential_next_ids:
                    new_sent = list(sent)
                    new_sent.append(new_id)
                    potential_sentences[prob * new_value] = new_sent

            # clear sent_dict for the next loop
            sent_dict = {}

            # decide which k sentences are kept
            potential_sentences = sorted(potential_sentences.items(), reverse=True)[:k]
            for val, sent in potential_sentences:
                if sent[-1] in [C.EOS_ID, C.PAD_ID]:
                    # hypothesis ends in <EOS> or <PAD>: move it to the finished
                    # translations and free up its beam
                    finished_sent_dict[val] = sent
                    k -= 1
                else:
                    sent_dict[val] = sent

    # normalize the finished sentences by length**alpha
    norm_dict = {}
    for val, sent in finished_sent_dict.items():
        if len(sent) > 0:
            val = np.log10(val) / len(sent) ** 0.65
            norm_dict[val] = sent

    # only return the best translation
    try:
        best_sent = sorted(norm_dict.items(), reverse=True)[0][1]
    except IndexError:
        # no hypothesis finished within C.TRANSLATION_MAX_LEN steps
        print("empty line...")
        best_sent = []

    words = target_vocab.get_words(best_sent)
    return ' '.join(words)
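# The trickiest part of the beam-search variant above is picking the k best
# next ids without corrupting their vocabulary indices. A self-contained numpy
# sketch of the masking approach used there, next to an equivalent argsort
# one-liner; the toy probabilities are made up for illustration.
import numpy as np

def top_k_ids(probs: np.ndarray, k: int) -> list:
    probs = probs.copy()
    result = []
    for _ in range(k):
        best = int(np.argmax(probs))
        result.append((best, float(probs[best])))
        probs[best] = -np.inf   # mask, don't delete: deleting shifts the remaining indices
    return result

toy_probs = np.array([0.1, 0.4, 0.05, 0.3, 0.15])
assert [i for i, _ in top_k_ids(toy_probs, 3)] == [1, 3, 4]
assert [int(i) for i in np.argsort(toy_probs)[::-1][:3]] == [1, 3, 4]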