def load_multi30k(config, device):
    # Download and convert Multi30k to CSV on first use, then load it from CSV.
    csv_dir_path = get_or_create_dir('.data', 'multi30k')
    if not os.path.exists(f'{csv_dir_path}/train.csv'):
        source_field = torchtext.data.Field(tokenize=tokenize_de)
        target_field = torchtext.data.Field(tokenize=tokenize_en)
        # splits() triggers the download of the raw .de/.en files into .data.
        torchtext.datasets.Multi30k.splits(exts=('.de', '.en'), fields=(source_field, target_field))
        create_multi30k()
    return load_from_csv(config, csv_dir_path, tokenize_de, tokenize_en, device)
def train(config, sample_validation_batches):
    source_language = config.get('src_language')
    target_language = config.get('trg_language')
    EOS_token = config.get('EOS_token')
    PAD_token = config.get('PAD_token')
    SOS_token = config.get('SOS_token')
    train_iter = config.get('train_iter')
    val_iter = config.get('val_iter')
    writer_path = config.get('writer_path')
    writer_train_path = get_or_create_dir(writer_path, 'train')
    writer_val_path = get_or_create_dir(writer_path, 'val')
    writer_train = SummaryWriter(log_dir=writer_train_path)
    writer_val = SummaryWriter(log_dir=writer_val_path)
    epochs = config.get('epochs')
    training = config.get('training')
    eval_every = training.get('eval_every')
    sample_every = training.get('sample_every')
    use_attention = config.get('use_attention')
    step = 1
    for epoch in range(epochs):
        print(f'Epoch: {epoch+1}/{epochs}')
        save_weights(config)
        for i, training_batch in enumerate(train_iter):
            loss = train_batch(config, training_batch)
            writer_train.add_scalar('loss', loss, step)

            if step == 1 or step % eval_every == 0:
                # Evaluate on the full validation set: average the per-batch losses
                # and collect reference/translation token lists for corpus-level BLEU.
                # val_lengths counts validation batches for the loss average.
                val_lengths = 0
                val_losses = 0
                reference_corpus = []
                translation_corpus = []
                for val_batch in val_iter:
                    val_loss, translations = evaluate_batch(config, val_batch)
                    val_lengths += 1
                    val_losses += val_loss
                    val_batch_trg, _ = val_batch.trg
                    _, batch_size = val_batch_trg.shape
                    # Each reference is wrapped in a singleton list: one reference per source sentence.
                    references = map(lambda i: torch2words(target_language, val_batch_trg[:, i]), range(batch_size))
                    references = map(lambda words: [list(filter_words(words, SOS_token, EOS_token, PAD_token))], references)
                    reference_corpus.extend(references)
                    translations = map(lambda translation: list2words(target_language, translation), translations)
                    translations = map(lambda words: list(filter_words(words, SOS_token, EOS_token, PAD_token)), translations)
                    translation_corpus.extend(translations)
                bleu = compute_bleu(reference_corpus, translation_corpus)
                val_loss = val_losses / val_lengths
                writer_val.add_scalar('bleu', bleu, step)
                writer_val.add_scalar('loss', val_loss, step)

            if step % sample_every == 0:
                # Sample a single validation batch and log a qualitative translation
                # (plus the attention heatmap when attention is enabled).
                val_batch = sample_validation_batches(1)
                val_batch_src, val_lengths_src = val_batch.src
                val_batch_trg, _ = val_batch.trg
                s0 = val_lengths_src[0].item()
                _, translations, attention_weights = evaluate_batch(config, val_batch, True)
                source_words = torch2words(source_language, val_batch_src[:, 0])
                target_words = torch2words(target_language, val_batch_trg[:, 0])
                translation_words = list(filter(lambda word: word != PAD_token, list2words(target_language, translations[0])))
                if use_attention and sum(attention_weights.shape) != 0:
                    attention_figure = visualize_attention(source_words[:s0], translation_words, with_cpu(attention_weights))
                    writer_val.add_figure('attention', attention_figure, step)
                text = get_text(source_words, target_words, translation_words, SOS_token, EOS_token, PAD_token)
                writer_val.add_text('translation', text, step)

            step += 1
    save_weights(config)
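# A minimal sketch of the config dict that train() reads, inferred only from the
# .get() calls above. The concrete values are illustrative assumptions, and the
# model/optimizer entries consumed by train_batch, evaluate_batch and save_weights
# (defined elsewhere in the project) are not shown here.
def make_example_config(source_language, target_language, train_iter, val_iter):
    return {
        'src_language': source_language,
        'trg_language': target_language,
        'SOS_token': '<SOS>',  # assumed token strings
        'EOS_token': '<EOS>',
        'PAD_token': '<PAD>',
        'train_iter': train_iter,
        'val_iter': val_iter,
        'writer_path': 'runs/seq2seq',  # assumed TensorBoard log directory
        'epochs': 10,
        'training': {'eval_every': 100, 'sample_every': 500},
        'use_attention': True,
    }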
def load_dummy_variable_length(config, device):
    csv_dir_path = get_or_create_dir('.data', 'dummy_variable_length')
    if not os.path.exists(f'{csv_dir_path}/train.csv'):
        create_dummy_variable_length_csv()
    return load_from_csv(config, csv_dir_path, tokenize_dummy, tokenize_dummy, device)
def load_debug(config, device):
    csv_dir_path = get_or_create_dir('.data', 'debug')
    if not os.path.exists(f'{csv_dir_path}/train.csv'):
        create_debug_csv()
    return load_from_csv(config, csv_dir_path, tokenize_de, tokenize_en, device)
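# A small convenience sketch (not part of the original module) showing that the
# three loaders above share one calling convention; the dataset name keys are
# assumptions chosen to mirror the .data subdirectory names.
def load_dataset(name, config, device):
    loaders = {
        'multi30k': load_multi30k,
        'dummy_variable_length': load_dummy_variable_length,
        'debug': load_debug,
    }
    return loaders[name](config, device)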