def __init__(self, fname, mode=None):
    """Load sentences from *fname*; keep only the first 50 000 in 'train' mode.

    Sets `data` (the sentences), `lens` (per-sentence length) and `size`.
    """
    self.fname = fname
    sentences = exh.read_file(fname)
    # Cap the training split to a fixed-size subset.
    self.data = sentences[:50000] if mode == 'train' else sentences
    self.lens = [len(s) for s in self.data]
    self.size = len(self.data)
def run(args):
    """Plot caption coverage as a function of vocabulary size.

    Reads captions from ``args.INPUT``, ranks words by frequency and, for each
    prefix of that ranking, computes how many captions contain at least one of
    the ranked words.  Writes two plots: absolute counts (``voc_tot.png``) and
    ratios (``voc_ratio.png``).
    """
    captions = exh.read_file(args.INPUT)
    n_captions = len(captions)

    # word -> {"freq": occurrences, "id": insertion id, "captions": set of caption indices}
    vocab = {}
    id_token = 4  # ids 0-3 are reserved for <pad>/<bos>/<eos>/<unk> elsewhere
    for i, caption in enumerate(captions):
        for word in caption.split():
            if word in vocab:
                vocab[word]["freq"] += 1
                vocab[word]["captions"].add(i)
            else:
                vocab[word] = {"freq": 1, "id": id_token, "captions": {i}}
                id_token += 1

    # Most frequent words first.
    top = sorted(vocab.items(), key=lambda x: x[1]['freq'], reverse=True)
    n_vocab = len(top)

    tots = []
    ratios = []
    x_ticks = []
    covered = set()
    for i in range(n_vocab):
        x_ticks.append(i + 1)
        # In-place update: the original rebuilt the accumulator with
        # `covered = covered | set(...)` every iteration, which is
        # accidentally O(n^2) over the whole loop (and `captions` is
        # already a set, so the extra set() wrap was redundant).
        covered.update(top[i][1]['captions'])
        tots.append(len(covered))
        ratios.append(len(covered) / n_captions)

    plot_lines(x_ticks, [tots], ['Number of captions covered'], 'voc_tot.png',
               'Number of words', 'Number of captions covered',
               has_legend=False, step=100)
    plot_lines(x_ticks, [ratios], ['Ratio of captions covered'], 'voc_ratio.png',
               'Number of words', 'Ratio of captions covered',
               has_legend=False, step=100)
def run(args):
    """Build a frequency-capped vocabulary from ``args.INPUT`` and write it as JSON.

    Keeps the ``args.MAX`` most frequent words plus the four special tokens,
    re-numbers the survivors densely in frequency order, and stores the
    id->token mapping under the ``token_list`` key of the output JSON.
    """
    # Special tokens get infinite frequency so they always survive the cut
    # and keep their reserved ids 0-3.
    vocab = {
        "<pad>": {"id": 0, "freq": float('inf')},
        "<bos>": {"id": 1, "freq": float('inf')},
        "<eos>": {"id": 2, "freq": float('inf')},
        "<unk>": {"id": 3, "freq": float('inf')},
    }
    captions = exh.read_file(args.INPUT)

    next_id = 4
    for caption in captions:
        for word in caption.split():
            if word in vocab:
                vocab[word]["freq"] += 1
            else:
                vocab[word] = {"freq": 1, "id": next_id}
                next_id += 1

    # Rank by frequency and keep the MAX most frequent words (+ 4 specials).
    ranked = sorted(vocab.items(), key=lambda entry: entry[1]['freq'], reverse=True)
    ranked = ranked[:args.MAX + 4]

    tokens = [None] * (args.MAX + 4)
    vocab = {}
    next_id = 4
    for token, info in ranked:
        if info['id'] < 4:
            # Special token: keep its reserved slot untouched.
            vocab[token] = info
            tokens[info['id']] = token
        else:
            # Re-number surviving words densely, in frequency order.
            vocab[token] = {'id': next_id, 'freq': info['freq']}
            tokens[next_id] = token
            next_id += 1

    vocab["token_list"] = tokens
    exh.write_json(vocab, args.OUTPUT)
def run(args):
    """Normalize captions from ``args.INPUT`` and write them beside the input.

    Normalization: remove punctuation, collapse double spaces, lowercase,
    and drop a single trailing space.  Output file is named
    ``norm_<input filename>`` in the same folder as the input.
    """
    captions = exh.read_file(args.INPUT)

    # One-pass C-level removal of every punctuation character
    # (replaces the original per-character join/filter loop).
    strip_punct = str.maketrans('', '', string.punctuation)

    norm_captions = []
    for caption in captions:
        norm_caption = caption.translate(strip_punct)
        # NOTE(review): the original called replace(" ", " ") — a no-op —
        # presumably intended to collapse the double spaces that punctuation
        # removal leaves behind ("a , b" -> "a  b"); confirm against the
        # un-mangled original source.
        norm_caption = norm_caption.replace("  ", " ").lower()
        # Guard the empty string: the original indexed [-1] unconditionally
        # and raised IndexError on captions made entirely of punctuation.
        if norm_caption.endswith(" "):
            norm_caption = norm_caption[:-1]
        norm_captions.append(norm_caption)

    filename = args.INPUT.split('/')[-1]
    folder = args.INPUT[:len(args.INPUT) - len(filename)]
    output_file = "{0}norm_{1}".format(folder, filename)
    exh.write_text("\n".join(norm_captions), output_file)
def __init__(self, fname, vocab, bos=True, key=None, mode=None):
    """Read captions from *fname* and tokenize them against *vocab*.

    In 'train' mode only the first 50 000 captions are kept.  Stores the
    token sequences in `data`, their lengths in `lengths`, and the total
    count in `size`.
    """
    self.fname = fname
    self.key = key
    captions = exh.read_file(fname)
    if mode == 'train':
        captions = captions[:50000]
    # One token sequence per caption; lengths kept alongside for sampling.
    self.data = [uvoc.words2tokens(c, vocab, bos) for c in captions]
    self.lengths = [len(tokens) for tokens in self.data]
    self.size = len(self.data)
def run(args):
    """Train the WGAN captioning model described by the JSON config at ``args.CONFIG``.

    Each epoch: one pass over the training set (alternating D/G updates inside
    the model's forward), then greedy decoding on the validation set and a
    BLEU score.  The best-scoring weights are checkpointed.  Training stops
    when BLEU has not improved for 3 epochs.  When logging is enabled, decoded
    sentences and a ``scores.json`` summary are written under ``output/``.
    """
    # Get configuration
    config = exh.load_json(args.CONFIG)

    # Prepare folders for logging
    logging = config['logging']['activate']
    if logging:
        exh.create_directory("output")
        output = os.path.join("output", config['logging']['output_folder'])
        exh.create_directory(output)

    # Global initialization
    torch.cuda.init()
    device = torch.device(config['cuda']['device'] if (
        torch.cuda.is_available() and config['cuda']['ngpu'] > 0) else "cpu")
    fix_seed(config['seed'])

    # Load vocabulary
    vocab = exh.load_json(config['data']['vocab'])

    # Prepare references for BLEU scoring
    references = exh.read_file(config['data']['beam']['captions'])
    references = prepare_references(references)

    # Prepare datasets and dataloaders
    training_dataset = CaptioningDataset(config['data']['train'], "train",
                                         vocab, config['sampler']['train'])
    train_iterator = DataLoader(
        training_dataset,
        batch_sampler=training_dataset.sampler,
        collate_fn=training_dataset.collate_fn,
        pin_memory=config['iterator']['train']['pin_memory'],
        num_workers=config['iterator']['train']['num_workers'])
    beam_dataset = CaptioningDataset(config['data']['beam'], "beam",
                                     vocab, config['sampler']['beam'])
    beam_iterator = DataLoader(
        beam_dataset,
        batch_sampler=beam_dataset.sampler,
        collate_fn=beam_dataset.collate_fn,
        pin_memory=config['iterator']['beam']['pin_memory'],
        num_workers=config['iterator']['beam']['num_workers'])

    # Prepare model (optionally seeding embeddings with GloVe vectors)
    weights = None
    if len(config['model']['embeddings']) > 0:
        weights = uvoc.init_weights(vocab, config['model']['emb_dim'])
        uvoc.glove_weights(weights, config['model']['embeddings'], vocab)

    model = WGAN(len(vocab['token_list']), config['model'], weights)
    model.reset_parameters()

    # Separate Adam optimizers for discriminator and generator,
    # sharing the same hyper-parameters.
    lr = config['model']['optimizers']['lr']
    betas = (config['model']['optimizers']['betas']['min'],
             config['model']['optimizers']['betas']['max'])
    weight_decay = config['model']['optimizers']['weight_decay']
    optim_D = optim.Adam(model.D.parameters(), lr=lr, betas=betas,
                         weight_decay=weight_decay)
    optim_G = optim.Adam(model.G.parameters(), lr=lr, betas=betas,
                         weight_decay=weight_decay)

    model.to(device)
    fix_seed(config['seed'] + 1)

    scores = {"BLEU": [], "G_loss_train": [], "D_loss_train": []}
    max_bleu = config['BLEU']['max_bleu']
    # BUG FIX: the original `[[]] * max_bleu` replicated ONE list object, so
    # appending to bleus[max_bleu - 1] appended to every slot at once.
    bleus = [[] for _ in range(max_bleu)]
    best_bleu = (0, 1)  # (best score so far, epoch it was reached)

    # BUG FIX: `output` is only bound when logging is active, but checkpoints
    # were unconditionally saved under it (NameError when logging was off).
    # Fall back to the current directory so the best model is still saved.
    save_dir = output if logging else "."

    model.train(True)
    torch.set_grad_enabled(True)

    epoch = 1
    while True:
        secs = time.time()
        print("Starting Epoch {}".format(epoch))

        # ---- Training pass ----
        iteration = 1
        d_batch = 0
        g_batch = 0
        d_loss = 0
        g_loss = 0
        for batch in train_iterator:
            batch.device(device)
            out = model(batch, optim_G, optim_D, epoch, iteration)
            d_loss += out['D_loss']
            d_batch += 1
            g_loss += out['G_loss']
            g_batch += 1
            iteration += 1
        print(
            "Training : Mean G loss : {} / Mean D loss : {} ({} seconds elapsed)"
            .format(g_loss / g_batch, d_loss / d_batch, time.time() - secs))
        scores['G_loss_train'].append((g_loss / g_batch))
        scores['D_loss_train'].append((d_loss / d_batch))

        # ---- Validation ----
        model.train(False)
        torch.set_grad_enabled(False)

        print("Beam search...")
        # Greedy (argmax) decoding of the validation set.
        generated_sentences = max_search(
            model, beam_iterator, vocab,
            max_len=config['beam_search']['max_len'], device=device)

        # BLEU score against the prepared references.
        score = bleu_score(references, generated_sentences, max_bleu)
        bleus[max_bleu - 1].append(score)
        print("BLEU-{} score : {}".format(max_bleu, score))
        if score > best_bleu[0]:
            best_bleu = (score, epoch)
            filename = 'output_epoch{}_bleu{}'.format(epoch, score)
            out_file = os.path.join(save_dir, filename)
            torch.save(model.state_dict(), out_file)
        print("Best BLEU so far : {} (Epoch {})".format(
            best_bleu[0], best_bleu[1]))

        if logging:
            output_file = 'output_{}'.format(epoch)
            output_sentences = os.path.join(output, output_file)
            exh.write_text('\n'.join(generated_sentences), output_sentences)

        model.train(True)
        torch.set_grad_enabled(True)

        print("Epoch finished in {} seconds".format(time.time() - secs))

        # Early stopping: no BLEU improvement for 3 consecutive epochs.
        if epoch - best_bleu[1] == 3:
            break
        epoch += 1

    if logging:
        scores['BLEU'] = bleus
        output_scores = os.path.join(output, 'scores.json')
        exh.write_json(scores, output_scores)
        print("Scores saved in {}".format(output_scores))
def run(args):
    """Evaluate a trained WGAN captioning model on the test split.

    Loads the checkpoint named in ``config['load_dict']``, decodes the test
    set twice (greedy argmax, then beam search), writes the sentences to
    ``output_argmax`` / ``output_beam`` in the current directory, and prints
    the BLEU-4 score of each.
    """
    print(torch.backends.cudnn.benchmark)
    torch.backends.cudnn.deterministic = True
    # Get configuration
    config = exh.load_json(args.CONFIG)
    # Global initialization
    torch.cuda.init()
    device = torch.device(config['cuda']['device'] if (
        torch.cuda.is_available() and config['cuda']['ngpu'] > 0) else "cpu")
    seed = fix_seed(config['seed'])
    # Load vocabulary
    vocab = exh.load_json(config['data']['vocab'])
    # Prepare references for BLEU scoring
    references = exh.read_file(config['data']['test']['captions'])
    references = prepare_references(references)
    # Test-set loader (uses the "beam" dataset mode)
    beam_dataset = CaptioningDataset(config['data']['test'], "beam", vocab,
                                     config['sampler']['test'])
    beam_iterator = DataLoader(
        beam_dataset,
        batch_sampler=beam_dataset.sampler,
        collate_fn=beam_dataset.collate_fn,
        pin_memory=config['iterator']['test']['pin_memory'],
        num_workers=config['iterator']['test']['num_workers'])
    # Prepare model (optionally seeding embeddings with GloVe vectors)
    weights = None
    if len(config['model']['embeddings']) > 0:
        weights = uvoc.init_weights(vocab, config['model']['emb_dim'])
        uvoc.glove_weights(weights, config['model']['embeddings'], vocab)
    model = WGAN(len(vocab['token_list']), config['model'], weights)
    model.reset_parameters()
    print("The state dict keys: \n\n", model.state_dict().keys())
    model.load_state_dict(torch.load(config['load_dict']))
    # Freeze all parameters for inference.
    for param in list(model.parameters()):
        param.requires_grad = False
    # NOTE(review): the checkpoint is loaded a second time and every 1-D/2-D
    # tensor is copied back in place over the already-loaded state dict.
    # This looks redundant after load_state_dict above (and silently skips
    # tensors of other ranks) — confirm whether it works around a specific
    # loading issue before removing.
    c = torch.load(config['load_dict'])
    for x in model.state_dict():
        if len(model.state_dict()[x].shape) == 1:
            model.state_dict()[x][:] = c[x]
        elif len(model.state_dict()[x].shape) == 2:
            model.state_dict()[x][:, :] = c[x]
    model.to(device)
    fix_seed(config['seed'] + 1)
    model.train(False)
    torch.set_grad_enabled(False)
    model.eval()
    # Directly rebind the generator's embedding weights from the checkpoint.
    model.G.emb.weight.data = c['G.emb.weight']
    # ---- Greedy (argmax) decoding ----
    generated_sentences = max_search(model, beam_iterator, vocab,
                                     max_len=config['beam_search']['max_len'],
                                     device=device)
    output_file = 'output_argmax'
    output_sentences = output_file
    exh.write_text('\n'.join(generated_sentences), output_sentences)
    score = bleu_score(references, generated_sentences, 4)
    print(score)
    # ---- Beam-search decoding ----
    generated_sentences = beam_search(
        [model],
        beam_iterator,
        vocab,
        beam_size=config['beam_search']['beam_size'],
        max_len=config['beam_search']['max_len'],
        device=device)
    output_file = 'output_beam'
    output_sentences = output_file
    exh.write_text('\n'.join(generated_sentences), output_sentences)
    score = bleu_score(references, generated_sentences, 4)
    print(score)