def main():
    args = get_arguments()
    with open(os.path.join('arguments', args.arg + '.yaml'), encoding='utf8') as f:
        SETTING = Dict(yaml.safe_load(f))
    print(args)
    args.device = list(map(str, args.device))
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(args.device)

    # image transformer
    transform = transforms.Compose([
        transforms.Resize((SETTING.imsize, SETTING.imsize)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    if args.dataset == 'coco':
        val_dset = CocoDset(root=SETTING.root_path,
                            img_dir='val2017',
                            ann_dir='annotations/captions_val2017.json',
                            transform=transform)
    val_loader = DataLoader(val_dset,
                            batch_size=SETTING.batch_size,
                            shuffle=False,
                            num_workers=SETTING.n_cpu,
                            collate_fn=collater)

    vocab = Vocabulary(max_len=SETTING.max_len)
    vocab.load_vocab(args.vocab_path)

    imenc = ImageEncoder(SETTING.out_size, SETTING.cnn_type)
    capenc = CaptionEncoder(len(vocab), SETTING.emb_size, SETTING.out_size,
                            SETTING.rnn_type)
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    imenc = imenc.to(device)
    capenc = capenc.to(device)

    assert args.checkpoint is not None
    print("loading model and optimizer checkpoint from {} ...".format(
        args.checkpoint), flush=True)
    ckpt = torch.load(args.checkpoint, map_location=device)
    imenc.load_state_dict(ckpt["encoder_state"])
    capenc.load_state_dict(ckpt["decoder_state"])

    begin = time.time()
    dset = EmbedDset(val_loader, imenc, capenc, vocab, args)
    print("database created | {} ".format(sec2str(time.time() - begin)),
          flush=True)

    savedir = os.path.join("out", args.config_name)
    if not os.path.exists(savedir):
        os.makedirs(savedir, 0o777)

    image = dset.embedded["image"]
    caption = dset.embedded["caption"]
    n_i = image.shape[0]
    n_c = caption.shape[0]
    all_emb = np.concatenate([image, caption], axis=0)  # avoid shadowing built-in `all`
    emb_file = os.path.join(savedir, "embedding_{}.npy".format(n_i))
    save_file = os.path.join(savedir, "{}.npy".format(SETTING.method))
    vis_file = os.path.join(savedir, "{}.png".format(SETTING.method))
    np.save(emb_file, all_emb)
    print("saved embeddings to {}".format(emb_file), flush=True)
    dimension_reduction(emb_file, save_file, method=SETTING.method)
    plot_embeddings(save_file, n_i, vis_file, method=SETTING.method)
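# A minimal sketch of what the `dimension_reduction` / `plot_embeddings`
# helpers called above could look like, assuming scikit-learn and matplotlib;
# the helper names and signatures mirror the call sites, but the "pca"/"tsne"
# method strings and the internals are assumptions, not the project's code.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE


def dimension_reduction(emb_file, save_file, method="pca"):
    emb = np.load(emb_file)                        # (n_i + n_c, out_size)
    if method == "pca":
        reduced = PCA(n_components=2).fit_transform(emb)
    elif method == "tsne":
        reduced = TSNE(n_components=2).fit_transform(emb)
    else:
        raise ValueError("unknown method: {}".format(method))
    np.save(save_file, reduced)                    # (n_i + n_c, 2)


def plot_embeddings(save_file, n_i, vis_file, method="pca"):
    reduced = np.load(save_file)
    plt.figure(figsize=(8, 8))
    # first n_i rows are image embeddings, the rest are caption embeddings
    plt.scatter(reduced[:n_i, 0], reduced[:n_i, 1], s=2, label="image")
    plt.scatter(reduced[n_i:, 0], reduced[n_i:, 1], s=2, label="caption")
    plt.legend()
    plt.title(method)
    plt.savefig(vis_file)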
def main():
    args = get_arguments()
    with open(os.path.join('arguments', args.arg + '.yaml'), encoding='utf8') as f:
        SETTING = Dict(yaml.safe_load(f))
    print(args)
    args.device = list(map(str, args.device))
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(args.device)

    transform = transforms.Compose([
        transforms.Resize((SETTING.imsize, SETTING.imsize)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    if args.dataset == 'coco':
        val_dset = CocoDset(root=SETTING.root_path,
                            img_dir='val2017',
                            ann_dir='annotations/captions_val2017.json',
                            transform=transform)
    val_loader = DataLoader(val_dset,
                            batch_size=SETTING.batch_size,
                            shuffle=False,
                            num_workers=SETTING.n_cpu,
                            collate_fn=collater)
    vocab = Vocabulary(max_len=SETTING.max_len)
    vocab.load_vocab(args.vocab_path)

    imenc = ImageEncoder(SETTING.out_size, SETTING.cnn_type)
    capenc = CaptionEncoder(len(vocab), SETTING.emb_size, SETTING.out_size,
                            SETTING.rnn_type)
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    imenc = imenc.to(device)
    capenc = capenc.to(device)

    assert SETTING.checkpoint is not None
    print("loading model and optimizer checkpoint from {} ...".format(
        SETTING.checkpoint), flush=True)
    ckpt = torch.load(SETTING.checkpoint, map_location=device)
    imenc.load_state_dict(ckpt["encoder_state"])
    capenc.load_state_dict(ckpt["decoder_state"])

    begin = time.time()
    dset = EmbedDset(val_loader, imenc, capenc, vocab, args)
    print("database created | {} ".format(sec2str(time.time() - begin)),
          flush=True)

    retrieve_i2c(dset, val_dset, args.image_path, imenc, transform)
    retrieve_c2i(dset, val_dset, args.output_dir, args.caption, capenc, vocab)
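# A minimal sketch of the nearest-neighbour lookup that `retrieve_i2c` /
# `retrieve_c2i` above presumably perform over the embedded database,
# assuming the embeddings are L2-normalized so cosine similarity reduces to
# a dot product; the function name and top-k of 5 are illustrative assumptions.
import numpy as np


def retrieve_topk(query_emb, database_emb, k=5):
    """query_emb: (out_size,), database_emb: (n, out_size) -> top-k indices."""
    sims = database_emb @ query_emb    # (n,) cosine similarities
    return np.argsort(-sims)[:k]       # indices of the k best matches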
def main():
    args = parse_args()

    transform = transforms.Compose([
        transforms.Resize((args.imsize, args.imsize)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    if args.dataset == 'coco':
        val_dset = CocoDataset(root=args.root_path,
                               imgdir='val2017',
                               jsonfile='annotations/captions_val2017.json',
                               transform=transform,
                               mode='all')
    val_loader = DataLoader(val_dset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=args.n_cpu,
                            collate_fn=collater_eval)
    vocab = Vocabulary(max_len=args.max_len)
    vocab.load_vocab(args.vocab_path)

    imenc = ImageEncoder(args.out_size, args.cnn_type)
    capenc = CaptionEncoder(len(vocab), args.emb_size, args.out_size,
                            args.rnn_type)
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    imenc = imenc.to(device)
    capenc = capenc.to(device)

    assert args.checkpoint is not None
    print("loading model and optimizer checkpoint from {} ...".format(
        args.checkpoint), flush=True)
    ckpt = torch.load(args.checkpoint)
    imenc.load_state_dict(ckpt["encoder_state"])
    capenc.load_state_dict(ckpt["decoder_state"])

    begin = time.time()
    dset = EmbedDataset(val_loader, imenc, capenc, vocab, args)
    print("database created | {} ".format(sec2str(time.time() - begin)),
          flush=True)

    retrieve_i2c(dset, val_dset, imenc, vocab, args)
    retrieve_c2i(dset, val_dset, capenc, vocab, args)
def build_model(config, gpu_id, checkpoint=None):
    # Build encoder
    encoder = ImageEncoder(config.encoder_num_layers, True,
                           config.encoder_num_hidden, config.dropout,
                           config.image_channel_size)

    # Build decoder
    decoder_num_hidden = config.encoder_num_hidden
    decoder = RNNDecoder(True, config.target_embedding_size,
                         config.decoder_num_layers, decoder_num_hidden,
                         config.dropout, config.target_vocab_size,
                         attn_type='general', input_feed=config.input_feed)

    device = torch.device('cuda') if gpu_id >= 0 else torch.device('cpu')

    # Build generator
    generator = nn.Sequential(
        nn.Linear(decoder_num_hidden, config.target_vocab_size),
        nn.LogSoftmax(dim=-1))

    # Build UIModel
    model = UIModel(encoder, decoder, generator)

    # Load the model states from checkpoint or initialize them
    if checkpoint is not None:
        model.load_state_dict(checkpoint['model'])
    else:
        for p in model.parameters():
            p.data.uniform_(-config.param_init, config.param_init)

    model.to(device)
    return model
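# Sketch of a single greedy decoding step through the generator head built in
# `build_model` above: a Linear + LogSoftmax head maps a decoder hidden state
# to per-token log-probabilities. The dimensions below are illustrative
# assumptions, not values from any actual config.
import torch
import torch.nn as nn

decoder_num_hidden, target_vocab_size = 512, 10000  # assumed sizes
generator = nn.Sequential(
    nn.Linear(decoder_num_hidden, target_vocab_size),
    nn.LogSoftmax(dim=-1))

hidden = torch.randn(1, decoder_num_hidden)  # one decoder hidden state
log_probs = generator(hidden)                # (1, target_vocab_size)
next_token = log_probs.argmax(dim=-1)        # greedy choice of next token id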
def main():
    args = parse_args()

    # Prepare dataset (for vocab)
    dataset = ImageCaptionDataset('', args.caption_path)

    # Load image
    img = dataset.parse_image(args.image)

    # Load model
    encoder = ImageEncoder('cpu', pretrained=args.use_pretrained)
    decoder = CaptionDecoder('cpu', len(dataset.vocab),
                             embedding_dim=args.embedding_dim,
                             enc_hidden_dim=encoder.hidden_dim,
                             dec_hidden_dim=args.dec_hidden_dim,
                             word_to_int=dataset.word_to_int)
    encoder.load_state_dict(torch.load(args.encoder))
    decoder.load_state_dict(torch.load(args.decoder))

    # Generate caption
    gen_caption(encoder, decoder, img, dataset, args)
def __init__(self, vocab_size, word_embed_dim, hidden_size, resnet_out):
    super(Model, self).__init__()
    self.ques_encoder = QuestionEncoder(vocab_size, word_embed_dim,
                                        hidden_size)
    self.img_encoder = ImageEncoder(resnet_out + hidden_size, hidden_size)
    self.joint_embed = JointEmbedding(hidden_size, resnet_out, hidden_size)
class Similarity(nn.Module):

    def __init__(self, config: SimilarityConfig):
        super(Similarity, self).__init__()
        self.text_encoder = TextEncoder(config.product_text_encoder_config)
        self.text_encoder = self.text_encoder.to(GlobalConfig.device)
        self.image_encoder = ImageEncoder(config.product_image_encoder_config)
        self.image_encoder = self.image_encoder.to(GlobalConfig.device)
        self.linear = nn.Linear(config.mm_size, config.context_vector_size)
        self.linear = self.linear.to(GlobalConfig.device)

    def forward(self, context, text, text_length, image):
        """Forward.

        Args:
            context: Context (batch_size, ContextEncoderConfig.output_size).
            text: Product text (batch_size, product_text_max_len).
            text_length: Product text length (batch_size, ).
            image: Product image (batch_size, 3, image_size, image_size).

        Returns:
            Cosine similarity between the context and the fused
            text-image vector (batch_size, ).

        """
        batch_size = context.size(0)
        sos = SOS_ID * torch.ones(batch_size, dtype=torch.long).view(-1, 1).to(
            GlobalConfig.device)
        # (batch_size, 1)

        # Concat SOS.
        text = torch.cat((sos, text), 1).to(GlobalConfig.device)
        # (batch_size, product_text_max_len + 1)
        text_length += 1
        # (batch_size, )

        encoded_text, _ = self.text_encoder(text, text_length)
        # (batch_size, text_feat_size)
        encoded_image = self.image_encoder(image, encoded_text)
        # (batch_size, image_feat_size)

        mm = torch.cat((encoded_text, encoded_image), 1)
        mm = mm.to(GlobalConfig.device)
        mm = self.linear(mm)
        return cosine_similarity(context, mm)
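# The `cosine_similarity(context, mm)` call above is presumably row-wise
# cosine similarity between the context vectors and the fused multimodal
# vectors; a minimal equivalent with torch.nn.functional, assuming both
# inputs are (batch_size, context_vector_size):
import torch
import torch.nn.functional as F


def cosine_similarity(a, b, eps=1e-8):
    # returns one similarity score per row: (batch_size, )
    return F.cosine_similarity(a, b, dim=1, eps=eps)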
def build_models(self):
    # ################### encoders ################################### #
    image_encoder = ImageEncoder(output_channels=cfg.hidden_dim)
    if cfg.text_encoder_path != '':
        img_encoder_path = cfg.text_encoder_path.replace('text_encoder',
                                                         'image_encoder')
        print('Load image encoder from:', img_encoder_path)
        state_dict = torch.load(img_encoder_path, map_location='cpu')
        if 'model' in state_dict.keys():
            image_encoder.load_state_dict(state_dict['model'])
        else:
            image_encoder.load_state_dict(state_dict)
        for p in image_encoder.parameters():  # make image encoder grad on
            p.requires_grad = True
    # image_encoder.eval()

    epoch = 0

    # ################################################################# #
    text_encoder = TextEncoder(bert_config=self.bert_config)
    if cfg.text_encoder_path != '':
        epoch = cfg.text_encoder_path[istart:iend]  # istart/iend defined elsewhere
        epoch = int(epoch) + 1
        text_encoder_path = cfg.text_encoder_path
        print('Load text encoder from:', text_encoder_path)
        state_dict = torch.load(text_encoder_path, map_location='cpu')
        if 'model' in state_dict.keys():
            text_encoder.load_state_dict(state_dict['model'])
        else:
            text_encoder.load_state_dict(state_dict)
        for p in text_encoder.parameters():  # make text encoder grad on
            p.requires_grad = True

    # ################################################################# #
    if cfg.CUDA:
        text_encoder = text_encoder.cuda()
        image_encoder = image_encoder.cuda()

    return [text_encoder, image_encoder, epoch]
def main():
    args = parse_args()

    print('BATCH_SIZE: {}'.format(args.batch_size))
    print('EMBEDDING_DIM: {}'.format(args.embedding_dim))
    print('DEC_HIDDEN_DIM: {}'.format(args.dec_hidden_dim))
    print('LR: {}'.format(args.lr))
    print('ENCODER DROPOUT: {}'.format(args.enc_dropout))
    print('DECODER DROPOUT: {}'.format(args.dec_dropout))
    print('EPOCHS: {}'.format(args.epochs))
    print('LOG_INTERVAL: {}'.format(args.log_interval))
    print('USE PRETRAINED: {}'.format(args.use_pretrained))
    print('USE CURRICULUM LEARNING: {}'.format(args.use_curriculum_learning))

    # Prepare data & split
    dataset = ImageCaptionDataset(args.image_folder, args.caption_path)
    train_set, test_set = dataset.random_split(train_portion=0.8)
    train_dataloader = DataLoader(train_set,
                                  batch_size=args.batch_size,
                                  shuffle=True)
    test_dataloader = DataLoader(test_set, batch_size=args.batch_size)
    print('Training set size: {}'.format(len(train_set)))
    print('Test set size: {}'.format(len(test_set)))
    print('Vocab size: {}'.format(len(dataset.vocab)))
    print('----------------------------')

    # Create model & optimizer
    encoder = ImageEncoder(device, pretrained=args.use_pretrained).to(device)
    decoder = CaptionDecoder(device, len(dataset.vocab),
                             embedding_dim=args.embedding_dim,
                             enc_hidden_dim=encoder.hidden_dim,
                             dec_hidden_dim=args.dec_hidden_dim,
                             dropout=args.dec_dropout,
                             use_pretrained_emb=args.use_pretrained,
                             word_to_int=dataset.word_to_int).to(device)
    enc_optimizer = torch.optim.Adam(encoder.parameters(), lr=args.lr)
    dec_optimizer = torch.optim.Adam(decoder.parameters(), lr=args.lr)

    # Train
    train(encoder, decoder, enc_optimizer, dec_optimizer, train_dataloader,
          dataset, args)

    # Save model
    torch.save(encoder.cpu().state_dict(), args.output_encoder)
    torch.save(decoder.cpu().state_dict(), args.output_decoder)
    encoder.to(device)
    decoder.to(device)

    # Test
    test(encoder, decoder, test_dataloader, dataset, args)
def recommend_valid(
        context_text_encoder: TextEncoder,
        context_image_encoder: ImageEncoder,
        context_encoder: ContextEncoder,
        similarity: Similarity,
        valid_dataset: Dataset):
    """Recommend valid.

    Args:
        context_text_encoder (TextEncoder): Context text encoder.
        context_image_encoder (ImageEncoder): Context image encoder.
        context_encoder (ContextEncoder): Context encoder.
        similarity (Similarity): Similarity.
        valid_dataset (Dataset): Valid dataset.

    """
    # Valid dataset loader.
    valid_data_loader = DataLoader(
        valid_dataset,
        batch_size=RecommendValidConfig.batch_size,
        shuffle=True,
        num_workers=RecommendValidConfig.num_data_loader_workers
    )

    sum_loss = 0
    num_batches = 0

    # Switch to eval mode.
    context_text_encoder.eval()
    context_image_encoder.eval()
    context_encoder.eval()
    # similarity.eval()
    # There might be a bug in the implementation of resnet.

    num_ranks = torch.zeros(DatasetConfig.neg_images_max_num + 1,
                            dtype=torch.long)
    num_ranks = num_ranks.to(GlobalConfig.device)
    total_samples = 0

    with torch.no_grad():
        for batch_id, valid_data in enumerate(valid_data_loader):
            # Only valid `ValidConfig.num_batches` batches.
            if batch_id >= RecommendValidConfig.num_batches:
                break
            num_batches += 1

            context_dialog, pos_products, neg_products = valid_data
            texts, text_lengths, images, utter_types = context_dialog
            # Sizes:
            # texts: (batch_size, dialog_context_size + 1, dialog_text_max_len)
            # text_lengths: (batch_size, dialog_context_size + 1)
            # images: (batch_size, dialog_context_size + 1,
            #          pos_images_max_num, 3, image_size, image_size)
            # utter_types: (batch_size, )

            batch_size = texts.size(0)

            # To device.
            texts = texts.to(GlobalConfig.device)
            text_lengths = text_lengths.to(GlobalConfig.device)
            images = images.to(GlobalConfig.device)
            # utter_types = utter_types.to(GlobalConfig.device)

            texts.transpose_(0, 1)
            # (dialog_context_size + 1, batch_size, dialog_text_max_len)
            text_lengths.transpose_(0, 1)
            # (dialog_context_size + 1, batch_size)
            images.transpose_(0, 1)
            images.transpose_(1, 2)
            # (dialog_context_size + 1, pos_images_max_num, batch_size, 3,
            #  image_size, image_size)

            # Encode context.
            context, _ = encode_context(
                context_text_encoder,
                context_image_encoder,
                context_encoder,
                texts, text_lengths, images
            )
            # (batch_size, context_vector_size)

            loss = recommend_loss(similarity, batch_size, context,
                                  pos_products, neg_products)
            sum_loss += loss

            num_rank = recommend_eval(
                similarity, batch_size, context,
                pos_products, neg_products
            )

            total_samples += batch_size
            num_ranks += num_rank

    for i in range(DatasetConfig.neg_images_max_num):
        print('total recall@{} = {}'.format(
            i + 1,
            torch.sum(num_ranks[:i + 1]).item() / total_samples))

    # Switch to train mode.
    context_text_encoder.train()
    context_image_encoder.train()
    context_encoder.train()
    similarity.train()

    return sum_loss / num_batches
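# Sketch of the recall@k bookkeeping used above: assuming `num_ranks[r]`
# counts how many positives were ranked at (0-based) position r among the
# candidates, recall@k is the fraction of positives ranked in the top k.
# The toy numbers below are illustrative only.
import torch

num_ranks = torch.tensor([50, 20, 10, 5, 15])  # rank histogram, 100 samples
total_samples = int(num_ranks.sum())
for k in range(1, len(num_ranks)):
    recall_at_k = torch.sum(num_ranks[:k]).item() / total_samples
    print('recall@{} = {}'.format(k, recall_at_k))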
def knowledge_test(
        context_text_encoder: TextEncoder,
        context_image_encoder: ImageEncoder,
        context_encoder: ContextEncoder,
        to_hidden: ToHidden,
        attribute_kv_memory: KVMemory,
        text_decoder: TextDecoder,
        test_dataset: Dataset,
        text_length: int,
        vocab: Dict[str, int]):
    """Knowledge attribute test.

    Args:
        context_text_encoder (TextEncoder): Context text encoder.
        context_image_encoder (ImageEncoder): Context image encoder.
        context_encoder (ContextEncoder): Context encoder.
        to_hidden (ToHidden): Context to hidden.
        attribute_kv_memory (KVMemory): Attribute Key-Value Memory.
        text_decoder (TextDecoder): Text decoder.
        test_dataset (Dataset): Test dataset.
        text_length (int): Text length.
        vocab (Dict[str, int]): Vocabulary.

    """
    id2word: List[str] = [None] * len(vocab)
    for word, wid in vocab.items():
        id2word[wid] = word

    # Test dataset loader.
    test_data_loader = DataLoader(
        test_dataset,
        batch_size=KnowledgeAttributeTestConfig.batch_size,
        num_workers=KnowledgeAttributeTestConfig.num_data_loader_workers
    )

    sum_loss = 0
    num_batches = 0

    # Switch to eval mode.
    context_text_encoder.eval()
    context_image_encoder.eval()
    context_encoder.eval()
    to_hidden.eval()
    attribute_kv_memory.eval()
    text_decoder.eval()

    output_file = open('knowledge_attribute.out', 'w')

    with torch.no_grad():
        for batch_id, test_data in enumerate(test_data_loader):
            num_batches += 1

            test_data, products = test_data
            keys, values, pair_length = products
            keys = keys.to(GlobalConfig.device)
            values = values.to(GlobalConfig.device)
            pair_length = pair_length.to(GlobalConfig.device)

            texts, text_lengths, images, utter_types = test_data
            # Sizes:
            # texts: (batch_size, dialog_context_size + 1, dialog_text_max_len)
            # text_lengths: (batch_size, dialog_context_size + 1)
            # images: (batch_size, dialog_context_size + 1,
            #          pos_images_max_num, 3, image_size, image_size)
            # utter_types: (batch_size, )

            # To device.
            texts = texts.to(GlobalConfig.device)
            text_lengths = text_lengths.to(GlobalConfig.device)
            images = images.to(GlobalConfig.device)

            texts.transpose_(0, 1)
            # (dialog_context_size + 1, batch_size, dialog_text_max_len)
            text_lengths.transpose_(0, 1)
            # (dialog_context_size + 1, batch_size)
            images.transpose_(0, 1)
            images.transpose_(1, 2)
            # (dialog_context_size + 1, pos_images_max_num, batch_size, 3,
            #  image_size, image_size)

            # Encode context.
            context, hiddens = encode_context(
                context_text_encoder,
                context_image_encoder,
                context_encoder,
                texts, text_lengths, images
            )
            # (batch_size, context_vector_size)

            encode_knowledge_func = partial(attribute_kv_memory,
                                            keys, values, pair_length)

            text_eval(to_hidden, text_decoder, text_length, id2word,
                      context, texts[-1], hiddens, encode_knowledge_func,
                      output_file=output_file)

    output_file.close()
def knowledge_celebrity_test(context_text_encoder: TextEncoder,
                             context_image_encoder: ImageEncoder,
                             context_encoder: ContextEncoder,
                             to_hidden: ToHidden,
                             celebrity_memory: Memory,
                             text_decoder: TextDecoder,
                             test_dataset: Dataset,
                             celebrity_scores,
                             text_length: int,
                             vocab: Dict[str, int]):
    """Knowledge celebrity test.

    Args:
        context_text_encoder (TextEncoder): Context text encoder.
        context_image_encoder (ImageEncoder): Context image encoder.
        context_encoder (ContextEncoder): Context encoder.
        to_hidden (ToHidden): Context to hidden.
        celebrity_memory (Memory): Celebrity Memory.
        text_decoder (TextDecoder): Text decoder.
        test_dataset (Dataset): Test dataset.
        celebrity_scores: Celebrity scores.
        text_length (int): Text length.
        vocab (Dict[str, int]): Vocabulary.

    """
    id2word: List[str] = [None] * len(vocab)
    for word, wid in vocab.items():
        id2word[wid] = word

    # Test dataset loader.
    test_data_loader = DataLoader(
        test_dataset,
        batch_size=KnowledgeCelebrityTestConfig.batch_size,
        num_workers=KnowledgeCelebrityTestConfig.num_data_loader_workers)

    sum_loss = 0

    # Switch to eval mode.
    context_text_encoder.eval()
    context_image_encoder.eval()
    context_encoder.eval()
    to_hidden.eval()
    celebrity_memory.eval()
    text_decoder.eval()

    output_file = open('knowledge_celebrity.out', 'w')

    with torch.no_grad():
        for batch_id, test_data in enumerate(test_data_loader):
            texts, text_lengths, images, utter_types = test_data
            # Sizes:
            # texts: (batch_size, dialog_context_size + 1, dialog_text_max_len)
            # text_lengths: (batch_size, dialog_context_size + 1)
            # images: (batch_size, dialog_context_size + 1,
            #          pos_images_max_num, 3, image_size, image_size)
            # utter_types: (batch_size, )

            # To device.
            texts = texts.to(GlobalConfig.device)
            text_lengths = text_lengths.to(GlobalConfig.device)
            images = images.to(GlobalConfig.device)

            texts.transpose_(0, 1)
            # (dialog_context_size + 1, batch_size, dialog_text_max_len)
            text_lengths.transpose_(0, 1)
            # (dialog_context_size + 1, batch_size)
            images.transpose_(0, 1)
            images.transpose_(1, 2)
            # (dialog_context_size + 1, pos_images_max_num, batch_size, 3,
            #  image_size, image_size)

            # Encode context.
            context, hiddens = encode_context(context_text_encoder,
                                              context_image_encoder,
                                              context_encoder,
                                              texts, text_lengths, images)
            # (batch_size, context_vector_size)

            knowledge_entry = celebrity_scores
            encode_knowledge_func = partial(celebrity_memory, knowledge_entry)

            text_eval(to_hidden, text_decoder, text_length, id2word,
                      context, texts[-1], hiddens, encode_knowledge_func,
                      output_file=output_file)

    output_file.close()
def intention_valid(context_text_encoder: TextEncoder,
                    context_image_encoder: ImageEncoder,
                    context_encoder: ContextEncoder,
                    intention: Intention,
                    valid_dataset: Dataset):
    """Intention valid.

    Args:
        context_text_encoder (TextEncoder): Context text encoder.
        context_image_encoder (ImageEncoder): Context image encoder.
        context_encoder (ContextEncoder): Context encoder.
        intention (Intention): Intention.
        valid_dataset (Dataset): Valid dataset.

    """
    # Valid dataset loader.
    valid_data_loader = DataLoader(
        valid_dataset,
        batch_size=IntentionValidConfig.batch_size,
        shuffle=True,
        num_workers=IntentionValidConfig.num_data_loader_workers)

    sum_loss = 0
    sum_accuracy = 0
    num_batches = 0

    # Switch to eval mode.
    context_text_encoder.eval()
    context_image_encoder.eval()
    context_encoder.eval()
    intention.eval()

    with torch.no_grad():
        for batch_id, valid_data in enumerate(valid_data_loader):
            # Only valid `ValidConfig.num_batches` batches.
            if batch_id >= IntentionValidConfig.num_batches:
                break
            num_batches += 1

            texts, text_lengths, images, utter_types = valid_data
            # Sizes:
            # texts: (batch_size, dialog_context_size + 1, dialog_text_max_len)
            # text_lengths: (batch_size, dialog_context_size + 1)
            # images: (batch_size, dialog_context_size + 1,
            #          pos_images_max_num, 3, image_size, image_size)
            # utter_types: (batch_size, )

            # To device.
            texts = texts.to(GlobalConfig.device)
            text_lengths = text_lengths.to(GlobalConfig.device)
            images = images.to(GlobalConfig.device)
            utter_types = utter_types.to(GlobalConfig.device)

            texts.transpose_(0, 1)
            # (dialog_context_size + 1, batch_size, dialog_text_max_len)
            text_lengths.transpose_(0, 1)
            # (dialog_context_size + 1, batch_size)
            images.transpose_(0, 1)
            images.transpose_(1, 2)
            # (dialog_context_size + 1, pos_images_max_num, batch_size, 3,
            #  image_size, image_size)

            # Encode context.
            context, _ = encode_context(context_text_encoder,
                                        context_image_encoder,
                                        context_encoder,
                                        texts, text_lengths, images)
            # (batch_size, context_vector_size)

            intent_prob = intention(context)
            # (batch_size, utterance_type_size)

            loss = nll_loss(intent_prob, utter_types)
            sum_loss += loss

            eqs = torch.eq(torch.argmax(intent_prob, dim=1), utter_types)
            accuracy = torch.sum(eqs).item() * 1.0 / eqs.size(0)
            sum_accuracy += accuracy

    # Switch to train mode.
    context_text_encoder.train()
    context_image_encoder.train()
    context_encoder.train()
    intention.train()

    return sum_loss / num_batches, sum_accuracy / num_batches
def train(task: int, model_file_name: str):
    """Train model.

    Args:
        task (int): Task.
        model_file_name (str): Model file name (saved or to be saved).

    """
    # Check if data exists.
    if not isfile(DatasetConfig.common_raw_data_file):
        raise ValueError('No common raw data.')

    # Load extracted common data.
    common_data: CommonData = load_pkl(DatasetConfig.common_raw_data_file)

    # Dialog data files.
    train_dialog_data_file = DatasetConfig.get_dialog_filename(
        task, TRAIN_MODE)
    valid_dialog_data_file = DatasetConfig.get_dialog_filename(
        task, VALID_MODE)
    test_dialog_data_file = DatasetConfig.get_dialog_filename(task, TEST_MODE)
    if not isfile(train_dialog_data_file):
        raise ValueError('No train dialog data file.')
    if not isfile(valid_dialog_data_file):
        raise ValueError('No valid dialog data file.')
    if not isfile(test_dialog_data_file):
        raise ValueError('No test dialog data file.')

    # Load extracted dialogs.
    train_dialogs: List[TidyDialog] = load_pkl(train_dialog_data_file)
    valid_dialogs: List[TidyDialog] = load_pkl(valid_dialog_data_file)
    test_dialogs: List[TidyDialog] = load_pkl(test_dialog_data_file)

    if task in {KNOWLEDGE_TASK}:
        knowledge_data = KnowledgeData()

    # Dataset wrap.
    train_dataset = Dataset(
        task,
        common_data.dialog_vocab,
        None,  # common_data.obj_id,
        train_dialogs,
        knowledge_data if task == KNOWLEDGE_TASK else None)
    valid_dataset = Dataset(
        task,
        common_data.dialog_vocab,
        None,  # common_data.obj_id,
        valid_dialogs,
        knowledge_data if task == KNOWLEDGE_TASK else None)
    test_dataset = Dataset(
        task,
        common_data.dialog_vocab,
        None,  # common_data.obj_id,
        test_dialogs,
        knowledge_data if task == KNOWLEDGE_TASK else None)

    print('Train dataset size:', len(train_dataset))
    print('Valid dataset size:', len(valid_dataset))
    print('Test dataset size:', len(test_dataset))

    # Get initial embedding.
    vocab_size = len(common_data.dialog_vocab)
    embed_init = get_embed_init(common_data.glove,
                                vocab_size).to(GlobalConfig.device)

    # Context model configurations.
    context_text_encoder_config = ContextTextEncoderConfig(
        vocab_size, embed_init)
    context_image_encoder_config = ContextImageEncoderConfig()
    context_encoder_config = ContextEncoderConfig()

    # Context models.
    context_text_encoder = TextEncoder(context_text_encoder_config)
    context_text_encoder = context_text_encoder.to(GlobalConfig.device)
    context_image_encoder = ImageEncoder(context_image_encoder_config)
    context_image_encoder = context_image_encoder.to(GlobalConfig.device)
    context_encoder = ContextEncoder(context_encoder_config)
    context_encoder = context_encoder.to(GlobalConfig.device)

    # Load model file.
    model_file = join(DatasetConfig.dump_dir, model_file_name)
    if isfile(model_file):
        state = torch.load(model_file)
        # if task != state['task']:
        #     raise ValueError("Task doesn't match.")
        context_text_encoder.load_state_dict(state['context_text_encoder'])
        context_image_encoder.load_state_dict(state['context_image_encoder'])
        context_encoder.load_state_dict(state['context_encoder'])

    # Task-specific parts.
    if task == INTENTION_TASK:
        intention_train(context_text_encoder, context_image_encoder,
                        context_encoder, train_dataset, valid_dataset,
                        test_dataset, model_file)
    elif task == TEXT_TASK:
        text_train(context_text_encoder, context_image_encoder,
                   context_encoder, train_dataset, valid_dataset,
                   test_dataset, model_file, common_data.dialog_vocab,
                   embed_init)
    elif task == RECOMMEND_TASK:
        recommend_train(context_text_encoder, context_image_encoder,
                        context_encoder, train_dataset, valid_dataset,
                        test_dataset, model_file, vocab_size, embed_init)
    elif task == KNOWLEDGE_TASK:
        knowledge_attribute_train(context_text_encoder, context_image_encoder,
                                  context_encoder, train_dataset,
                                  valid_dataset, test_dataset, model_file,
                                  knowledge_data.attribute_data,
                                  common_data.dialog_vocab, embed_init)
def main():
    args = parse_args()

    transform = transforms.Compose([
        transforms.Resize((args.imsize, args.imsize)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    if args.dataset == 'coco':
        train_dset = CocoDataset(root=args.root_path,
                                 transform=transform,
                                 mode='one')
        val_dset = CocoDataset(root=args.root_path,
                               imgdir='val2017',
                               jsonfile='annotations/captions_val2017.json',
                               transform=transform,
                               mode='all')
    train_loader = DataLoader(train_dset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.n_cpu,
                              collate_fn=collater_train)
    val_loader = DataLoader(val_dset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=args.n_cpu,
                            collate_fn=collater_eval)
    vocab = Vocabulary(max_len=args.max_len)
    vocab.load_vocab(args.vocab_path)

    imenc = ImageEncoder(args.out_size, args.cnn_type)
    capenc = CaptionEncoder(len(vocab), args.emb_size, args.out_size,
                            args.rnn_type)
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    imenc = imenc.to(device)
    capenc = capenc.to(device)

    optimizer = optim.SGD([{
        'params': imenc.parameters(),
        'lr': args.lr_cnn,
        'momentum': args.mom_cnn
    }, {
        'params': capenc.parameters(),
        'lr': args.lr_rnn,
        'momentum': args.mom_rnn
    }])
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     mode='max',
                                                     factor=0.1,
                                                     patience=args.patience,
                                                     verbose=True)
    lossfunc = PairwiseRankingLoss(margin=args.margin,
                                   method=args.method,
                                   improved=args.improved,
                                   intra=args.intra)

    if args.checkpoint is not None:
        print("loading model and optimizer checkpoint from {} ...".format(
            args.checkpoint), flush=True)
        ckpt = torch.load(args.checkpoint)
        imenc.load_state_dict(ckpt["encoder_state"])
        capenc.load_state_dict(ckpt["decoder_state"])
        optimizer.load_state_dict(ckpt["optimizer_state"])
        scheduler.load_state_dict(ckpt["scheduler_state"])
        offset = ckpt["epoch"]
    else:
        offset = 0
    imenc = nn.DataParallel(imenc)
    capenc = nn.DataParallel(capenc)

    metrics = {}

    assert offset < args.max_epochs
    for ep in range(offset, args.max_epochs):
        imenc, capenc, optimizer = train(ep + 1, train_loader, imenc, capenc,
                                         optimizer, lossfunc, vocab, args)
        data = validate(ep + 1, val_loader, imenc, capenc, vocab, args)
        totalscore = 0
        for rank in [1, 5, 10, 20]:
            totalscore += (data["i2c_recall@{}".format(rank)] +
                           data["c2i_recall@{}".format(rank)])
        scheduler.step(totalscore)

        # save checkpoint
        ckpt = {
            "stats": data,
            "epoch": ep + 1,
            "encoder_state": imenc.module.state_dict(),
            "decoder_state": capenc.module.state_dict(),
            "optimizer_state": optimizer.state_dict(),
            "scheduler_state": scheduler.state_dict()
        }
        if not os.path.exists(args.model_save_path):
            os.makedirs(args.model_save_path)
        savepath = os.path.join(
            args.model_save_path,
            "epoch_{:04d}_score_{:05d}.ckpt".format(ep + 1,
                                                    int(100 * totalscore)))
        print("saving model and optimizer checkpoint to {} ...".format(
            savepath), flush=True)
        torch.save(ckpt, savepath)
        print("done for epoch {}".format(ep + 1), flush=True)

        for k, v in data.items():
            if k not in metrics.keys():
                metrics[k] = [v]
            else:
                metrics[k].append(v)

    visualize(metrics, args)
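# A minimal sketch of the margin-based PairwiseRankingLoss used above, in the
# spirit of VSE-style image-caption retrieval losses; the project's
# `method`/`improved`/`intra` options are not reproduced here, only the basic
# hinge form over in-batch negatives, assuming L2-normalized embeddings.
import torch
import torch.nn as nn


class SimplePairwiseRankingLoss(nn.Module):
    def __init__(self, margin=0.2):
        super().__init__()
        self.margin = margin

    def forward(self, im, cap):
        # im, cap: (batch, out_size); diagonal pairs are the positives
        scores = im @ cap.t()                  # (batch, batch) similarities
        diag = scores.diag().view(-1, 1)
        # hinge over negatives for caption retrieval and image retrieval
        cost_c = (self.margin + scores - diag).clamp(min=0)
        cost_i = (self.margin + scores - diag.t()).clamp(min=0)
        mask = torch.eye(scores.size(0), dtype=torch.bool,
                         device=scores.device)
        cost_c = cost_c.masked_fill(mask, 0)   # drop the positive pairs
        cost_i = cost_i.masked_fill(mask, 0)
        return cost_c.sum() + cost_i.sum()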
def main():
    args = parse_args()

    transform = transforms.Compose([
        transforms.Resize((args.imsize, args.imsize)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    if args.dataset == "coco":
        val_dset = CocoDataset(
            root=args.root_path,
            split="val",
            transform=transform,
        )
    val_loader = DataLoader(
        val_dset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.n_cpu,
        collate_fn=collater,
    )
    vocab = Vocabulary(max_len=args.max_len)
    vocab.load_vocab(args.vocab_path)

    imenc = ImageEncoder(args.out_size, args.cnn_type)
    capenc = CaptionEncoder(len(vocab), args.emb_size, args.out_size,
                            args.rnn_type)
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    imenc = imenc.to(device)
    capenc = capenc.to(device)

    assert args.checkpoint is not None
    print("loading model and optimizer checkpoint from {} ...".format(
        args.checkpoint), flush=True)
    ckpt = torch.load(args.checkpoint, map_location=device)
    imenc.load_state_dict(ckpt["encoder_state"])
    capenc.load_state_dict(ckpt["decoder_state"])

    begin = time.time()
    dset = EmbedDataset(val_loader, imenc, capenc, vocab, args)
    print("database created | {} ".format(sec2str(time.time() - begin)),
          flush=True)

    savedir = os.path.join("out", args.config_name)
    if not os.path.exists(savedir):
        os.makedirs(savedir, 0o777)

    image = dset.embedded["image"]
    caption = dset.embedded["caption"]
    n_i = image.shape[0]
    n_c = caption.shape[0]
    all_emb = np.concatenate([image, caption], axis=0)  # avoid shadowing built-in `all`
    emb_file = os.path.join(savedir, "embedding_{}.npy".format(n_i))
    save_file = os.path.join(savedir, "{}.npy".format(args.method))
    vis_file = os.path.join(savedir, "{}.png".format(args.method))
    np.save(emb_file, all_emb)
    print("saved embeddings to {}".format(emb_file), flush=True)
    dimension_reduction(emb_file, save_file, method=args.method)
    plot_embeddings(save_file, n_i, vis_file, method=args.method)
def intention_train(context_text_encoder: TextEncoder,
                    context_image_encoder: ImageEncoder,
                    context_encoder: ContextEncoder,
                    train_dataset: Dataset,
                    valid_dataset: Dataset,
                    test_dataset: Dataset,
                    model_file: str):
    """Intention train.

    Args:
        context_text_encoder (TextEncoder): Context text encoder.
        context_image_encoder (ImageEncoder): Context image encoder.
        context_encoder (ContextEncoder): Context encoder.
        train_dataset (Dataset): Train dataset.
        valid_dataset (Dataset): Valid dataset.
        test_dataset (Dataset): Test dataset.
        model_file (str): Saved model file.

    """
    # Data loader.
    train_data_loader = DataLoader(
        dataset=train_dataset,
        batch_size=IntentionTrainConfig.batch_size,
        shuffle=True,
        num_workers=IntentionTrainConfig.num_data_loader_workers)

    # Model.
    intention_config = IntentionConfig()
    intention = Intention(intention_config).to(GlobalConfig.device)

    # Model parameters.
    params = list(
        chain.from_iterable([
            list(model.parameters()) for model in [
                context_text_encoder, context_image_encoder, context_encoder,
                intention
            ]
        ]))
    optimizer = Adam(params, lr=IntentionTrainConfig.learning_rate)
    epoch_id = 0
    min_valid_loss = None

    # Load saved state.
    if isfile(model_file):
        state = torch.load(model_file)
        intention.load_state_dict(state['intention'])
        optimizer.load_state_dict(state['optimizer'])
        epoch_id = state['epoch_id']
        min_valid_loss = state['min_valid_loss']

    # Loss.
    sum_loss = 0
    bad_loss_cnt = 0

    # Switch to train mode.
    context_text_encoder.train()
    context_image_encoder.train()
    context_encoder.train()
    intention.train()

    finished = False

    for epoch_id in range(epoch_id, IntentionTrainConfig.num_iterations):
        for batch_id, train_data in enumerate(train_data_loader):
            # Set gradients to 0.
            optimizer.zero_grad()

            texts, text_lengths, images, utter_types = train_data
            # Sizes:
            # texts: (batch_size, dialog_context_size + 1, dialog_text_max_len)
            # text_lengths: (batch_size, dialog_context_size + 1)
            # images: (batch_size, dialog_context_size + 1,
            #          pos_images_max_num, 3, image_size, image_size)
            # utter_types: (batch_size, )

            # To device.
            texts = texts.to(GlobalConfig.device)
            text_lengths = text_lengths.to(GlobalConfig.device)
            images = images.to(GlobalConfig.device)
            utter_types = utter_types.to(GlobalConfig.device)

            texts.transpose_(0, 1)
            # (dialog_context_size + 1, batch_size, dialog_text_max_len)
            text_lengths.transpose_(0, 1)
            # (dialog_context_size + 1, batch_size)
            images.transpose_(0, 1)
            images.transpose_(1, 2)
            # (dialog_context_size + 1, pos_images_max_num, batch_size, 3,
            #  image_size, image_size)

            # Encode context.
            context, _ = encode_context(context_text_encoder,
                                        context_image_encoder,
                                        context_encoder,
                                        texts, text_lengths, images)
            # (batch_size, context_vector_size)

            intent_prob = intention(context)
            # (batch_size, utterance_type_size)

            loss = nll_loss(intent_prob, utter_types)
            sum_loss += loss

            loss.backward()
            optimizer.step()

            # Print loss every `TrainConfig.print_freq` batches.
            if (batch_id + 1) % IntentionTrainConfig.print_freq == 0:
                cur_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                sum_loss /= IntentionTrainConfig.print_freq
                print('epoch: {} \tbatch: {} \tloss: {} \ttime: {}'.format(
                    epoch_id + 1, batch_id + 1, sum_loss, cur_time))
                sum_loss = 0

            # Valid every `TrainConfig.valid_freq` batches.
            if (batch_id + 1) % IntentionTrainConfig.valid_freq == 0:
                valid_loss, accuracy = intention_valid(context_text_encoder,
                                                       context_image_encoder,
                                                       context_encoder,
                                                       intention,
                                                       valid_dataset)
                cur_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                print('valid_loss: {} \taccuracy: {} \ttime: {}'.format(
                    valid_loss, accuracy, cur_time))

                # Save current best model.
                if min_valid_loss is None or valid_loss < min_valid_loss:
                    min_valid_loss = valid_loss
                    bad_loss_cnt = 0

                    save_dict = {
                        'task': INTENTION_TASK,
                        'epoch_id': epoch_id,
                        'min_valid_loss': min_valid_loss,
                        'optimizer': optimizer.state_dict(),
                        'context_text_encoder':
                            context_text_encoder.state_dict(),
                        'context_image_encoder':
                            context_image_encoder.state_dict(),
                        'context_encoder': context_encoder.state_dict(),
                        'intention': intention.state_dict()
                    }
                    torch.save(save_dict, model_file)
                    print('Best model saved.')
                else:
                    bad_loss_cnt += 1
                    if bad_loss_cnt > IntentionTrainConfig.patience:
                        intention_test(context_text_encoder,
                                       context_image_encoder,
                                       context_encoder,
                                       intention,
                                       test_dataset)
                        finished = True
                        break
        if finished:
            break
test_input = chainer.as_variable(xp.array(test_input_box).astype(xp.float32))
test_frame = chainer.as_variable(xp.array(test_box).astype(xp.float32))

for i, t in enumerate(test_box):
    tmp = np.clip(t * 127.5 + 127.5, 0, 255).transpose(1, 2, 0).astype(xp.uint8)
    pylab.subplot(4, 4, i + 1)
    pylab.imshow(tmp)
    pylab.axis('off')
    pylab.savefig('%s/true.png' % (outdir))

test_path = "./test.png"
test = prepare_image(test_path)
test = chainer.as_variable(xp.array(test).astype(xp.float32))
test = F.tile(test, (framesize, 1, 1, 1))

image_encoder = ImageEncoder()
image_encoder.to_gpu()
enc_opt = set_optimizer(image_encoder)

key_point_detector = KeyPointDetector()
key_point_detector.to_gpu()
key_opt = set_optimizer(key_point_detector)

making_optical_flow = UNet(in_ch=4)
making_optical_flow.to_gpu()
ref_opt = set_optimizer(making_optical_flow)

generator = Generator(in_ch=3)
generator.to_gpu()
gen_opt = set_optimizer(generator)
)
val_loader = torch.utils.data.DataLoader(
    val_data,
    batch_sampler=batch_sampler_val,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)

""" 7) Create Model """

VOCAB_SIZE = len(vocab.word2index)
IMAGE_EMB_DIM = 512
WORD_EMB_DIM = 512
HIDDEN_DIM = 1024

word_embedding = torch.nn.Embedding(
    num_embeddings=VOCAB_SIZE,
    embedding_dim=WORD_EMB_DIM,
)
image_encoder = ImageEncoder(out_dim=IMAGE_EMB_DIM)
image_decoder = CaptionRNN(
    num_classes=VOCAB_SIZE,
    word_emb_dim=WORD_EMB_DIM,
    img_emb_dim=IMAGE_EMB_DIM,
    hidden_dim=HIDDEN_DIM
)

""" 8) Create Optimizer and Loss Function """

LR = 0.001
WEIGHT_DECAY = 0.

loss_fn = torch.nn.NLLLoss()
parameters = list(image_decoder.parameters()) + list(word_embedding.parameters())
optim = torch.optim.Adam(
    params=parameters,
    lr=LR,
use_bert = True

if not os.path.exists(checkpoints_dir) and local_rank == 0:
    os.makedirs(checkpoints_dir)

kdd_dataset = Dataset(use_bert=use_bert)
sampler = DistributedSampler(kdd_dataset)
loader = DataLoader(kdd_dataset,
                    collate_fn=collate_fn,
                    batch_size=150,
                    sampler=sampler,
                    num_workers=20)

nhead = 4
score_model = ScoreModel(kdd_dataset.unknown_token + 1, 1024, 1024,
                         use_bert=use_bert).cuda()
image_encoder = ImageEncoder(input_dim=2048, output_dim=1024, nhead=nhead)
image_encoder.load_pretrained_weights(
    path='../user_data/image_encoder_large.pth')
image_encoder = image_encoder.cuda()
# text_generator = TextGenerator(score_model.embed.num_embeddings).cuda()
# score_model = ScoreModel(30522, 256, num_heads=1).cuda()
# category_embedding = CategoryEmbedding(256).cuda()
optimizer = Adam(score_model.get_params() + image_encoder.get_params())

if start_epoch > 0 and local_rank == 0:
    checkpoints = torch.load(
        os.path.join(checkpoints_dir,
                     'model-epoch{}.pth'.format(start_epoch)))
    score_model.load_state_dict(checkpoints['score'])
    image_encoder.load_state_dict(checkpoints['item'])
def valid(epoch=1,
          checkpoints_dir='./checkpoints',
          use_bert=False,
          data_path=None,
          out_path='../prediction_result/valid_pred.json',
          output_ndcg=True):
    print("valid epoch{}".format(epoch))
    kdd_dataset = ValidDataset(data_path, use_bert=use_bert)
    loader = DataLoader(kdd_dataset,
                        collate_fn=collate_fn_valid,
                        batch_size=128,
                        shuffle=False,
                        num_workers=8)
    tbar = tqdm(loader)

    text_encoder = TextEncoder(kdd_dataset.unknown_token + 1, 1024, 256,
                               use_bert=use_bert).cuda()
    image_encoder = ImageEncoder(input_dim=2048, output_dim=1024,
                                 nhead=4).cuda()
    score_model = ScoreModel(1024, 256).cuda()
    # category_embedding = model.CategoryEmbedding(768).cuda()

    checkpoints = torch.load(
        os.path.join(checkpoints_dir, 'model-epoch{}.pth'.format(epoch)))
    text_encoder.load_state_dict(checkpoints['query'])
    image_encoder.load_state_dict(checkpoints['item'])
    score_model.load_state_dict(checkpoints['score'])

    outputs = {}
    image_encoder.eval()
    text_encoder.eval()
    score_model.eval()
    for (query_id, product_id, query, query_len, features, boxes, category,
         obj_len) in tbar:
        query, query_len = query.cuda(), query_len.cuda()
        query, hidden = text_encoder(query, query_len)
        features, boxes, obj_len = (features.cuda(), boxes.cuda(),
                                    obj_len.cuda())
        features = image_encoder(features, boxes, obj_len)
        score = score_model(query, hidden, query_len, features)
        score = score.data.cpu().numpy()
        for q_id, p_id, s in zip(query_id.data.numpy(),
                                 product_id.data.numpy(), score):
            outputs.setdefault(str(q_id), [])
            outputs[str(q_id)].append((p_id, s))

    for k, v in outputs.items():
        v = sorted(v, key=lambda x: x[1], reverse=True)
        v = [(str(x[0]), float(x[1])) for x in v]
        outputs[k] = v
    with open(out_path, 'w') as f:
        json.dump(outputs, f)

    if output_ndcg:
        pred = read_json(out_path)
        gt = read_json('../data/valid/valid_answer.json')
        score = 0
        k = 5
        for key, val in gt.items():
            ground_truth_ids = [str(x) for x in val]
            predictions = [x[0] for x in pred[key][:k]]
            ref_vec = [1.0] * len(ground_truth_ids)
            pred_vec = [
                1.0 if pid in ground_truth_ids else 0.0 for pid in predictions
            ]
            score += get_ndcg(pred_vec, ref_vec, k)
            # score += len(set(predictions).intersection(ground_truth_ids)) \
            #     / len(ground_truth_ids)
        score = score / len(gt)
        print('ndcg@%d: %.4f' % (k, score))
        return score
    else:
        return None
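# A minimal sketch of the `get_ndcg` helper used above, assuming binary
# relevance vectors: `pred_vec` marks which of the top-k predictions are hits
# and `ref_vec` is the ideal all-relevant vector. This follows the standard
# DCG/IDCG definition with a log2 position discount; it is an assumption
# about the helper, not its verbatim implementation.
import math


def get_ndcg(pred_vec, ref_vec, k):
    dcg = sum(rel / math.log2(i + 2) for i, rel in enumerate(pred_vec[:k]))
    idcg = sum(rel / math.log2(i + 2) for i, rel in enumerate(ref_vec[:k]))
    return dcg / idcg if idcg > 0 else 0.0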
def recommend_train(context_text_encoder: TextEncoder,
                    context_image_encoder: ImageEncoder,
                    context_encoder: ContextEncoder,
                    train_dataset: Dataset,
                    valid_dataset: Dataset,
                    test_dataset: Dataset,
                    model_file: str,
                    vocab_size: int,
                    embed_init=None):
    """Recommend train.

    Args:
        context_text_encoder (TextEncoder): Context text encoder.
        context_image_encoder (ImageEncoder): Context image encoder.
        context_encoder (ContextEncoder): Context encoder.
        train_dataset (Dataset): Train dataset.
        valid_dataset (Dataset): Valid dataset.
        test_dataset (Dataset): Test dataset.
        model_file (str): Saved model file.
        vocab_size (int): Vocabulary size.
        embed_init: Initial embedding (vocab_size, embed_size).

    """
    # Data loader.
    train_data_loader = DataLoader(
        dataset=train_dataset,
        batch_size=RecommendTrainConfig.batch_size,
        shuffle=True,
        num_workers=RecommendTrainConfig.num_data_loader_workers)

    # Model.
    similarity_config = SimilarityConfig(vocab_size, embed_init)
    similarity = Similarity(similarity_config).to(GlobalConfig.device)

    # Model parameters.
    params = list(
        chain.from_iterable([
            list(model.parameters()) for model in [
                context_text_encoder, context_image_encoder, context_encoder,
                similarity
            ]
        ]))
    optimizer = Adam(params, lr=RecommendTrainConfig.learning_rate)
    epoch_id = 0
    min_valid_loss = None

    # Load saved state.
    if isfile(model_file):
        state = torch.load(model_file)
        similarity.load_state_dict(state['similarity'])
        optimizer.load_state_dict(state['optimizer'])
        epoch_id = state['epoch_id']
        min_valid_loss = state['min_valid_loss']

    # Loss.
    sum_loss = 0
    bad_loss_cnt = 0

    # Switch to train mode.
    context_text_encoder.train()
    context_image_encoder.train()
    context_encoder.train()
    similarity.train()

    finished = False

    for epoch_id in range(epoch_id, RecommendTrainConfig.num_iterations):
        for batch_id, train_data in enumerate(train_data_loader):
            # Set gradients to 0.
            optimizer.zero_grad()

            context_dialog, pos_products, neg_products = train_data
            texts, text_lengths, images, utter_types = context_dialog
            # Sizes:
            # texts: (batch_size, dialog_context_size + 1, dialog_text_max_len)
            # text_lengths: (batch_size, dialog_context_size + 1)
            # images: (batch_size, dialog_context_size + 1,
            #          pos_images_max_num, 3, image_size, image_size)
            # utter_types: (batch_size, )

            batch_size = texts.size(0)

            # To device.
            texts = texts.to(GlobalConfig.device)
            text_lengths = text_lengths.to(GlobalConfig.device)
            images = images.to(GlobalConfig.device)
            # utter_types = utter_types.to(GlobalConfig.device)

            texts.transpose_(0, 1)
            # (dialog_context_size + 1, batch_size, dialog_text_max_len)
            text_lengths.transpose_(0, 1)
            # (dialog_context_size + 1, batch_size)
            images.transpose_(0, 1)
            images.transpose_(1, 2)
            # (dialog_context_size + 1, pos_images_max_num, batch_size, 3,
            #  image_size, image_size)

            # Encode context.
            context, _ = encode_context(context_text_encoder,
                                        context_image_encoder,
                                        context_encoder,
                                        texts, text_lengths, images)
            # (batch_size, context_vector_size)

            loss = recommend_loss(similarity, batch_size, context,
                                  pos_products, neg_products)
            sum_loss += loss

            loss.backward()
            optimizer.step()

            # Print loss every `TrainConfig.print_freq` batches.
            if (batch_id + 1) % RecommendTrainConfig.print_freq == 0:
                cur_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                sum_loss /= RecommendTrainConfig.print_freq
                print('epoch: {} \tbatch: {} \tloss: {} \ttime: {}'.format(
                    epoch_id + 1, batch_id + 1, sum_loss, cur_time))
                sum_loss = 0

            # Valid every `TrainConfig.valid_freq` batches.
            if (batch_id + 1) % RecommendTrainConfig.valid_freq == 0:
                valid_loss = recommend_valid(context_text_encoder,
                                             context_image_encoder,
                                             context_encoder,
                                             similarity,
                                             valid_dataset)
                cur_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                print('valid_loss: {} \ttime: {}'.format(valid_loss, cur_time))

                # Save current best model.
                if min_valid_loss is None or valid_loss < min_valid_loss:
                    min_valid_loss = valid_loss
                    bad_loss_cnt = 0

                    save_dict = {
                        'task': RECOMMEND_TASK,
                        'epoch_id': epoch_id,
                        'min_valid_loss': min_valid_loss,
                        'optimizer': optimizer.state_dict(),
                        'context_text_encoder':
                            context_text_encoder.state_dict(),
                        'context_image_encoder':
                            context_image_encoder.state_dict(),
                        'context_encoder': context_encoder.state_dict(),
                        'similarity': similarity.state_dict()
                    }
                    torch.save(save_dict, model_file)
                    print('Best model saved.')
                else:
                    bad_loss_cnt += 1
                    if bad_loss_cnt > RecommendTrainConfig.patience:
                        recommend_test(context_text_encoder,
                                       context_image_encoder,
                                       context_encoder,
                                       similarity,
                                       test_dataset)
                        finished = True
                        break
        if finished:
            break
val_loader = torch.utils.data.DataLoader(
    val_data,
    batch_sampler=batch_sampler_val,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)

""" 7) Create Model """

VOCAB_SIZE = len(vocab.word2index)
IMAGE_EMB_DIM = 256
WORD_EMB_DIM = 256
HIDDEN_DIM = 512

word_embedding = torch.nn.Embedding(
    num_embeddings=VOCAB_SIZE,
    embedding_dim=WORD_EMB_DIM,
)
image_encoder = ImageEncoder(out_dim=IMAGE_EMB_DIM)
image_decoder = CaptionRNN(num_classes=VOCAB_SIZE,
                           word_emb_dim=WORD_EMB_DIM,
                           img_emb_dim=IMAGE_EMB_DIM,
                           hidden_dim=HIDDEN_DIM)

word_embedding.eval()
image_encoder.eval()
image_decoder.eval()

""" 9) Load Weights """

LOAD_WEIGHTS = True
EMBEDDING_WEIGHT_FILE = 'checkpoints/BIGDATASET-weights-embedding-epoch-3.pt'
ENCODER_WEIGHT_FILE = 'checkpoints/BIGDATASET-weights-encoder-epoch-3.pt'
DECODER_WEIGHT_FILE = 'checkpoints/BIGDATASET-weights-decoder-epoch-3.pt'

if LOAD_WEIGHTS:
    print("Loading pretrained weights...")
    word_embedding.load_state_dict(torch.load(EMBEDDING_WEIGHT_FILE))
def knowledge_celebrity_valid(
        context_text_encoder: TextEncoder,
        context_image_encoder: ImageEncoder,
        context_encoder: ContextEncoder,
        to_hidden: ToHidden,
        celebrity_memory: Memory,
        text_decoder: TextDecoder,
        valid_dataset: Dataset,
        celebrity_scores,
        text_length: int):
    """Knowledge celebrity valid.

    Args:
        context_text_encoder (TextEncoder): Context text encoder.
        context_image_encoder (ImageEncoder): Context image encoder.
        context_encoder (ContextEncoder): Context encoder.
        to_hidden (ToHidden): Context to hidden.
        celebrity_memory (Memory): Celebrity Memory.
        text_decoder (TextDecoder): Text decoder.
        valid_dataset (Dataset): Valid dataset.
        celebrity_scores: Celebrity scores.
        text_length (int): Text length.

    """
    # Valid dataset loader.
    valid_data_loader = DataLoader(
        valid_dataset,
        batch_size=KnowledgeCelebrityValidConfig.batch_size,
        shuffle=True,
        num_workers=KnowledgeCelebrityValidConfig.num_data_loader_workers)

    sum_loss = 0
    num_batches = 0

    # Switch to eval mode.
    context_text_encoder.eval()
    context_image_encoder.eval()
    context_encoder.eval()
    to_hidden.eval()
    celebrity_memory.eval()
    text_decoder.eval()

    with torch.no_grad():
        for batch_id, valid_data in enumerate(valid_data_loader):
            # Only valid `ValidConfig.num_batches` batches.
            if batch_id >= KnowledgeCelebrityValidConfig.num_batches:
                break
            num_batches += 1

            texts, text_lengths, images, utter_types = valid_data
            # Sizes:
            # texts: (batch_size, dialog_context_size + 1, dialog_text_max_len)
            # text_lengths: (batch_size, dialog_context_size + 1)
            # images: (batch_size, dialog_context_size + 1,
            #          pos_images_max_num, 3, image_size, image_size)
            # utter_types: (batch_size, )

            # To device.
            texts = texts.to(GlobalConfig.device)
            text_lengths = text_lengths.to(GlobalConfig.device)
            images = images.to(GlobalConfig.device)
            # utter_types = utter_types.to(GlobalConfig.device)

            texts.transpose_(0, 1)
            # (dialog_context_size + 1, batch_size, dialog_text_max_len)
            text_lengths.transpose_(0, 1)
            # (dialog_context_size + 1, batch_size)
            images.transpose_(0, 1)
            images.transpose_(1, 2)
            # (dialog_context_size + 1, pos_images_max_num, batch_size, 3,
            #  image_size, image_size)

            # Encode context.
            context, hiddens = encode_context(context_text_encoder,
                                              context_image_encoder,
                                              context_encoder,
                                              texts, text_lengths, images)
            # (batch_size, context_vector_size)

            knowledge_entry = celebrity_scores
            encode_knowledge_func = partial(celebrity_memory, knowledge_entry)

            loss, n_totals = text_loss(to_hidden, text_decoder, text_length,
                                       context, texts[-1], text_lengths[-1],
                                       hiddens, encode_knowledge_func)
            sum_loss += loss / text_length

    # Switch to train mode.
    context_text_encoder.train()
    context_image_encoder.train()
    context_encoder.train()
    to_hidden.train()
    celebrity_memory.train()
    text_decoder.train()

    return sum_loss / num_batches
def load_network(self):
    image_generator = ImageGenerator()
    image_generator.apply(weights_init)

    disc_image = DiscriminatorImage()
    disc_image.apply(weights_init)

    emb_dim = 300
    text_encoder = TextEncoder(emb_dim, self.txt_emb, 1, dropout=0.0)

    attn_model = 'general'
    text_generator = TextGenerator(attn_model, emb_dim,
                                   len(self.txt_dico.id2word), self.txt_emb,
                                   n_layers=1, dropout=0.0)

    image_encoder = ImageEncoder()
    image_encoder.apply(weights_init)

    disc_latent = DiscriminatorLatent(emb_dim)

    if cfg.NET_G != '':
        state_dict = \
            torch.load(cfg.NET_G, map_location=lambda storage, loc: storage)
        image_generator.load_state_dict(state_dict)
        print('Load from: ', cfg.NET_G)
    if cfg.NET_D != '':
        state_dict = \
            torch.load(cfg.NET_D, map_location=lambda storage, loc: storage)
        disc_image.load_state_dict(state_dict)
        print('Load from: ', cfg.NET_D)
    if cfg.ENCODER != '':
        state_dict = \
            torch.load(cfg.ENCODER, map_location=lambda storage, loc: storage)
        text_encoder.load_state_dict(state_dict)
        print('Load from: ', cfg.ENCODER)
    if cfg.DECODER != '':
        state_dict = \
            torch.load(cfg.DECODER, map_location=lambda storage, loc: storage)
        text_generator.load_state_dict(state_dict)
        print('Load from: ', cfg.DECODER)
    if cfg.IMAGE_ENCODER != '':
        state_dict = \
            torch.load(cfg.IMAGE_ENCODER,
                       map_location=lambda storage, loc: storage)
        image_encoder.load_state_dict(state_dict)
        print('Load from: ', cfg.IMAGE_ENCODER)

    if cfg.CUDA:
        image_encoder.cuda()
        image_generator.cuda()
        text_encoder.cuda()
        text_generator.cuda()
        disc_image.cuda()
        disc_latent.cuda()

    return (image_encoder, image_generator, text_encoder, text_generator,
            disc_image, disc_latent)
def knowledge_attribute_train(context_text_encoder: TextEncoder,
                              context_image_encoder: ImageEncoder,
                              context_encoder: ContextEncoder,
                              train_dataset: Dataset,
                              valid_dataset: Dataset,
                              test_dataset: Dataset,
                              model_file: str,
                              attribute_data: AttributeData,
                              vocab: Dict[str, int],
                              embed_init=None):
    """Knowledge attribute train.

    Args:
        context_text_encoder (TextEncoder): Context text encoder.
        context_image_encoder (ImageEncoder): Context image encoder.
        context_encoder (ContextEncoder): Context encoder.
        train_dataset (Dataset): Train dataset.
        valid_dataset (Dataset): Valid dataset.
        test_dataset (Dataset): Test dataset.
        model_file (str): Saved model file.
        attribute_data (AttributeData): Attribute data.
        vocab (Dict[str, int]): Vocabulary.
        embed_init: Initial embedding (vocab_size, embed_size).

    """
    # Data loader.
    train_data_loader = DataLoader(
        dataset=train_dataset,
        batch_size=KnowledgeAttributeTrainConfig.batch_size,
        shuffle=True,
        num_workers=KnowledgeAttributeTrainConfig.num_data_loader_workers)

    # Model.
    vocab_size = len(vocab)
    attribute_kv_memory_config = AttributeKVMemoryConfig(
        len(attribute_data.key_vocab), len(attribute_data.value_vocab))
    text_decoder_config = KnowledgeTextDecoderConfig(vocab_size,
                                                     MemoryConfig.memory_size,
                                                     MemoryConfig.output_size,
                                                     embed_init)

    to_hidden = ToHidden(text_decoder_config)
    to_hidden = to_hidden.to(GlobalConfig.device)
    attribute_kv_memory = KVMemory(attribute_kv_memory_config)
    attribute_kv_memory = attribute_kv_memory.to(GlobalConfig.device)
    text_decoder = TextDecoder(text_decoder_config)
    text_decoder = text_decoder.to(GlobalConfig.device)

    # Model parameters.
    params = list(
        chain.from_iterable([
            list(model.parameters()) for model in [
                context_text_encoder, context_image_encoder, context_encoder,
                to_hidden, attribute_kv_memory, text_decoder
            ]
        ]))
    optimizer = Adam(params, lr=KnowledgeAttributeTrainConfig.learning_rate)
    epoch_id = 0
    min_valid_loss = None

    # Load saved state.
    if isfile(model_file):
        state = torch.load(model_file)
        to_hidden.load_state_dict(state['to_hidden'])
        attribute_kv_memory.load_state_dict(state['attribute_kv_memory'])
        text_decoder.load_state_dict(state['text_decoder'])
        optimizer.load_state_dict(state['optimizer'])
        epoch_id = state['epoch_id']
        min_valid_loss = state['min_valid_loss']

    # Loss.
    sum_loss = 0
    bad_loss_cnt = 0

    # Switch to train mode.
    context_text_encoder.train()
    context_image_encoder.train()
    context_encoder.train()
    to_hidden.train()
    attribute_kv_memory.train()
    text_decoder.train()

    finished = False

    for epoch_id in range(epoch_id,
                          KnowledgeAttributeTrainConfig.num_iterations):
        for batch_id, train_data in enumerate(train_data_loader):
            # Set gradients to 0.
            optimizer.zero_grad()

            train_data, products = train_data
            keys, values, pair_length = products
            keys = keys.to(GlobalConfig.device)
            values = values.to(GlobalConfig.device)
            pair_length = pair_length.to(GlobalConfig.device)

            texts, text_lengths, images, utter_types = train_data
            # Sizes:
            # texts: (batch_size, dialog_context_size + 1, dialog_text_max_len)
            # text_lengths: (batch_size, dialog_context_size + 1)
            # images: (batch_size, dialog_context_size + 1,
            #          pos_images_max_num, 3, image_size, image_size)
            # utter_types: (batch_size, )

            # To device.
            texts = texts.to(GlobalConfig.device)
            text_lengths = text_lengths.to(GlobalConfig.device)
            images = images.to(GlobalConfig.device)
            utter_types = utter_types.to(GlobalConfig.device)

            texts.transpose_(0, 1)
            # (dialog_context_size + 1, batch_size, dialog_text_max_len)
            text_lengths.transpose_(0, 1)
            # (dialog_context_size + 1, batch_size)
            images.transpose_(0, 1)
            images.transpose_(1, 2)
            # (dialog_context_size + 1, pos_images_max_num, batch_size, 3,
            #  image_size, image_size)

            # Encode context.
            context, hiddens = encode_context(context_text_encoder,
                                              context_image_encoder,
                                              context_encoder,
                                              texts, text_lengths, images)
            # (batch_size, context_vector_size)

            encode_knowledge_func = partial(attribute_kv_memory,
                                            keys, values, pair_length)

            loss, n_totals = text_loss(to_hidden, text_decoder,
                                       text_decoder_config.text_length,
                                       context, texts[-1], text_lengths[-1],
                                       hiddens, encode_knowledge_func)
            sum_loss += loss / text_decoder_config.text_length

            loss.backward()
            optimizer.step()

            # Print loss every `TrainConfig.print_freq` batches.
            if (batch_id + 1) % KnowledgeAttributeTrainConfig.print_freq == 0:
                cur_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                sum_loss /= KnowledgeAttributeTrainConfig.print_freq
                print('epoch: {} \tbatch: {} \tloss: {} \ttime: {}'.format(
                    epoch_id + 1, batch_id + 1, sum_loss, cur_time))
                sum_loss = 0

            # Valid every `TrainConfig.valid_freq` batches.
            if (batch_id + 1) % KnowledgeAttributeTrainConfig.valid_freq == 0:
                valid_loss = knowledge_attribute_valid(
                    context_text_encoder, context_image_encoder,
                    context_encoder, to_hidden, attribute_kv_memory,
                    text_decoder, valid_dataset,
                    text_decoder_config.text_length)
                cur_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                print('valid_loss: {} \ttime: {}'.format(valid_loss, cur_time))

                # Save current best model.
                if min_valid_loss is None or valid_loss < min_valid_loss:
                    min_valid_loss = valid_loss
                    bad_loss_cnt = 0

                    save_dict = {
                        'task': KNOWLEDGE_ATTRIBUTE_SUBTASK,
                        'epoch_id': epoch_id,
                        'min_valid_loss': min_valid_loss,
                        'optimizer': optimizer.state_dict(),
                        'context_text_encoder':
                            context_text_encoder.state_dict(),
                        'context_image_encoder':
                            context_image_encoder.state_dict(),
                        'context_encoder': context_encoder.state_dict(),
                        'to_hidden': to_hidden.state_dict(),
                        'attribute_kv_memory':
                            attribute_kv_memory.state_dict(),
                        'text_decoder': text_decoder.state_dict()
                    }
                    torch.save(save_dict, model_file)
                    print('Best model saved.')
                else:
                    bad_loss_cnt += 1
                    if bad_loss_cnt > KnowledgeAttributeTrainConfig.patience:
                        knowledge_attribute_test(
                            context_text_encoder, context_image_encoder,
                            context_encoder, to_hidden, attribute_kv_memory,
                            text_decoder, test_dataset,
                            text_decoder_config.text_length, vocab)
                        finished = True
                        break
        if finished:
            break
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    train_transform = transforms.Compose([
        transforms.RandomCrop(args.image_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])
    # val_transform = transforms.Compose([
    #     transforms.Resize(args.image_size, interpolation=Image.LANCZOS),
    #     transforms.ToTensor(),
    #     transforms.Normalize((0.485, 0.456, 0.406),
    #                          (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    train_data_loader = get_loader(args.train_image_dir, args.train_vqa_path,
                                   args.ix_to_ans_file,
                                   args.train_description_file, vocab,
                                   train_transform, args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers)
    # val_data_loader = get_loader(args.val_image_dir, args.val_vqa_path,
    #                              args.ix_to_ans_file, vocab, val_transform,
    #                              args.batch_size, shuffle=False,
    #                              num_workers=args.num_workers)

    image_encoder = ImageEncoder(args.img_feature_size)
    question_emb_size = 1024
    # description_emb_size = 512
    no_ans = 1000
    question_encoder = BertEncoder(question_emb_size)
    # ques_description_encoder = BertEncoder(description_emb_size)
    # vqa_decoder = VQA_Model(args.img_feature_size, question_emb_size,
    #                         description_emb_size, no_ans)
    vqa_decoder = VQA_Model(args.img_feature_size, question_emb_size, no_ans)

    pretrained_epoch = 0
    if args.pretrained_epoch > 0:
        pretrained_epoch = args.pretrained_epoch
        image_encoder.load_state_dict(
            torch.load('./models/image_encoder-' + str(pretrained_epoch) +
                       '.pkl'))
        question_encoder.load_state_dict(
            torch.load('./models/question_encoder-' + str(pretrained_epoch) +
                       '.pkl'))
        # ques_description_encoder.load_state_dict(
        #     torch.load('./models/ques_description_encoder-' +
        #                str(pretrained_epoch) + '.pkl'))
        vqa_decoder.load_state_dict(
            torch.load('./models/vqa_decoder-' + str(pretrained_epoch) +
                       '.pkl'))

    if torch.cuda.is_available():
        image_encoder.cuda()
        question_encoder.cuda()
        # ques_description_encoder.cuda()
        vqa_decoder.cuda()
        print("Cuda is enabled...")

    criterion = nn.CrossEntropyLoss()
    # params = image_encoder.get_params() + question_encoder.get_params() + \
    #     ques_description_encoder.get_params() + vqa_decoder.get_params()
    params = (list(image_encoder.parameters()) +
              list(question_encoder.parameters()) +
              list(vqa_decoder.parameters()))
    optimizer = torch.optim.Adam(params, lr=args.learning_rate,
                                 weight_decay=args.weight_decay)

    total_train_step = len(train_data_loader)
    min_avg_loss = float("inf")
    overfit_warn = 0

    for epoch in range(args.num_epochs):
        if epoch < pretrained_epoch:
            continue

        image_encoder.train()
        question_encoder.train()
        # ques_description_encoder.train()
        vqa_decoder.train()
        avg_loss = 0.0
        avg_acc = 0.0

        for bi, (question_arr, image_vqa, target_answer,
                 answer_str) in enumerate(train_data_loader):
            loss = 0
            image_encoder.zero_grad()
            question_encoder.zero_grad()
            # ques_description_encoder.zero_grad()
            vqa_decoder.zero_grad()

            images = to_var(torch.stack(image_vqa))
            question_arr = to_var(torch.stack(question_arr))
            # ques_desc_arr = to_var(torch.stack(ques_desc_arr))
            target_answer = to_var(torch.tensor(target_answer))

            image_emb = image_encoder(images)
            question_emb = question_encoder(question_arr)
            # ques_desc_emb = ques_description_encoder(ques_desc_arr)
            # output = vqa_decoder(image_emb, question_emb, ques_desc_emb)
            output = vqa_decoder(image_emb, question_emb)
            loss = criterion(output, target_answer)

            _, prediction = torch.max(output, 1)
            no_correct_prediction = prediction.eq(target_answer).sum().item()
            # Assumes full batches; the final batch may be smaller.
            accuracy = no_correct_prediction * 100 / args.batch_size

            target_answer_no = target_answer.tolist()  # unused; kept for debugging
            prediction_no = prediction.tolist()        # unused; kept for debugging

            avg_loss += loss.item()
            avg_acc += no_correct_prediction
            loss.backward()
            optimizer.step()

            # Print log info.
            if bi % args.log_step == 0:
                print('Epoch [%d/%d], Train Step [%d/%d], Loss: %.4f, Acc: %.4f'
                      % (epoch + 1, args.num_epochs, bi, total_train_step,
                         loss.item(), accuracy))

        # CrossEntropyLoss defaults to reduction='mean', so loss.item() is
        # already averaged over the batch; divide by the step count only.
        avg_loss /= total_train_step
        avg_acc /= (args.batch_size * total_train_step)
        print('Epoch [%d/%d], Average Train Loss: %.4f, Average Train acc: %.4f'
              % (epoch + 1, args.num_epochs, avg_loss, avg_acc))

        # Save the models.
        torch.save(image_encoder.state_dict(),
                   os.path.join(args.model_path,
                                'image_encoder-%d.pkl' % (epoch + 1)))
        torch.save(question_encoder.state_dict(),
                   os.path.join(args.model_path,
                                'question_encoder-%d.pkl' % (epoch + 1)))
        # torch.save(ques_description_encoder.state_dict(),
        #            os.path.join(args.model_path,
        #                         'ques_description_encoder-%d.pkl' % (epoch + 1)))
        torch.save(vqa_decoder.state_dict(),
                   os.path.join(args.model_path,
                                'vqa_decoder-%d.pkl' % (epoch + 1)))

        # Early-stopping bookkeeping: count consecutive epochs whose average
        # loss fails to improve on the best seen so far.
        overfit_warn = overfit_warn + 1 if (min_avg_loss < avg_loss) else 0
        min_avg_loss = min(min_avg_loss, avg_loss)

        os.makedirs('result', exist_ok=True)  # ensure the log directory exists
        lossFileName = 'result/result_' + str(epoch) + '.txt'
        with open(lossFileName, 'w') as test_fd:
            test_fd.write('Epoch: ' + str(epoch)
                          + ' avg_loss: ' + str(avg_loss)
                          + ' avg_acc: ' + str(avg_acc) + '\n')

        if overfit_warn >= 5:
            print("terminated as overfitted")
            break
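# --- Illustrative sketch (not part of the training code above) ---
# Why `avg_loss` is divided by the step count only: nn.CrossEntropyLoss
# defaults to reduction='mean', so each loss.item() is already a per-sample
# average. The check below verifies that mean = sum / batch_size.
import torch
import torch.nn as nn

_logits = torch.randn(4, 10)
_targets = torch.randint(0, 10, (4,))
_mean = nn.CrossEntropyLoss()(_logits, _targets)
_sum = nn.CrossEntropyLoss(reduction='sum')(_logits, _targets)
assert torch.allclose(_mean, _sum / 4)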
def intention_test(context_text_encoder: TextEncoder,
                   context_image_encoder: ImageEncoder,
                   context_encoder: ContextEncoder,
                   intention: Intention,
                   test_dataset: Dataset):
    """Intention test.

    Args:
        context_text_encoder (TextEncoder): Context text encoder.
        context_image_encoder (ImageEncoder): Context image encoder.
        context_encoder (ContextEncoder): Context encoder.
        intention (Intention): Intention module.
        test_dataset (Dataset): Test dataset.

    """

    # Test dataset loader.
    test_data_loader = DataLoader(
        test_dataset,
        batch_size=IntentionTestConfig.batch_size,
        shuffle=False,
        num_workers=IntentionTestConfig.num_data_loader_workers)

    sum_accuracy = 0

    # Switch to eval mode.
    context_text_encoder.eval()
    context_image_encoder.eval()
    context_encoder.eval()
    intention.eval()

    with torch.no_grad():
        for batch_id, valid_data in enumerate(test_data_loader):
            texts, text_lengths, images, utter_types = valid_data
            # Sizes:
            # texts: (batch_size, dialog_context_size + 1, dialog_text_max_len)
            # text_lengths: (batch_size, dialog_context_size + 1)
            # images: (batch_size, dialog_context_size + 1,
            #          pos_images_max_num, 3, image_size, image_size)
            # utter_types: (batch_size, )

            # To device.
            texts = texts.to(GlobalConfig.device)
            text_lengths = text_lengths.to(GlobalConfig.device)
            images = images.to(GlobalConfig.device)
            utter_types = utter_types.to(GlobalConfig.device)

            texts.transpose_(0, 1)
            # (dialog_context_size + 1, batch_size, dialog_text_max_len)
            text_lengths.transpose_(0, 1)
            # (dialog_context_size + 1, batch_size)
            images.transpose_(0, 1)
            images.transpose_(1, 2)
            # (dialog_context_size + 1, pos_images_max_num, batch_size, 3,
            #  image_size, image_size)

            # Encode context.
            context, _ = encode_context(context_text_encoder,
                                        context_image_encoder,
                                        context_encoder,
                                        texts, text_lengths, images)
            # (batch_size, context_vector_size)

            intent_prob = intention(context)
            # (batch_size, utterance_type_size)
            intentions = torch.argmax(intent_prob, dim=1)

            eqs = torch.eq(intentions, utter_types)
            num_correct = torch.sum(eqs).item()
            accuracy = num_correct * 1.0 / eqs.size(0)
            sum_accuracy += accuracy

            # Print per-batch statistics. Note that the running "total
            # accuracy" is an unweighted mean over batches, so a smaller
            # final batch counts as much as a full one.
            print('pred:', intentions)
            print('true:', utter_types)
            print('# correct:', num_correct)
            print('accuracy:', accuracy)
            print('total accuracy:', sum_accuracy / (batch_id + 1))
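# --- Illustrative sketch (not part of the test code above) ---
# `intention_test` averages per-batch accuracies, which over-weights a smaller
# final batch. An exact alternative accumulates correct and total counts; the
# helper below is a hypothetical illustration, not the project's API.
import torch

def _exact_accuracy(pred_batches, true_batches):
    correct, total = 0, 0
    for pred, true in zip(pred_batches, true_batches):
        correct += torch.sum(torch.eq(pred, true)).item()
        total += true.size(0)
    return correct / total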
def main():
    # ignore warnings
    # warnings.simplefilter('ignore')
    args = get_arguments()
    SETTING = Dict(yaml.safe_load(
        open(os.path.join('arguments', args.arg + '.yaml'), encoding='utf8')))
    print(args)
    args.device = list(map(str, args.device))
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(args.device)

    # image transformers
    train_transform = transforms.Compose([
        transforms.Resize(SETTING.imsize_pre),
        transforms.RandomCrop(SETTING.imsize),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    val_transform = transforms.Compose([
        transforms.Resize(SETTING.imsize_pre),
        transforms.CenterCrop(SETTING.imsize),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    # data load
    if args.dataset == 'coco':
        train_dset = CocoDset(root=SETTING.root_path, img_dir='train2017',
                              ann_dir='annotations/captions_train2017.json',
                              transform=train_transform)
        val_dset = CocoDset(root=SETTING.root_path, img_dir='val2017',
                            ann_dir='annotations/captions_val2017.json',
                            transform=val_transform)
    train_loader = DataLoader(train_dset, batch_size=SETTING.batch_size,
                              shuffle=True, num_workers=SETTING.n_cpu,
                              collate_fn=collater)
    val_loader = DataLoader(val_dset, batch_size=SETTING.batch_size,
                            shuffle=False, num_workers=SETTING.n_cpu,
                            collate_fn=collater)

    # set up vocab dict
    vocab = Vocabulary(max_len=SETTING.max_len)
    vocab.load_vocab(args.vocab_path)

    # set up encoders
    imenc = ImageEncoder(SETTING.out_size, SETTING.cnn_type)
    capenc = CaptionEncoder(len(vocab), SETTING.emb_size, SETTING.out_size,
                            SETTING.rnn_type, vocab.padidx)
    device = torch.device("cuda" if torch.cuda.is_available()
                          and not args.no_cuda else "cpu")
    imenc = imenc.to(device)
    capenc = capenc.to(device)

    # per-module learning rates
    cfgs = [{'params': imenc.fc.parameters(), 'lr': float(SETTING.lr_cnn)},
            {'params': capenc.parameters(), 'lr': float(SETTING.lr_rnn)}]

    # optimizer
    if SETTING.optimizer == 'SGD':
        optimizer = optim.SGD(cfgs, momentum=SETTING.momentum,
                              weight_decay=SETTING.weight_decay)
    elif SETTING.optimizer == 'Adam':
        optimizer = optim.Adam(cfgs, betas=(SETTING.beta1, SETTING.beta2),
                               weight_decay=SETTING.weight_decay)
    elif SETTING.optimizer == 'RMSprop':
        optimizer = optim.RMSprop(cfgs, alpha=SETTING.alpha,
                                  weight_decay=SETTING.weight_decay)
    else:
        raise ValueError("unsupported optimizer: {}".format(SETTING.optimizer))

    # scheduler
    if SETTING.scheduler == 'Plateau':
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='max', factor=SETTING.dampen_factor,
            patience=SETTING.patience, verbose=True)
    elif SETTING.scheduler == 'Step':
        scheduler = optim.lr_scheduler.StepLR(
            optimizer, step_size=SETTING.patience,
            gamma=SETTING.dampen_factor)

    # loss
    lossfunc = PairwiseRankingLoss(margin=SETTING.margin,
                                   method=SETTING.method,
                                   improved=args.improved,
                                   intra=SETTING.intra,
                                   lamb=SETTING.imp_weight)

    # resume from checkpoint if given
    if args.checkpoint is not None:
        print("loading model and optimizer checkpoint from {} ..."
              .format(args.checkpoint), flush=True)
        ckpt = torch.load(args.checkpoint, map_location=device)
        imenc.load_state_dict(ckpt["encoder_state"])
        capenc.load_state_dict(ckpt["decoder_state"])
        optimizer.load_state_dict(ckpt["optimizer_state"])
        if SETTING.scheduler != 'None':
            scheduler.load_state_dict(ckpt["scheduler_state"])
        offset = ckpt["epoch"]
        data = ckpt["stats"]
        bestscore = 0
        for rank in [1, 5, 10, 20]:
            bestscore += (data["i2c_recall@{}".format(rank)]
                          + data["c2i_recall@{}".format(rank)])
        bestscore = int(bestscore)
    # start new training
    else:
        offset = 0
        bestscore = -1

    if args.dataparallel:
        print("Using Multiple GPU . . .")
") imenc = nn.DataParallel(imenc) capenc = nn.DataParallel(capenc) metrics = {} es_cnt = 0 # training assert offset < SETTING.max_epochs for ep in range(offset, SETTING.max_epochs): epoch = ep+1 # unfreeze cnn parameters if epoch == SETTING.freeze_epoch: if args.dataparallel: optimizer.add_param_group({'params': imenc.module.cnn.parameters(), 'lr': float(SETTING.lr_cnn)}) else: optimizer.add_param_group({'params': imenc.cnn.parameters(), 'lr': float(SETTING.lr_cnn)}) #train(1epoch) train(epoch, train_loader, imenc, capenc, optimizer, lossfunc, vocab, args, SETTING) #validate data = validate(epoch, val_loader, imenc, capenc, vocab, args, SETTING) totalscore = 0 for rank in [1, 5, 10, 20]: totalscore += data["i2c_recall@{}".format(rank)] + data["c2i_recall@{}".format(rank)] totalscore = int(totalscore) #scheduler update if SETTING.scheduler == 'Plateau': scheduler.step(totalscore) if SETTING.scheduler == 'Step': scheduler.step() # update checkpoint if args.dataparallel: ckpt = { "stats": data, "epoch": epoch, "encoder_state": imenc.module.state_dict(), "decoder_state": capenc.module.state_dict(), "optimizer_state": optimizer.state_dict() } else: ckpt = { "stats": data, "epoch": epoch, "encoder_state": imenc.state_dict(), "decoder_state": capenc.state_dict(), "optimizer_state": optimizer.state_dict() } if SETTING.scheduler != 'None': ckpt['scheduler_state'] = scheduler.state_dict() # make savedir savedir = os.path.join("models", args.arg) if not os.path.exists(savedir): os.makedirs(savedir) # for k, v in data.items(): if k not in metrics.keys(): metrics[k] = [v] else: metrics[k].append(v) # save checkpoint savepath = os.path.join(savedir, "epoch_{:04d}_score_{:03d}.ckpt".format(epoch, totalscore)) if int(totalscore) > int(bestscore): print("score: {:03d}, saving model and optimizer checkpoint to {} ...".format(totalscore, savepath), flush=True) bestscore = totalscore torch.save(ckpt, savepath) es_cnt = 0 else: print("score: {:03d}, no improvement from best score of {:03d}, not saving".format(totalscore, bestscore), flush=True) es_cnt += 1 # early stopping if es_cnt == SETTING.es_cnt: print("early stopping at epoch {} because of no improvement for {} epochs".format(epoch, SETTING.es_cnt)) break print("done for epoch {:04d}".format(epoch), flush=True) visualize(metrics, args, SETTING) print("complete training")