def build_decoder(n_vocabs):
    model = Decoder(model_name=C.decoder_model,
                    n_layers=C.decoder_n_layers,
                    encoder_size=C.encoder_output_size,
                    embedding_size=C.embedding_size,
                    embedding_scale=C.embedding_scale,
                    hidden_size=C.decoder_hidden_size,
                    attn_size=C.decoder_attn_size,
                    output_size=n_vocabs,
                    embedding_dropout=C.embedding_dropout,
                    dropout=C.decoder_dropout,
                    out_dropout=C.decoder_out_dropout)
    model = model.to(C.device)
    loss = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=C.decoder_learning_rate,
                                 weight_decay=C.decoder_weight_decay,
                                 amsgrad=C.decoder_use_amsgrad)
    # torch.autograd.Variable is deprecated, and calling .to() on a tensor that
    # requires grad returns a non-leaf copy; create the leaf tensor directly on
    # the target device instead.
    lambda_reg = torch.tensor(0.001, requires_grad=True, device=C.device)
    decoder = {
        'model': model,
        'loss': loss,
        'optimizer': optimizer,
        'lambda_reg': lambda_reg,
    }
    return decoder
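# Usage sketch (added, not from the original source): a minimal training step
# with the dict returned by build_decoder. The vocab size, the feature/caption
# tensors, and the Decoder forward signature are hypothetical; only the dict
# keys come from build_decoder itself.
def _train_step_sketch(decoder, features, captions, n_vocabs=10000):
    model, loss_fn = decoder['model'], decoder['loss']
    optimizer = decoder['optimizer']
    logits = model(features, captions)  # hypothetical forward signature
    loss = loss_fn(logits.view(-1, n_vocabs), captions.view(-1))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()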
def detect(path, encoder=None, decoder=None):
    torch.backends.cudnn.benchmark = True
    dataset = LoadImages(path, img_size=config.IMAGE_SIZE,
                         used_layers=config.USED_LAYERS)
    if not encoder or not decoder:
        in_channels = num_channels(config.USED_LAYERS)
        encoder = Encoder(in_channels=in_channels)
        decoder = Decoder(num_classes=config.NUM_CLASSES + 1)
        encoder = encoder.to(config.DEVICE)
        decoder = decoder.to(config.DEVICE)
        _, encoder, decoder = load_checkpoint(encoder, decoder,
                                              config.CHECKPOINT_FILE,
                                              config.DEVICE)
    encoder.eval()
    decoder.eval()
    # img_path (renamed from path) avoids shadowing the function argument
    for _, layers, img_path in dataset:
        with torch.no_grad():
            layers = torch.from_numpy(layers).to(config.DEVICE, non_blocking=True)
            if layers.ndimension() == 3:
                layers = layers.unsqueeze(0)
            features = encoder(layers)
            predictions = decoder(features)
            out = predictions.sigmoid()
            plot_volumes(to_volume(out, config.VOXEL_THRESH).cpu(),
                         [img_path], config.NAMES)
def run(ckpt_fpath):
    checkpoint = torch.load(ckpt_fpath)

    """ Load Config """
    config = dict_to_cls(checkpoint['config'])

    """ Build Data Loader """
    if config.corpus == "MSVD":
        corpus = MSVD(config)
    elif config.corpus == "MSR-VTT":
        corpus = MSRVTT(config)
    train_iter, val_iter, test_iter, vocab = \
        corpus.train_data_loader, corpus.val_data_loader, corpus.test_data_loader, corpus.vocab
    print('#vocabs: {} ({}), #words: {} ({}). Trim words which appear less than {} times.'.format(
        vocab.n_vocabs, vocab.n_vocabs_untrimmed, vocab.n_words,
        vocab.n_words_untrimmed, config.loader.min_count))

    """ Build Models """
    decoder = Decoder(rnn_type=config.decoder.rnn_type,
                      num_layers=config.decoder.rnn_num_layers,
                      num_directions=config.decoder.rnn_num_directions,
                      feat_size=config.feat.size,
                      feat_len=config.loader.frame_sample_len,
                      embedding_size=config.vocab.embedding_size,
                      hidden_size=config.decoder.rnn_hidden_size,
                      attn_size=config.decoder.rnn_attn_size,
                      output_size=vocab.n_vocabs,
                      rnn_dropout=config.decoder.rnn_dropout)
    decoder.load_state_dict(checkpoint['decoder'])
    model = CaptionGenerator(decoder, config.loader.max_caption_len, vocab)
    model = model.cuda()

    """ Train Set """
    # train_vid2pred = get_predicted_captions(train_iter, model, model.vocab, beam_width=5, beam_alpha=0.)
    # train_vid2GTs = get_groundtruth_captions(train_iter, model.vocab)
    # train_scores = score(train_vid2pred, train_vid2GTs)
    # print("[TRAIN] {}".format(train_scores))

    """ Validation Set """
    # val_vid2pred = get_predicted_captions(val_iter, model, model.vocab, beam_width=5, beam_alpha=0.)
    # val_vid2GTs = get_groundtruth_captions(val_iter, model.vocab)
    # val_scores = score(val_vid2pred, val_vid2GTs)
    # print("[VAL] scores: {}".format(val_scores))

    """ Test Set """
    test_vid2pred = get_predicted_captions(test_iter, model, model.vocab, beam_width=5, beam_alpha=0.)
    test_vid2GTs = get_groundtruth_captions(test_iter, model.vocab)
    test_scores = score(test_vid2pred, test_vid2GTs)
    print("[TEST] {}".format(test_scores))

    test_save_fpath = os.path.join(C.result_dpath, "{}_{}.csv".format(config.corpus, 'test'))
    save_result(test_vid2pred, test_vid2GTs, test_save_fpath)
def evaluate_hand_draw_net(cfg):
    # Enable the inbuilt cudnn auto-tuner to find the best algorithm to use
    torch.backends.cudnn.benchmark = True

    IMG_SIZE = cfg.CONST.IMG_H, cfg.CONST.IMG_W
    CROP_SIZE = cfg.CONST.CROP_IMG_H, cfg.CONST.CROP_IMG_W
    eval_transforms = utils.data_transforms.Compose([
        utils.data_transforms.CenterCrop(IMG_SIZE, CROP_SIZE),
        utils.data_transforms.RandomBackground(cfg.TEST.RANDOM_BG_COLOR_RANGE),
        utils.data_transforms.Normalize(mean=cfg.DATASET.MEAN, std=cfg.DATASET.STD),
        utils.data_transforms.ToTensor(),
    ])

    # Set up networks
    encoder = Encoder(cfg)
    decoder = Decoder(cfg)
    azi_classes, ele_classes = int(360 / cfg.CONST.BIN_SIZE), int(180 / cfg.CONST.BIN_SIZE)
    view_estimater = ViewEstimater(cfg, azi_classes=azi_classes, ele_classes=ele_classes)

    if torch.cuda.is_available():
        encoder = torch.nn.DataParallel(encoder).cuda()
        decoder = torch.nn.DataParallel(decoder).cuda()
        view_estimater = torch.nn.DataParallel(view_estimater).cuda()

    # Load weights for encoder and decoder
    print('[INFO] %s Loading reconstruction weights from %s ...' %
          (dt.now(), cfg.EVALUATE_HAND_DRAW.RECONSTRUCTION_WEIGHTS))
    rec_checkpoint = torch.load(cfg.EVALUATE_HAND_DRAW.RECONSTRUCTION_WEIGHTS)
    encoder.load_state_dict(rec_checkpoint['encoder_state_dict'])
    decoder.load_state_dict(rec_checkpoint['decoder_state_dict'])
    print('[INFO] Best reconstruction result at epoch %d ...' % rec_checkpoint['epoch_idx'])

    # Load weights for view estimater
    print('[INFO] %s Loading view estimation weights from %s ...' %
          (dt.now(), cfg.EVALUATE_HAND_DRAW.VIEW_ESTIMATION_WEIGHTS))
    view_checkpoint = torch.load(cfg.EVALUATE_HAND_DRAW.VIEW_ESTIMATION_WEIGHTS)
    view_estimater.load_state_dict(view_checkpoint['view_estimator_state_dict'])
    print('[INFO] Best view estimation result at epoch %d ...' % view_checkpoint['epoch_idx'])

    for img_path in os.listdir(cfg.EVALUATE_HAND_DRAW.INPUT_IMAGE_FOLDER):
        eval_id = int(img_path[:-4])
        input_img_path = os.path.join(cfg.EVALUATE_HAND_DRAW.INPUT_IMAGE_FOLDER, img_path)
        print(input_img_path)
        evaluate_hand_draw_img(cfg, encoder, decoder, view_estimater,
                               input_img_path, eval_transforms, eval_id)
def __init__(self, config):
    super().__init__()
    self.encoder_word = Encoder(config, config.src_vocab_size)
    self.encoder_char = Encoder(config, config.tgt_vocab_size)
    self.pointer = Pointer(config)
    self.attention = Luong_Attention(config)
    self.decoder = Decoder(config)
    self.linear_out = nn.Linear(config.model_size, config.tgt_vocab_size)
    self.softmax = nn.Softmax(dim=-1)
    self.s_len = config.s_len
    self.bos = config.bos
class Visualization_demo():
    def __init__(self, cfg, output_dir):
        self.encoder = Encoder(cfg)
        self.decoder = Decoder(cfg)
        self.refiner = Refiner(cfg)
        self.merger = Merger(cfg)
        checkpoint = torch.load(cfg.CHECKPOINT)
        encoder_state_dict = clean_state_dict(checkpoint['encoder_state_dict'])
        self.encoder.load_state_dict(encoder_state_dict)
        decoder_state_dict = clean_state_dict(checkpoint['decoder_state_dict'])
        self.decoder.load_state_dict(decoder_state_dict)
        if cfg.NETWORK.USE_REFINER:
            refiner_state_dict = clean_state_dict(checkpoint['refiner_state_dict'])
            self.refiner.load_state_dict(refiner_state_dict)
        if cfg.NETWORK.USE_MERGER:
            merger_state_dict = clean_state_dict(checkpoint['merger_state_dict'])
            self.merger.load_state_dict(merger_state_dict)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        self.output_dir = output_dir

    def run_on_images(self, imgs, sid, mid, iid, sampled_idx):
        # was bare `output_dir` (undefined here); use the attribute set in __init__
        dir1 = os.path.join(self.output_dir, str(sid), str(mid))
        if not os.path.exists(dir1):
            os.makedirs(dir1)
        deprocess = imagenet_deprocess(rescale_image=False)
        image_features = self.encoder(imgs)
        raw_features, generated_volume = self.decoder(image_features)
        generated_volume = self.merger(raw_features, generated_volume)
        generated_volume = self.refiner(generated_volume)
        mesh = cubify(generated_volume, 0.3)
        # mesh = voxel_to_world(meshes)
        save_mesh = os.path.join(dir1, "%s_%s.obj" % (iid, sampled_idx))
        verts, faces = mesh.get_mesh_verts_faces(0)
        save_obj(save_mesh, verts, faces)
        generated_volume = generated_volume.squeeze()

        img = image_to_numpy(deprocess(imgs[0][0]))
        save_img = os.path.join(dir1, "%02d.png" % (iid))
        # cv2.imwrite(save_img, img[:, :, ::-1])
        cv2.imwrite(save_img, img)
        img1 = image_to_numpy(deprocess(imgs[0][1]))
        save_img1 = os.path.join(dir1, "%02d.png" % (sampled_idx))
        cv2.imwrite(save_img1, img1)
        # cv2.imwrite(save_img1, img1[:, :, ::-1])
        get_volume_views(generated_volume, dir1, iid, sampled_idx)
def __init__(self, vocabulary_size, sos_token, eos_token, pad_token,
             max_string_length=default_eda['string_max_length'],
             attention_size=default_attention['size'],
             embedding_size=default_embedding['size'],
             hidden_size=default_gru['hidden_size'],
             num_layers=default_gru['num_layers'],
             dropout=default_gru['dropout']):
    super().__init__()
    self.max_string_length = max_string_length
    self.attention_size = attention_size
    self.vocabulary_size = vocabulary_size
    self.encoder = Encoder(vocabulary_size, embedding_size, hidden_size,
                           num_layers, dropout)
    self.decoder = Decoder(vocabulary_size, embedding_size, hidden_size,
                           num_layers, dropout, attention_size, pad_token)
    self.sos_token = sos_token
    self.eos_token = eos_token
def __init__(self, num_classes, in_channels=3, backbone='xception',
             pretrained=True, output_stride=16, freeze_bn=False, **_):
    super(DeepLabV3Plus, self).__init__()
    # The original `assert ('xception' or 'resnet' in backbone)` always passed,
    # because `('xception' or ...)` short-circuits to the truthy string.
    assert 'xception' in backbone or 'resnet' in backbone
    self.backbone, low_level_channels = getBackBone(backbone,
                                                    in_channels=in_channels,
                                                    output_stride=output_stride,
                                                    pretrained=pretrained)
    self.ASSP = ASSP(in_channels=2048, output_stride=output_stride)
    self.decoder = Decoder(low_level_channels, num_classes)
    if freeze_bn:
        self.freeze_bn()
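# Instantiation sketch (added): a hypothetical call matching the constructor
# above; pretrained=False avoids downloading backbone weights. The forward
# signature is not shown in this snippet, so the commented call is an assumption.
model = DeepLabV3Plus(num_classes=21, in_channels=3, backbone='xception',
                      pretrained=False, output_stride=16)
# out = model(torch.randn(1, 3, 512, 512))  # assumed forward signature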
def __init__(self, config):
    super().__init__()
    self.encoder = Encoder(config)
    self.decoder = Decoder(config)
    self.bert = Bert(config)
    self.decoder_ae = Decoder(config)
    self.t_len = config.t_len
    self.s_len = config.s_len
    self.pad = config.pad
    self.bos = config.bos
    self.model_size = config.model_size
    self.linear_bert = nn.Linear(768, config.model_size)
    self.linear_out = nn.Linear(config.model_size, config.tgt_vocab_size)
    self.linear_ae = nn.Linear(config.model_size, config.tgt_vocab_size)
def test_train_method(self):
    file_name = 'test/test_data/attention_test.txt'
    fine_tune_model_name = '../models/glove_model_40.pth'
    self.test_data_loader_attention = DataLoaderAttention(file_name=file_name)
    # load_data() was previously called twice; once is enough
    source2index, index2source, target2index, index2target, train_data = \
        self.test_data_loader_attention.load_data()
    EMBEDDING_SIZE = 50
    HIDDEN_SIZE = 32
    encoder = Encoder(len(source2index), EMBEDDING_SIZE, HIDDEN_SIZE, 3, True)
    decoder = Decoder(len(target2index), EMBEDDING_SIZE, HIDDEN_SIZE * 2)
    self.trainer = Trainer(fine_tune_model=fine_tune_model_name)
    self.trainer.train_attention(
        train_data=train_data,
        source2index=source2index,
        target2index=target2index,
        index2source=index2source,
        index2target=index2target,
        encoder_model=encoder,
        decoder_model=decoder,
    )
def __init__(self, config):
    super().__init__()
    self.encoder = Encoder(config, config.src_vocab_size)
    self.decoder = Decoder(config)
    self.bos = config.bos
    self.s_len = config.s_len
    self.linear_out = nn.Linear(config.model_size, config.tgt_vocab_size)
def build_model(config, vocab):
    visual_encoder = VisualEncoder(app_feat=config.vis_encoder.app_feat,
                                   mot_feat=config.vis_encoder.mot_feat,
                                   app_input_size=config.vis_encoder.app_feat_size,
                                   mot_input_size=config.vis_encoder.mot_feat_size,
                                   app_output_size=config.vocab.embedding_size,
                                   mot_output_size=config.vocab.embedding_size)
    phrase_encoder = PhraseEncoder(len_max_seq=config.loader.max_caption_len + 2,
                                   d_word_vec=config.vocab.embedding_size,
                                   n_layers=config.phr_encoder.SA_num_layers,
                                   n_head=config.phr_encoder.SA_num_heads,
                                   d_k=config.phr_encoder.SA_dim_k,
                                   d_v=config.phr_encoder.SA_dim_v,
                                   d_model=config.vocab.embedding_size,
                                   d_inner=config.phr_encoder.SA_dim_inner,
                                   dropout=config.phr_encoder.SA_dropout)
    decoder = Decoder(num_layers=config.decoder.rnn_num_layers,
                      vis_feat_size=2 * config.vocab.embedding_size,
                      feat_len=config.loader.frame_sample_len,
                      embedding_size=config.vocab.embedding_size,
                      sem_align_hidden_size=config.decoder.sem_align_hidden_size,
                      sem_attn_hidden_size=config.decoder.sem_attn_hidden_size,
                      hidden_size=config.decoder.rnn_hidden_size,
                      output_size=vocab.n_vocabs)
    model = SGN(visual_encoder, phrase_encoder, decoder,
                config.loader.max_caption_len, vocab, config.PS_threshold)
    return model
def __init__(self, channels, h_dim, res_h_dim, n_res_layers,
             n_embeddings, embedding_dim, beta, save_img_embedding_map=False):
    super(VQVAE, self).__init__()
    # encode image into continuous latent space
    self.encoder = Encoder(channels, h_dim, n_res_layers, res_h_dim)
    self.pre_quantization_conv = nn.Conv2d(h_dim, embedding_dim,
                                           kernel_size=1, stride=1)
    # pass continuous latent vector through discretization bottleneck
    self.vector_quantization = VectorQuantizer(n_embeddings, embedding_dim, beta)
    # decode the discrete latent representation
    self.decoder = Decoder(channels, embedding_dim, h_dim, n_res_layers, res_h_dim)
    if save_img_embedding_map:
        self.img_to_embedding_map = {i: [] for i in range(n_embeddings)}
    else:
        self.img_to_embedding_map = None
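# Forward-pass sketch (added): the data flow follows the modules wired up in
# __init__ above; the return signature of VectorQuantizer (embedding loss,
# quantized latents, perplexity, ...) is an assumption and may differ.
def _vqvae_forward_sketch(vqvae, x):
    z_e = vqvae.encoder(x)                  # continuous latents
    z_e = vqvae.pre_quantization_conv(z_e)  # project h_dim -> embedding_dim
    embedding_loss, z_q, perplexity, _, _ = vqvae.vector_quantization(z_e)  # assumed signature
    x_hat = vqvae.decoder(z_q)              # reconstruction
    return embedding_loss, x_hat, perplexity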
def __init__(self, args):
    """
    Basic initialization of Transformer.

    Arguments
    ---------
    args: <argparse.Namespace>
        Arguments used for overall process.
    """
    super().__init__()
    self.args = args
    self.num_stacks = self.args.num_stacks
    self.d_model = self.args.d_model
    self.vocab_size = self.args.vocab_size

    self.emb = EmbeddingLayer(self.args)
    encoders = [Encoder(self.args) for _ in range(self.num_stacks)]
    self.encoder_stack = nn.Sequential(*encoders)
    decoders = [Decoder(self.args) for _ in range(self.num_stacks)]
    self.decoder_stack = nn.ModuleList(decoders)
    self.output_linear = nn.Linear(in_features=self.d_model,
                                   out_features=self.vocab_size, bias=False)
    # tie the output projection weights to the input embedding
    self.output_linear.weight = self.emb.embedding_layer.weight
    self.softmax = nn.LogSoftmax(dim=-1)
    self.dropout = nn.Dropout(p=0.1)
def __init__(self, h_dim, res_h_dim, n_res_layers, n_embeddings,
             embedding_dim, beta, restart=True):
    super(VQVAE, self).__init__()
    # encode image into continuous latent space
    self.encoder = Encoder(in_dim=256, h_dim=h_dim,
                           n_res_layers=n_res_layers, res_h_dim=res_h_dim)
    self.pre_quantization_conv = nn.Conv1d(h_dim, embedding_dim,
                                           kernel_size=3, stride=1, padding=1)
    # Define discretization bottleneck
    if not restart:
        self.vector_quantization = VectorQuantizer(n_embeddings, embedding_dim, beta)
    else:
        self.vector_quantization = VectorQuantizerRandomRestart(
            n_embeddings, embedding_dim, beta)
    # decode the discrete latent representation
    self.decoder = Decoder(embedding_dim, h_dim, n_res_layers, res_h_dim)
    # e_indices is used in sampling; only the last batch is kept, representing
    # the most recent latent state
    self.e_indices = None
def __init__(self, enc_in, dec_in, c_out, seq_len, label_len, out_len,
             factor=5, d_model=512, n_heads=8, e_layers=3, d_layers=2,
             d_ff=512, dropout=0.0, attn='prob', embed='fixed', data='ETTh',
             activation='gelu', device=torch.device('cuda:0')):
    super(Informer, self).__init__()
    self.pred_len = out_len
    self.attn = attn

    # Encoding
    self.enc_embedding = DataEmbedding(enc_in, d_model, embed, data, dropout)
    self.dec_embedding = DataEmbedding(dec_in, d_model, embed, data, dropout)
    # Attention
    Attn = ProbAttention if attn == 'prob' else FullAttention
    # Encoder
    self.encoder = Encoder(
        [
            EncoderLayer(
                AttentionLayer(Attn(False, factor, attention_dropout=dropout),
                               d_model, n_heads),
                d_model, d_ff, dropout=dropout, activation=activation)
            for _ in range(e_layers)
        ],
        [ConvLayer(d_model) for _ in range(e_layers - 1)],
        norm_layer=torch.nn.LayerNorm(d_model))
    # Decoder
    self.decoder = Decoder(
        [
            DecoderLayer(
                AttentionLayer(FullAttention(True, factor, attention_dropout=dropout),
                               d_model, n_heads),
                AttentionLayer(FullAttention(False, factor, attention_dropout=dropout),
                               d_model, n_heads),
                d_model, d_ff, dropout=dropout, activation=activation)
            for _ in range(d_layers)
        ],
        norm_layer=torch.nn.LayerNorm(d_model))
    # self.end_conv1 = nn.Conv1d(in_channels=label_len+out_len, out_channels=out_len, kernel_size=1, bias=True)
    # self.end_conv2 = nn.Conv1d(in_channels=d_model, out_channels=c_out, kernel_size=1, bias=True)
    self.projection = nn.Linear(d_model, c_out, bias=True)
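# Instantiation sketch (added): hypothetical small values for a 7-variate
# ETT-style setup. The forward pass (encoder/decoder inputs plus their
# time-feature marks) is not shown above, so no call is attempted here.
model = Informer(enc_in=7, dec_in=7, c_out=7,
                 seq_len=96, label_len=48, out_len=24,
                 d_model=512, n_heads=8, e_layers=3, d_layers=2,
                 device=torch.device('cpu'))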
def __init__(self, config, device):
    super(MedicalFSS, self).__init__()
    self.config = config
    resize_dim = self.config['input_size']
    self.encoded_h = int(resize_dim[0] / 2**self.config['n_pool'])
    self.encoded_w = int(resize_dim[1] / 2**self.config['n_pool'])

    self.s_encoder = SupportEncoder(self.config['path']['init_path'], device)  # .to(device)
    self.q_encoder = QueryEncoder(self.config['path']['init_path'], device)  # .to(device)
    self.ConvBiGRU = ConvBGRU(in_channels=512, hidden_channels=256,
                              kernel_size=(3, 3),
                              num_layers=self.config['n_layer'],
                              device=device).to(device)
    self.decoder = Decoder(input_res=(self.encoded_h, self.encoded_w),
                           output_res=resize_dim).to(device)
    self.q_slice_n = self.config['q_slice']
    self.ch = 256  # number of channels of embedding vector
    self.n_shot = self.config['n_shot']
    self.reversed_idx = list(reversed(range(self.q_slice_n)))
    self.is_attention = self.config['is_attention']
    if self.is_attention:
        self.avgpool3d = nn.AvgPool3d((self.ch * 2, self.encoded_w, self.encoded_h))
        self.softmax = nn.Softmax(dim=1)
def build_model(C, vocab):
    decoder = Decoder(rnn_type=C.decoder.rnn_type,
                      num_layers=C.decoder.rnn_num_layers,
                      num_directions=C.decoder.rnn_num_directions,
                      feat_size=C.feat.size,
                      feat_len=C.loader.frame_sample_len,
                      embedding_size=C.vocab.embedding_size,
                      hidden_size=C.decoder.rnn_hidden_size,
                      attn_size=C.decoder.rnn_attn_size,
                      output_size=vocab.n_vocabs,
                      rnn_dropout=C.decoder.rnn_dropout)
    if C.pretrained_decoder_fpath is not None:
        decoder.load_state_dict(torch.load(C.pretrained_decoder_fpath)['decoder'])
        print("Pretrained decoder is loaded from {}".format(C.pretrained_decoder_fpath))

    # Global and local reconstructors
    if C.reconstructor is None:
        reconstructor = None
    elif C.reconstructor.type == 'global':
        reconstructor = GlobalReconstructor(
            rnn_type=C.reconstructor.rnn_type,
            num_layers=C.reconstructor.rnn_num_layers,
            num_directions=C.reconstructor.rnn_num_directions,
            decoder_size=C.decoder.rnn_hidden_size,
            hidden_size=C.reconstructor.rnn_hidden_size,
            rnn_dropout=C.reconstructor.rnn_dropout)
    else:
        reconstructor = LocalReconstructor(
            rnn_type=C.reconstructor.rnn_type,
            num_layers=C.reconstructor.rnn_num_layers,
            num_directions=C.reconstructor.rnn_num_directions,
            decoder_size=C.decoder.rnn_hidden_size,
            hidden_size=C.reconstructor.rnn_hidden_size,
            attn_size=C.reconstructor.rnn_attn_size,
            rnn_dropout=C.reconstructor.rnn_dropout)
    if C.pretrained_reconstructor_fpath is not None:
        reconstructor.load_state_dict(
            torch.load(C.pretrained_reconstructor_fpath)['reconstructor'])
        print("Pretrained reconstructor is loaded from {}".format(
            C.pretrained_reconstructor_fpath))

    model = CaptionGenerator(decoder, reconstructor, C.loader.max_caption_len, vocab)
    model.cuda()
    return model
def test(encoder=None, decoder=None):
    torch.backends.cudnn.benchmark = True
    _, dataloader = create_dataloader(config.IMG_DIR + "/test",
                                      config.MESH_DIR + "/test",
                                      batch_size=config.BATCH_SIZE,
                                      used_layers=config.USED_LAYERS,
                                      img_size=config.IMAGE_SIZE,
                                      map_size=config.MAP_SIZE,
                                      augment=config.AUGMENT,
                                      workers=config.NUM_WORKERS,
                                      pin_memory=config.PIN_MEMORY,
                                      shuffle=False)
    if not encoder or not decoder:
        in_channels = num_channels(config.USED_LAYERS)
        encoder = Encoder(in_channels=in_channels)
        decoder = Decoder(num_classes=config.NUM_CLASSES + 1)
        encoder = encoder.to(config.DEVICE)
        decoder = decoder.to(config.DEVICE)
        _, encoder, decoder = load_checkpoint(encoder, decoder,
                                              config.CHECKPOINT_FILE,
                                              config.DEVICE)
    loss_fn = LossFunction()
    loop = tqdm(dataloader, leave=True)
    losses = []
    ious = []
    encoder.eval()
    decoder.eval()
    for i, (_, layers, volumes, img_files) in enumerate(loop):
        with torch.no_grad():
            layers = layers.to(config.DEVICE, non_blocking=True)
            volumes = volumes.to(config.DEVICE, non_blocking=True)
            features = encoder(layers)
            predictions = decoder(features)
            loss = loss_fn(predictions, volumes)
            losses.append(loss.item())
            iou = predictions_iou(to_volume(predictions, config.VOXEL_THRESH), volumes)
            ious.append(iou)
            mean_iou = sum(ious) / len(ious)
            mean_loss = sum(losses) / len(losses)
            loop.set_postfix(loss=mean_loss, mean_iou=mean_iou)
            if i == 0 and config.PLOT:
                plot_volumes(to_volume(predictions, config.VOXEL_THRESH).cpu(),
                             img_files, config.NAMES)
                plot_volumes(volumes.cpu(), img_files, config.NAMES)
def __init__(self, d_model, d_ff, d_K, d_V, n_heads, n_layers,
             sourceVocabSize, sourceLength, targetVocabSize, targetLength):
    super(Transformer, self).__init__()
    self.encoder = Encoder(sourceVocabSize, sourceLength, d_model, d_ff,
                           d_K, d_V, n_heads, n_layers)
    self.decoder = Decoder(targetVocabSize, targetLength, d_model, d_ff,
                           d_K, d_V, n_heads, n_layers)
    self.projection = nn.Linear(d_model, targetVocabSize, bias=False)
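# Instantiation sketch (added): toy sizes, all hypothetical; the argument order
# matches the constructor above.
model = Transformer(d_model=512, d_ff=2048, d_K=64, d_V=64, n_heads=8,
                    n_layers=6, sourceVocabSize=8000, sourceLength=50,
                    targetVocabSize=8000, targetLength=50)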
def __init__(self, num_classes, fixed_height=48, net='efficientnet'):
    super(Model, self).__init__()
    self.encoder = Encoder(net=net)
    self.decoder = Decoder(input_dim=int(fixed_height * 288 / 8),
                           num_class=num_classes)
    self.crnn = nn.Sequential(self.encoder, self.decoder)
    self.log_softmax = nn.LogSoftmax(dim=2)
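# Instantiation sketch (added): num_classes is hypothetical. With the default
# fixed_height=48 the decoder input_dim works out to 48 * 288 / 8 = 1728.
model = Model(num_classes=100, fixed_height=48, net='efficientnet')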
def test(test_loader, modelID, showAttn=True):
    encoder = Encoder(HIDDEN_SIZE_ENC, HEIGHT, WIDTH, Bi_GRU, CON_STEP, FLIP).cuda()
    decoder = Decoder(HIDDEN_SIZE_DEC, EMBEDDING_SIZE, vocab_size, Attention,
                      TRADEOFF_CONTEXT_EMBED).cuda()
    seq2seq = Seq2Seq(encoder, decoder, output_max_len, vocab_size).cuda()
    model_file = 'save_weights/seq2seq-' + str(modelID) + '.model'
    pretrain_dict = torch.load(model_file)
    seq2seq_dict = seq2seq.state_dict()
    pretrain_dict = {k: v for k, v in pretrain_dict.items() if k in seq2seq_dict}
    seq2seq_dict.update(pretrain_dict)
    seq2seq.load_state_dict(seq2seq_dict)  # load
    print('Loading ' + model_file)

    seq2seq.eval()
    total_loss_t = 0
    start_t = time.time()
    # Variable(..., volatile=True) and loss.data[0] are pre-0.4 PyTorch idioms;
    # torch.no_grad() and loss.item() are the modern equivalents.
    with torch.no_grad():
        for num, (test_index, test_in, test_in_len, test_out,
                  test_domain) in enumerate(test_loader):
            lambd = LAMBD
            test_in, test_out = test_in.cuda(), test_out.cuda()
            test_domain = test_domain.cuda()
            output_t, attn_weights_t, out_domain_t = seq2seq(test_in, test_out,
                                                             test_in_len, lambd,
                                                             teacher_rate=False,
                                                             train=False)
            batch_count_n = writePredict(modelID, test_index, output_t, 'test')
            test_label = test_out.permute(1, 0)[1:].contiguous().view(-1)
            if LABEL_SMOOTH:
                loss_t = crit(log_softmax(output_t.view(-1, vocab_size)), test_label)
            else:
                loss_t = F.cross_entropy(output_t.view(-1, vocab_size),
                                         test_label,
                                         ignore_index=tokens['PAD_TOKEN'])
            total_loss_t += loss_t.item()

            if showAttn:
                global_index_t = 0
                for t_idx, t_in in zip(test_index, test_in):
                    visualizeAttn(t_in.detach()[0], test_in_len[0],
                                  [j[global_index_t] for j in attn_weights_t],
                                  modelID, batch_count_n[global_index_t],
                                  'test_' + t_idx.split(',')[0])
                    global_index_t += 1

    total_loss_t /= (num + 1)
    writeLoss(total_loss_t, 'test')
    print(' TEST loss=%.3f, time=%.3f' % (total_loss_t, time.time() - start_t))
def train():
    torch.backends.cudnn.benchmark = True
    _, dataloader = create_dataloader(config.IMG_DIR + "/train",
                                      config.MESH_DIR + "/train",
                                      batch_size=config.BATCH_SIZE,
                                      used_layers=config.USED_LAYERS,
                                      img_size=config.IMAGE_SIZE,
                                      map_size=config.MAP_SIZE,
                                      augment=config.AUGMENT,
                                      workers=config.NUM_WORKERS,
                                      pin_memory=config.PIN_MEMORY,
                                      shuffle=True)
    in_channels = num_channels(config.USED_LAYERS)
    encoder = Encoder(in_channels=in_channels)
    decoder = Decoder(num_classes=config.NUM_CLASSES + 1)
    encoder.apply(init_weights)
    decoder.apply(init_weights)
    encoder_solver = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                             encoder.parameters()),
                                      lr=config.ENCODER_LEARNING_RATE,
                                      betas=config.BETAS)
    decoder_solver = torch.optim.Adam(decoder.parameters(),
                                      lr=config.DECODER_LEARNING_RATE,
                                      betas=config.BETAS)
    encoder_lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        encoder_solver, milestones=config.ENCODER_LR_MILESTONES, gamma=config.GAMMA)
    decoder_lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        decoder_solver, milestones=config.DECODER_LR_MILESTONES, gamma=config.GAMMA)
    encoder = encoder.to(config.DEVICE)
    decoder = decoder.to(config.DEVICE)
    loss_fn = LossFunction()
    init_epoch = 0
    if config.CHECKPOINT_FILE and config.LOAD_MODEL:
        init_epoch, encoder, decoder = load_checkpoint(encoder, decoder,
                                                       config.CHECKPOINT_FILE,
                                                       config.DEVICE)
    output_dir = os.path.join(config.OUT_PATH,
                              re.sub("[^0-9a-zA-Z]+", "-", dt.now().isoformat()))
    for epoch_idx in range(init_epoch, config.NUM_EPOCHS):
        encoder.train()
        decoder.train()
        train_one_epoch(encoder, decoder, dataloader, loss_fn,
                        encoder_solver, decoder_solver, epoch_idx)
        encoder_lr_scheduler.step()
        decoder_lr_scheduler.step()
        if config.TEST:
            test(encoder, decoder)
        if config.SAVE_MODEL:
            save_checkpoint(epoch_idx, encoder, decoder, output_dir)
    # run a final evaluation/save if they were skipped during the epoch loop
    if not config.TEST:
        test(encoder, decoder)
    if not config.SAVE_MODEL:
        save_checkpoint(config.NUM_EPOCHS - 1, encoder, decoder, output_dir)
class Quantitative_analysis_demo():
    def __init__(self, cfg, output_dir):
        self.encoder = Encoder(cfg)
        self.decoder = Decoder(cfg)
        self.refiner = Refiner(cfg)
        self.merger = Merger(cfg)
        # self.thresh = cfg.VOXEL_THRESH
        self.th = cfg.TEST.VOXEL_THRESH
        checkpoint = torch.load(cfg.CHECKPOINT)
        encoder_state_dict = clean_state_dict(checkpoint['encoder_state_dict'])
        self.encoder.load_state_dict(encoder_state_dict)
        decoder_state_dict = clean_state_dict(checkpoint['decoder_state_dict'])
        self.decoder.load_state_dict(decoder_state_dict)
        if cfg.NETWORK.USE_REFINER:
            refiner_state_dict = clean_state_dict(checkpoint['refiner_state_dict'])
            self.refiner.load_state_dict(refiner_state_dict)
        if cfg.NETWORK.USE_MERGER:
            merger_state_dict = clean_state_dict(checkpoint['merger_state_dict'])
            self.merger.load_state_dict(merger_state_dict)
        self.output_dir = output_dir

    def calculate_iou(self, imgs, GT_voxels, sid, mid, iid):
        dir1 = os.path.join(self.output_dir, str(sid), str(mid))
        if not os.path.exists(dir1):
            os.makedirs(dir1)
        image_features = self.encoder(imgs)
        raw_features, generated_volume = self.decoder(image_features)
        generated_volume = self.merger(raw_features, generated_volume)
        generated_volume = self.refiner(generated_volume)
        generated_volume = generated_volume.squeeze()
        sample_iou = []
        for th in self.th:
            _volume = torch.ge(generated_volume, th).float()
            intersection = torch.sum(_volume.mul(GT_voxels)).float()
            union = torch.sum(torch.ge(_volume.add(GT_voxels), 1)).float()
            sample_iou.append((intersection / union).item())
        return sample_iou
def main():
    parser = argparse.ArgumentParser(description="Training attention model")
    parser.add_argument("-t", "--train_data",
                        metavar="train_data", type=str,
                        default='../data/processed/source_replay_twitter_data.txt',
                        dest="train_data", help="set the training data")
    parser.add_argument("-e", "--embedding_size",
                        metavar="embedding_size", type=int, default=50,
                        dest="embedding_size", help="set the embedding size")
    parser.add_argument("-H", "--hidden_size",
                        metavar="hidden_size", type=int, default=512,
                        dest="hidden_size", help="set the hidden size")
    parser.add_argument("-f", "--fine_tune_model_name",
                        metavar="fine_tune_model_name", type=str,
                        default='../models/glove_wiki/glove_model_40.pth',
                        dest="fine_tune_model_name",
                        help="set the fine tune model name")
    args = parser.parse_args()

    data_loader_attention = DataLoaderAttention(file_name=args.train_data)
    # load_data() was previously called twice; once is enough
    source2index, index2source, target2index, index2target, train_data = \
        data_loader_attention.load_data()
    EMBEDDING_SIZE = args.embedding_size
    HIDDEN_SIZE = args.hidden_size
    encoder = Encoder(len(source2index), EMBEDDING_SIZE, HIDDEN_SIZE, 3, True)
    decoder = Decoder(len(target2index), EMBEDDING_SIZE, HIDDEN_SIZE * 2)

    trainer = Trainer(epoch=600, batch_size=64,
                      fine_tune_model=args.fine_tune_model_name)
    trainer.train_attention(train_data=train_data,
                            source2index=source2index,
                            target2index=target2index,
                            index2source=index2source,
                            index2target=index2target,
                            encoder_model=encoder,
                            decoder_model=decoder)
def test(test_loader, modelID, showAttn=True):
    encoder = Encoder(HIDDEN_SIZE_ENC, HEIGHT, WIDTH, Bi_GRU, CON_STEP,
                      FLIP).to(device)
    decoder = Decoder(HIDDEN_SIZE_DEC, EMBEDDING_SIZE, vocab_size, Attention,
                      TRADEOFF_CONTEXT_EMBED).to(device)
    seq2seq = Seq2Seq(encoder, decoder, output_max_len, vocab_size).to(device)
    model_file = 'save_weights/seq2seq-' + str(modelID) + '.model'
    print('Loading ' + model_file)
    seq2seq.load_state_dict(torch.load(model_file))  # load
    seq2seq.eval()

    total_loss_t = 0
    start_t = time.time()
    with torch.no_grad():
        for num, (test_index, test_in, test_in_len, test_out) in enumerate(test_loader):
            # test_in = test_in.unsqueeze(1)
            test_in, test_out = test_in.to(device), test_out.to(device)
            if test_in.requires_grad or test_out.requires_grad:
                print('ERROR! test_in, test_out should have requires_grad=False')
            output_t, attn_weights_t = seq2seq(test_in, test_out, test_in_len,
                                               teacher_rate=False, train=False)
            batch_count_n = writePredict(modelID, test_index, output_t, 'test')
            test_label = test_out.permute(1, 0)[1:].reshape(-1)
            # loss_t = F.cross_entropy(output_t.view(-1, vocab_size),
            #                          test_label, ignore_index=tokens['PAD_TOKEN'])
            # loss_t = loss_label_smoothing(output_t.view(-1, vocab_size), test_label)
            if LABEL_SMOOTH:
                loss_t = crit(log_softmax(output_t.reshape(-1, vocab_size)),
                              test_label)
            else:
                loss_t = F.cross_entropy(output_t.reshape(-1, vocab_size),
                                         test_label,
                                         ignore_index=tokens['PAD_TOKEN'])
            total_loss_t += loss_t.item()

            if showAttn:
                global_index_t = 0
                for t_idx, t_in in zip(test_index, test_in):
                    visualizeAttn(t_in.detach()[0], test_in_len[0],
                                  [j[global_index_t] for j in attn_weights_t],
                                  modelID, batch_count_n[global_index_t],
                                  'test_' + t_idx.split(',')[0])
                    global_index_t += 1

    total_loss_t /= (num + 1)
    writeLoss(total_loss_t, 'test')
    print(' TEST loss=%.3f, time=%.3f' % (total_loss_t, time.time() - start_t))
def __init__(self, hparams):
    super().__init__()
    self.hparams = hparams

    # Encoder
    self.encoder = Encoder(ngf=self.hparams.ngf, z_dim=self.hparams.z_dim)
    self.encoder.apply(weights_init)
    device = "cuda" if isinstance(self.hparams.gpus, int) else "cpu"

    # Decoder
    self.decoder = Decoder(ngf=self.hparams.ngf, z_dim=self.hparams.z_dim)
    self.decoder.apply(weights_init)

    # Discriminator
    self.discriminator = Discriminator()
    self.discriminator.apply(weights_init)

    # Losses
    self.criterionFeat = torch.nn.L1Loss()
    self.criterionGAN = GANLoss(gan_mode="lsgan")
    if self.hparams.use_vgg:
        self.criterion_perceptual_style = [Perceptual_Loss(device)]
def build_model(vocab):
    decoder = Decoder(rnn_type=C.decoder.rnn_type,
                      num_layers=C.decoder.rnn_num_layers,
                      num_directions=C.decoder.rnn_num_directions,
                      feat_size=C.feat.size,
                      feat_len=C.loader.frame_sample_len,
                      embedding_size=C.vocab.embedding_size,
                      hidden_size=C.decoder.rnn_hidden_size,
                      attn_size=C.decoder.rnn_attn_size,
                      output_size=vocab.n_vocabs,
                      rnn_dropout=C.decoder.rnn_dropout)
    if C.pretrained_decoder_fpath is not None:
        decoder.load_state_dict(torch.load(C.pretrained_decoder_fpath)['decoder'])
        print("Pretrained decoder is loaded from {}".format(C.pretrained_decoder_fpath))
    model = CaptionGenerator(decoder, C.loader.max_caption_len, vocab)
    model.cuda()
    return model
def __init__(self, cfg_network: DictConfig, cfg_tester: DictConfig):
    super().__init__()
    self.cfg_network = cfg_network
    self.cfg_tester = cfg_tester

    # Enable the inbuilt cudnn auto-tuner to find the best algorithm to use
    torch.backends.cudnn.benchmark = True

    # Set up networks
    self.encoder = Encoder(cfg_network)
    self.decoder = Decoder(cfg_network)
    self.refiner = Refiner(cfg_network)
    self.merger = Merger(cfg_network)

    # Initialize weights of networks
    self.encoder.apply(utils.network_utils.init_weights)
    self.decoder.apply(utils.network_utils.init_weights)
    self.refiner.apply(utils.network_utils.init_weights)
    self.merger.apply(utils.network_utils.init_weights)

    self.bce_loss = nn.BCELoss()
def __init__(self, cfg, output_dir):
    self.encoder = Encoder(cfg)
    self.decoder = Decoder(cfg)
    self.refiner = Refiner(cfg)
    self.merger = Merger(cfg)
    # self.thresh = cfg.VOXEL_THRESH
    self.th = cfg.TEST.VOXEL_THRESH
    checkpoint = torch.load(cfg.CHECKPOINT)
    encoder_state_dict = clean_state_dict(checkpoint['encoder_state_dict'])
    self.encoder.load_state_dict(encoder_state_dict)
    decoder_state_dict = clean_state_dict(checkpoint['decoder_state_dict'])
    self.decoder.load_state_dict(decoder_state_dict)
    if cfg.NETWORK.USE_REFINER:
        refiner_state_dict = clean_state_dict(checkpoint['refiner_state_dict'])
        self.refiner.load_state_dict(refiner_state_dict)
    if cfg.NETWORK.USE_MERGER:
        merger_state_dict = clean_state_dict(checkpoint['merger_state_dict'])
        self.merger.load_state_dict(merger_state_dict)
    self.output_dir = output_dir