Code example #1
def inference_coco(encoder_file: str, decoder_file: str, embed_size: int,
                   hidden_size: int, from_cpu: bool) -> None:
    """
    Displays an original image from the COCO test dataset and prints its
    generated caption.

    encoder_file:   Name of the encoder to load.
    decoder_file:   Name of the decoder to load.
    embed_size:     Word embedding size for the encoder.
    hidden_size:    Size of the LSTM hidden layer.
    from_cpu:       Whether the model was saved on CPU.
    """
    # Define transform
    transform_test = transforms.Compose([
        transforms.Resize(256),  # smaller edge of image resized to 256
        transforms.RandomCrop(224),  # get 224x224 crop from random location
        transforms.ToTensor(),  # convert the PIL Image to a tensor
        transforms.Normalize(
            (0.485, 0.456, 0.406),  # normalize image for pre-trained model
            (0.229, 0.224, 0.225))
    ])

    # Device to use for inference
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Create the data loader.
    data_loader = get_loader(transform=transform_test, mode='test')

    # Obtain sample image
    _, image = next(iter(data_loader))

    # The size of the vocabulary.
    vocab_size = len(data_loader.dataset.vocab)

    # Initialize the encoder and decoder, and set each to inference mode.
    encoder = EncoderCNN(embed_size)
    encoder.eval()
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
    decoder.eval()

    # Load the trained weights.
    if from_cpu:
        encoder.load_state_dict(
            torch.load(os.path.join('./models', encoder_file),
                       map_location='cpu'))
        decoder.load_state_dict(
            torch.load(os.path.join('./models', decoder_file),
                       map_location='cpu'))
    else:
        encoder.load_state_dict(
            torch.load(os.path.join('./models', encoder_file)))
        decoder.load_state_dict(
            torch.load(os.path.join('./models', decoder_file)))

    # Move models to GPU if CUDA is available.
    encoder.to(device)
    decoder.to(device)

    get_prediction(encoder, decoder, data_loader, device)
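
For reference, get_prediction is defined elsewhere in that project and is not shown here. A minimal sketch consistent with the call above and the function's docstring (the name is real, but the body below is an assumption, not the project's actual helper):

import matplotlib.pyplot as plt
import numpy as np


def get_prediction(encoder, decoder, data_loader, device):
    """Hypothetical sketch: show one test image and print a generated caption."""
    # The test loader yields (original image, preprocessed tensor) pairs.
    orig_image, image = next(iter(data_loader))
    plt.imshow(np.squeeze(orig_image))
    plt.show()

    # Encode the image and greedily decode a sequence of word ids.
    features = encoder(image.to(device)).unsqueeze(1)
    output = decoder.sample(features)

    # Map word ids back to words, skipping the special tokens.
    vocab = data_loader.dataset.vocab
    words = [vocab.idx2word[idx] for idx in output
             if vocab.idx2word[idx] not in ('<start>', '<end>')]
    print(' '.join(words))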
Code example #2
File: sample.py Project: ZVengin/Image-Captioning
def main(args):
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    vocab = Vocabulary.load_vocab(args['data_dir'])
    args['vocab_size'] = len(vocab)
    encoder = EncoderCNN(args).eval()
    decoder = DecoderRNN(args).eval()  # put the decoder in inference mode too
    encoder.to(device)
    decoder.to(device)

    encoder.load_state_dict(
        torch.load(os.path.join(args['model_dir'], args['encoder_name'])))
    decoder.load_state_dict(
        torch.load(os.path.join(args['model_dir'], args['decoder_name'])))

    test_caption_list = []
    for file_name in os.listdir(
            os.path.join(args['data_dir'], args['image_dir'])):
        if os.path.isfile(
                os.path.join(args['data_dir'], args['image_dir'], file_name)):
            image = load_image(
                os.path.join(args['data_dir'], args['image_dir'], file_name),
                transform)
            image_tensor = image.to(device)
        else:
            continue

        feature = encoder(image_tensor)
        sample_ids = decoder.sample(feature)
        sample_ids = sample_ids[0].cpu().numpy()

        sample_caption = []
        for word_id in sample_ids:
            word = vocab.idx2word[word_id]
            sample_caption.append(word)
            if word == '<end>':
                break

        sentence = ' '.join(sample_caption)
        print(sentence)
        test_caption_list.append((file_name, sentence))


#        image=Image.open(os.path.join(args['data_dir'],args['image_dir'],file_name))
#        plt.imshow(np.asarray(image))

    with open(os.path.join(args['data_dir'], 'test_caption.txt'), 'w') as f:
        for item in test_caption_list:
            f.write('image_name:{} ---- generated_caption:{}\n'.format(
                item[0], item[1]))
            f.write('\n')
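
Many of these snippets call a load_image helper that is not included. In the yunjey/pytorch-tutorial code they appear to be adapted from, it opens the image, resizes it for the ResNet encoder, and adds a batch dimension; a sketch under that assumption:

from PIL import Image


def load_image(image_path, transform=None):
    """Load an image, resize it for the encoder, and add a batch dimension."""
    image = Image.open(image_path).convert('RGB')
    image = image.resize([224, 224], Image.LANCZOS)
    if transform is not None:
        image = transform(image).unsqueeze(0)  # (3, H, W) -> (1, 3, H, W)
    return image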
Code example #3
def main():
    st.title('Image Captioning App')
    st.markdown(STYLE, unsafe_allow_html=True)

    file = st.file_uploader("Upload file", type=["png", "jpg", "jpeg"])
    show_file = st.empty()

    if not file:
        show_file.info("Please upload a file of type: " +
                       ", ".join(["png", "jpg", "jpeg"]))
        return

    content = file.getvalue()

    show_file.image(file)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder_file = 'encoder-5-batch-128-hidden-256-epochs-5.pkl'
    decoder_file = 'decoder-5-batch-128-hidden-256-epochs-5.pkl'

    embed_size = 300
    hidden_size = 256

    vocab_size, word2idx, idx2word = get_vocab()

    encoder = EncoderCNN(embed_size)
    encoder.eval()
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
    decoder.eval()

    encoder.load_state_dict(torch.load(os.path.join('./models', encoder_file)))
    decoder.load_state_dict(torch.load(os.path.join('./models', decoder_file)))

    encoder.to(device)
    decoder.to(device)

    transform_test = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    PIL_image = Image.open(file).convert('RGB')
    orig_image = np.array(PIL_image)
    image = transform_test(PIL_image)
    image = image.to(device).unsqueeze(0)
    features = encoder(image).unsqueeze(1)
    output = decoder.sample(features)

    sentence = clean_sentence(output, idx2word)
    st.info("Generated caption --> " + sentence)

    file.close()
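
clean_sentence is app-specific and not shown. A plausible sketch (assumed, not the app's actual helper) that drops the special tokens and joins the rest:

def clean_sentence(output, idx2word):
    """Hypothetical helper: turn a sequence of word ids into a readable sentence."""
    words = [idx2word[idx] for idx in output]
    words = [w for w in words if w not in ('<start>', '<end>', '<pad>')]
    return ' '.join(words)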
Code example #4
    def getCaption(self,
                   imgs,
                   output_path='',
                   vocab_path='data/vocab.pkl',
                   decoder_path='models/decoder-5-3000.pkl',
                   encoder_path='models/encoder-5-3000.pkl',
                   embed_size=256,
                   hidden_size=512,
                   num_layers=1):
        if (output_path == ''):
            output_path = self.DEFAULT_OUTPUT_PATH
        device = self.device
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])

        # Load vocabulary wrapper
        with open(vocab_path, 'rb') as f:
            vocab = pickle.load(f)

        # Build models
        # eval mode (batchnorm uses moving mean/variance)
        encoder = EncoderCNN(embed_size).eval()
        decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)
        encoder = encoder.to(device)
        decoder = decoder.to(device)

        # Load the trained model parameters
        encoder.load_state_dict(torch.load(encoder_path))
        decoder.load_state_dict(torch.load(decoder_path))

        CAPTIONS = []

        for img in imgs:
            # Prepare an image
            image = self.load_image(img, transform=transform)
            image_tensor = image.to(device)

            # Generate a caption from the image
            feature = encoder(image_tensor)
            sampled_ids = decoder.sample(feature)
            sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

            # Convert word_ids to words
            sampled_caption = []
            for word_id in sampled_ids:
                word = vocab.idx2word[word_id]
                sampled_caption.append(word)
                if word == '<end>':
                    break
            sentence = ' '.join(sampled_caption)

            # Print out the image and the generated caption
            CAPTIONS.append(self.prune_caption(sentence))

        json_captions = self.writeJSON(imgs, CAPTIONS, output_path=output_path)

        return json_captions
Code example #5
File: test.py Project: qiz2/dl_project
def main(args):
    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))
    
    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab, 
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers) 

    total_step = len(data_loader)

    # List to store the BLEU scores
    bleu_scores = []

    for i, (images, captions, lengths) in enumerate(data_loader):
        
        # Set mini-batch dataset
        images = images.to(device)
        # captions = captions.to(device)
        
        # Generate a caption from the image
        feature = encoder(images)
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids[0].cpu().numpy()

        # Convert word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)

        score = sentence_bleu(captions, sentence, args.bleu_weights)
        bleu_scores.append(score)

        # Print log info
        if i % args.log_step == 0:
            print('Finish [{}/{}], Current BLEU Score: {:.4f}'
                  .format(i, total_step, np.mean(bleu_scores)))

    np.save('test_results.npy', [bleu_scores, np.mean(bleu_scores)])
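
A caveat on the BLEU computation above: NLTK's sentence_bleu expects a list of tokenized reference sentences plus a tokenized hypothesis, while the loop passes a tensor of word ids and a raw string (NLTK would treat the string as a sequence of characters). A hedged sketch of the intended call, where decode_ids is a hypothetical helper for turning the reference id sequences back into tokens:

from nltk.translate.bleu_score import sentence_bleu


def decode_ids(ids, vocab):
    """Hypothetical helper: map word ids to tokens, stopping at <end>."""
    tokens = []
    for idx in ids:
        word = vocab.idx2word[int(idx)]
        if word == '<end>':
            break
        tokens.append(word)
    return tokens


references = [decode_ids(cap, vocab) for cap in captions]   # tokenized references
score = sentence_bleu(references, sampled_caption, weights=args.bleu_weights)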
Code example #6
File: sample.py Project: congphase/cv-nlp-end-term
def run_inference(image_path,
                  encoder_path,
                  decoder_path,
                  vocab_path,
                  embed_size=256,
                  hidden_size=512,
                  num_layers=1):
    print('sample.py running ...')
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(vocab_path, 'rb') as f:
        print("using " + vocab_path)
        vocab = pickle.load(f)

    # Build models
    # eval mode (batchnorm uses moving mean/variance)
    encoder = EncoderCNN(embed_size).eval()
    decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(
        torch.load(encoder_path, map_location=torch.device('cpu')))
    decoder.load_state_dict(
        torch.load(decoder_path, map_location=torch.device('cpu')))

    # Prepare an image
    image = load_image(image_path, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        print(word)
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption).replace('<start>', '')
    sentence = sentence.replace('<end>', '')
    sentence = sentence.replace('_', ' ')

    # Print out the image and the generated caption
    print(sentence)

    print('debug: done running')
    return sentence.strip().capitalize()
Code example #7
    def get_caption(self, img_tensor):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(device)
        print("running")

        # Models
        encoder_file = 'legit_model/encoder_1.pkl'
        decoder_file = 'legit_model/decoder_1.pkl'

        # Embed and hidden
        embed_size = 512
        hidden_size = 512

        # The size of the vocabulary.
        vocab_size = 8856

        # Initialize the encoder and decoder, and set each to inference mode.
        encoder = EncoderCNN(embed_size)
        encoder.eval()

        decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
        decoder.eval()

        # Load the trained weights.
        encoder.load_state_dict(
            torch.load(os.path.join('./models', encoder_file)))
        decoder.load_state_dict(
            torch.load(os.path.join('./models', decoder_file)))

        # Move models to GPU if CUDA is available.
        encoder.to(device)
        decoder.to(device)

        img_d = img_tensor.to(device)

        # Obtain the embedded image features.
        features = encoder(img_d).unsqueeze(1)

        # Pass the embedded image features through the model to get a predicted caption.
        img_output = decoder.sample(features)

        sentence = self.clean_sentence(img_output)

        return sentence
Code example #8
def main(args):
    # Image preprocessing
    predictions = []
    transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    dirname=''

    fnames = listdir(os.getcwd())
    #with open(dirname)
    for fname in fnames:
        
        #print(fname)
        # Prepare an image
        image = load_image(os.path.join(dirname, fname), transform)
        image_tensor = image.to(device)
        
        # Generate a caption from the image
        feature = encoder(image_tensor)
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids[0].cpu().numpy()          # (1, max_seq_length) -> (max_seq_length)
        
        # Convert word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)
        current_pred = [fname, sentence]
        predictions.append(current_pred)
        # Print out the image and the generated caption
        print(fname)
        print(sentence)
        #image = Image.open(args.image)
        #plt.imshow(np.asarray(image))

    df = pd.DataFrame(predictions, columns=['File Name', 'Caption'])
    df.to_excel('output.xlsx')
Code example #9
File: eval.py Project: willyandan/tg-neuraltalk
class Neuraltalk2:

  def __init__(self):
    print("Defining I.A")
    # Device configuration
    self.device = torch.device('cpu')

    #vars
    embed_size = 256
    hidden_size = 512
    num_layers = 1
    encoder_path = 'models/encoder-5-3000.pkl'
    decoder_path = 'models/decoder-5-3000.pkl'
    vocab_path = 'data/vocab.pkl'

    # Image preprocessing
    self.transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    with open(vocab_path, 'rb') as f:
      self.vocab = pickle.load(f)

    print("Building Model")
    # Build models
    self.encoder = EncoderCNN(embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    self.decoder = DecoderRNN(embed_size, hidden_size, len(self.vocab), num_layers)
    self.encoder = self.encoder.to(self.device)
    self.decoder = self.decoder.to(self.device)

    print("loading checkpoint")
    # Load the trained model parameters
    self.encoder.load_state_dict(torch.load(encoder_path))
    self.decoder.load_state_dict(torch.load(decoder_path))

  def eval_image(self, image_path):
    # Prepare an image
    image = load_image(image_path, self.transform)
    image_tensor = image.to(self.device)
    
    # Generate a caption from the image
    feature = self.encoder(image_tensor)
    sampled_ids = self.decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()
    
    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
      word = self.vocab.idx2word[word_id]
      if word == '<end>':
        break
      if word == '<start>':
        continue
      sampled_caption.append(word)
        
    sentence = ' '.join(sampled_caption)
    return sentence
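
Nearly every snippet here unpickles a vocab object and uses vocab.idx2word and len(vocab). A minimal sketch of the Vocabulary wrapper this implies (modeled on the yunjey/pytorch-tutorial class; treat the details as assumptions):

class Vocabulary:
    """Simple word <-> index wrapper."""

    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __call__(self, word):
        # Unknown words map to the <unk> token.
        if word not in self.word2idx:
            return self.word2idx['<unk>']
        return self.word2idx[word]

    def __len__(self):
        return len(self.word2idx)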
Code example #10
def evaluate(encoder_model_path, decoder_model_path):
    transformation = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    VAL_LOADER_UNIQUE = {
        'root': config.VAL_IMG_PATH,
        'json': config.VAL_JSON_PATH,
        'batch_size': 16,
        'shuffle': False,
        'transform': transformation,
        'num_workers': 4
    }
    val_loader_unique = get_loader_unique(**VAL_LOADER_UNIQUE)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    encoder = CNNfull(2048)
    encoder.to(device)
    decoder = DecoderRNN(2048, 300, 512, vocab_size)
    decoder.to(device)
    encoder.load_state_dict(torch.load(encoder_model_path))
    decoder.load_state_dict(torch.load(decoder_model_path))
    encoder.eval()
    decoder.eval()

    bleu2, bleu3, bleu4, meteor = val_epoch(val_loader_unique,
                                            device,
                                            encoder,
                                            decoder,
                                            vocab,
                                            0,
                                            enc_scheduler=None,
                                            dec_scheduler=None,
                                            view_val_captions=False)
    print(f'Bleu2 score:{bleu2}')
    print(f'Bleu3 score:{bleu3}')
    print(f'Bleu4 score:{bleu4}')
    print(f'Meteor score:{meteor}')
Code example #11
    def predict(self, args):
        print('predict..start')
        device = torch.device('cpu')

        # Image preprocessing
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])

        # Load vocabulary wrapper
        #vocab = Vocabulary()

        with open(args.vocab_path, 'rb') as f:
            vocab = pickle.load(f)

        # Build models
        # eval mode (batchnorm uses moving mean/variance)
        encoder = EncoderCNN(args.embed_size).eval()
        decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                             args.num_layers)
        encoder = encoder.to(device)
        decoder = decoder.to(device)

        # Load the trained model parameters
        encoder.load_state_dict(
            torch.load(args.encoder_path, map_location=device))
        decoder.load_state_dict(
            torch.load(args.decoder_path, map_location=device))

        # Prepare an image
        image = self.load_image(args.image, transform)
        image_tensor = image.to(device)

        # Generate a caption from the image
        feature = encoder(image_tensor)
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

        # Convert word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)

        # Print out the image and the generated caption
        print(sentence)
        return sentence
Code example #12
def sample(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    # eval mode (batchnorm uses moving mean/variance)
    encoder = EncoderCNN(args.embed_size).eval()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    dissection.replace_layers(encoder, [
        ('resnet.7.2.bn3', 'final_layer'),
    ])
    vec = torch.zeros(2048).to(device)
    vec[0] = 100
    encoder.replacement['final_layer'] = vec

    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare an image
    image = load_image(args.image, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out the image and the generated caption
    print(sentence)
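
The 'resnet.7.2.bn3' layer being replaced above is the last batch norm of the final bottleneck block in a torchvision ResNet, which suggests an encoder like the one below: a pretrained ResNet with its classification head swapped for a linear embedding (a sketch modeled on the yunjey/pytorch-tutorial EncoderCNN; the details are assumptions):

import torch
import torch.nn as nn
import torchvision.models as models


class EncoderCNN(nn.Module):
    """Pretrained ResNet feature extractor with a learned embedding layer."""

    def __init__(self, embed_size):
        super().__init__()
        resnet = models.resnet152(pretrained=True)
        modules = list(resnet.children())[:-1]   # drop the final fc layer
        self.resnet = nn.Sequential(*modules)
        self.linear = nn.Linear(resnet.fc.in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)

    def forward(self, images):
        with torch.no_grad():                    # keep the backbone frozen
            features = self.resnet(images)       # (N, 2048, 1, 1)
        features = features.reshape(features.size(0), -1)
        return self.bn(self.linear(features))    # (N, embed_size)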
Code example #13
def main(cfg):

    # print(cfg.pretty())
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(cfg.image.mean, cfg.image.std)
    ])
    print(hydra.utils.to_absolute_path(cfg.train.vocab_path))
    with open(hydra.utils.to_absolute_path(cfg.train.vocab_path), 'rb') as f:
        vocab = pickle.load(f)

    # Build the models
    encoder = EncoderCNN(cfg.train.embed_size).eval()
    decoder = DecoderRNN(cfg.train.embed_size, cfg.train.hidden_size,
                         len(vocab), cfg.train.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(
        torch.load(hydra.utils.to_absolute_path(cfg.train.encoder_path)))
    decoder.load_state_dict(
        torch.load(hydra.utils.to_absolute_path(cfg.train.decoder_path)))

    with open(json_dir, encoding='utf-8') as f:
        data = json.loads(f.read())

        for key in data['images']:
            img_file_name = key['file_name']
            img_file_path = base_dir + '/data/val2014/' + img_file_name

            # Prepare an image
            image = load_image(hydra.utils.to_absolute_path(img_file_path),
                               transform)
            image_tensor = image.to(device)

            # Generate a caption from the input image
            feature = encoder(image_tensor)
            sampled_ids = decoder.sample(feature)
            sampled_ids = sampled_ids[0].cpu().numpy()

            # Convert word_ids to words
            sampled_caption = []
            for word_id in sampled_ids:
                word = vocab.idx2word[word_id]
                sampled_caption.append(word)
                if word == '<end>':
                    break
            # Strip <start>, <end>, and the trailing '.' before joining
            sentence = ' '.join(sampled_caption[1:-2])

            print(sentence)
Code example #14
File: sample.py Project: daydreamt/arcelormittal
def main2(image,
          encoder_path='models/encoder-5-3000.pkl',
          decoder_path='models/decoder-5-3000.pkl',
          vocab_path="data/vocab.pkl",
          embed_size=256,
          hidden_size=512,
          num_layers=1):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    # eval mode (batchnorm uses moving mean/variance)
    encoder = EncoderCNN(embed_size).eval()
    decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(encoder_path))
    decoder.load_state_dict(torch.load(decoder_path))

    # Prepare an image
    image = load_image(image, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out the image and the generated caption
    print(sentence)
    #image = Image.open(args.image)
    #plt.imshow(np.asarray(image))
    return sentence
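
Every example relies on decoder.sample(feature) for greedy decoding. In the tutorial codebase these snippets follow, it looks roughly like this method of DecoderRNN (a sketch; self.max_seg_length, self.lstm, self.linear, and self.embed are assumed to be set in __init__):

import torch


def sample(self, features, states=None):
    """Greedy search: feed each argmax prediction back in as the next input."""
    sampled_ids = []
    inputs = features.unsqueeze(1)                    # (N, 1, embed_size)
    for _ in range(self.max_seg_length):
        hiddens, states = self.lstm(inputs, states)   # LSTM step: (N, 1, hidden_size)
        outputs = self.linear(hiddens.squeeze(1))     # word scores: (N, vocab_size)
        _, predicted = outputs.max(1)                 # most likely word id: (N,)
        sampled_ids.append(predicted)
        inputs = self.embed(predicted).unsqueeze(1)   # feed the prediction back in
    return torch.stack(sampled_ids, 1)                # (N, max_seg_length)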
Code example #15
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    # eval mode (batchnorm uses moving mean/variance)
    encoder = EncoderCNN(args.embed_size).eval()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare an image
    image = load_image(args.image, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    # Strip the special tokens if present; list.remove() raises if they are missing.
    for token in ('<start>', '<end>'):
        if token in sampled_caption:
            sampled_caption.remove(token)
    sentence = ' '.join(sampled_caption)
    # Print out the image and the generated caption
    print(sentence)
    with open("demofile3.txt", "w") as f:  # "w" mode already truncates the file
        f.write(sentence)
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
Code example #16
def main(args):

    # Build models
    # eval mode (batchnorm uses moving mean/variance)
    encoder = EncoderCNN(args.embed_size).eval()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))  # path to the dumped encoder weights
    decoder.load_state_dict(torch.load(args.decoder_path))  # path to the dumped decoder weights

    test_data = generate_training_data(5)
    textures_test = generate_textures(test_data)
    transforms_test = generate_transforms(test_data)
    for i in range(len(textures_test)):
        plt.imsave('predictions/texture4_0%i.png' % i,
                   textures_test[i],
                   cmap="gray")

    print(transforms_test)
    predicted_progs = []

    for texture in textures_test:
        texture = torch.tensor(texture, device=device)
        # EncoderCNN expects (N, C, H, W), so unsqueeze twice to add the
        # channel and batch dimensions.
        texture = texture.unsqueeze(0).unsqueeze(0)
        feature = encoder(texture)
        sampled_seq = decoder.sample(feature)
        sampled_seq = sampled_seq[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

        # Convert sampled sequence of transforms to words
        prog = []
        for int_word in sampled_seq:
            word = int_to_word(int_word)
            prog.append(word)
            if word == '<end>':
                break
        trans_seq = '-->'.join(prog)
        predicted_progs.append([trans_seq])

    # Print out the sequence of generated transform sequences
    print(predicted_progs)
Code example #17
def check_decoder(features, captions):
	# Specify the number of features in the hidden state of the RNN decoder.
	hidden_size = 512

	# Initialize the decoder.
	decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

	# Move the decoder to GPU if CUDA is available.
	decoder.to(device)
	    
	# Move last batch of captions (from Step 1) to GPU if CUDA is available 
	captions = captions.to(device)

	# Pass the encoder output and captions through the decoder.
	outputs = decoder(features, captions)

	print('type(outputs):', type(outputs))
	print('outputs.shape:', outputs.shape)

	# Check that your decoder satisfies some requirements of the project! :D
	assert type(outputs)==torch.Tensor, "Decoder output needs to be a PyTorch Tensor."
	assert (outputs.shape[0]==batch_size) & (outputs.shape[1]==captions.shape[1]) & (outputs.shape[2]==vocab_size), "The shape of the decoder output is incorrect."

	return outputs
Code example #18
File: operation.py Project: HJMengx/Image_caption
def get_model(device,vocab_size):
    # model weights file
    encoder_file = "models/encoder-3.pkl" 
    decoder_file = "models/decoder-3.pkl"

    embed_size = 512
    hidden_size = 512

    # Initialize the encoder and decoder, and set each to inference mode.
    encoder = EncoderCNN(embed_size)
    encoder.eval()
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
    decoder.eval()

    # Load the trained weights.
    #print(torch.load(encoder_file))
    encoder.load_state_dict(torch.load(encoder_file))
    decoder.load_state_dict(torch.load(decoder_file))

    # Move models to GPU if CUDA is available.
    encoder.to(device)
    decoder.to(device)

    return encoder,decoder
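
A caller would use it along these lines (the device setup and vocabulary size mirror code example #7; the vocabulary size must match the one the weights were trained with):

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vocab_size = 8856  # must match the training vocabulary
encoder, decoder = get_model(device, vocab_size)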
Code example #19
File: sample.py Project: dhruvramani/image-caption
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare an image
    '''
    image = load_image(args.image, transform)
    image_tensor = image.to(device)
    '''
    data_loader, _ = get_loader(transforms=False)
    inp, targets = next(iter(data_loader))
    audio = inp_transform(inp)
    audio = audio.to(device)
    
    # Generate a caption from the image
    feature = encoder(audio)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()          # (1, max_seq_length) -> (max_seq_length)
    
    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)
    
    # Print out the image and the generated caption
    print("Logits : {}\nTarget : {}".format(sentence, targets))
Code example #20
File: fpgui.py Project: vshantam/ImageCaptioning
        def generatecaption(image):
            # Image preprocessing
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.485, 0.456, 0.406),
                                     (0.229, 0.224, 0.225))])

            # Load vocabulary wrapper
            with open('/root/ImageCaptioning/data/vocab.pkl', 'rb') as f:
                vocab = pickle.load(f)

            # Build models
            encoder = EncoderCNN(256).eval()  # eval mode (batchnorm uses moving mean/variance)
            decoder = DecoderRNN(256, 512, len(vocab), 1)
            encoder = encoder.to(device)
            decoder = decoder.to(device)

            # Load the trained model parameters
            encoder.load_state_dict(torch.load('models/encoder-5-3000.pkl', map_location='cpu'))
            decoder.load_state_dict(torch.load('models/decoder-5-3000.pkl', map_location='cpu'))

            encoder.eval()
            decoder.eval()
            # Prepare an image
            image = load_image(image, transform)
            image_tensor = image.to(device)

            # Generate a caption from the image
            feature = encoder(image_tensor)
            sampled_ids = decoder.sample(feature)
            sampled_ids = sampled_ids[0].cpu().numpy()          # (1, max_seq_length) -> (max_seq_length)

            # Convert word_ids to words
            sampled_caption = []
            for word_id in sampled_ids:
                word = vocab.idx2word[word_id]
                sampled_caption.append(word)
                if word == '<end>':
                    break
            self.sentence = ' '.join(sampled_caption)

            # Show the generated caption in the GUI entry, stripping the
            # '<start> ' prefix and ' <end>' suffix.
            self.Entry1.delete(0, END)
            self.Entry1.insert(0, self.sentence[7:-5])
Code example #21
def img2txt(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args['vocab_path'], 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    # eval mode (batchnorm uses moving mean/variance)
    encoder = EncoderCNN(args['embed_size']).eval()
    decoder = DecoderRNN(args['embed_size'], args['hidden_size'], len(vocab),
                         args['num_layers'])
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args['encoder_path']))
    decoder.load_state_dict(torch.load(args['decoder_path']))

    # Prepare an image
    image = load_image(args['image'], transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    # str.lstrip/rstrip strip character sets, not substrings, so remove the
    # special tokens explicitly instead.
    sentence = ' '.join(sampled_caption)
    sentence = sentence.replace('<start>', '', 1).replace('<end>', '', 1).strip()

    # Print out the image and the generated caption
    print(sentence)
    return f'[Caption for this image]:\n{sentence}\n{translate(sentence)}'
Code example #22
def main(cfg):
    # print(cfg.pretty())
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(cfg.image.mean, cfg.image.std)
    ])
    print(hydra.utils.to_absolute_path(cfg.train.vocab_path))
    with open(hydra.utils.to_absolute_path(cfg.train.vocab_path), 'rb') as f:
        vocab = pickle.load(f)

    # Build the models
    encoder = EncoderCNN(cfg.train.embed_size).eval()
    decoder = DecoderRNN(cfg.train.embed_size, cfg.train.hidden_size,
                         len(vocab), cfg.train.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(
        torch.load(hydra.utils.to_absolute_path(cfg.train.encoder_path)))
    decoder.load_state_dict(
        torch.load(hydra.utils.to_absolute_path(cfg.train.decoder_path)))

    # Prepare an image
    image = load_image(hydra.utils.to_absolute_path(cfg.sample.image_path),
                       transform)
    image_tensor = image.to(device)

    # Generate a caption from the input image
    feature = encoder(image_tensor)
    # sampled_ids = decoder.sample(feature)
    sampled_ids = decoder.greedy_decode(features=feature)
    sampled_ids = sampled_ids[0].cpu().numpy()

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    print(sentence)
    image = Image.open(hydra.utils.to_absolute_path(cfg.sample.image_path))
    plt.imshow(np.asarray(image))
Code example #23
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models (eval mode: batchnorm uses moving mean/variance)
    encoder = EncoderCNN(args.embed_size).eval()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare an image
    image = load_image(args.image, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out the image and the generated caption
    print(sentence)
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
Code example #24
def main(args):

    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    # eval mode (batchnorm uses moving mean/variance)
    encoder = EncoderCNN(args.embed_size).eval()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    images = glob.glob('../data/flickr/train/*.jpg')
    for i in images:
        # Prepare an image
        image = load_image(i, transform)
        image_tensor = image.to(device)

        # Generate a caption from the image
        feature = encoder(image_tensor)
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

        # Convert word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)

        print("{} : {}".format(i[21:], sentence))
Code example #25
def get_text_caption(image):

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    # eval() is called after the weights are loaded below
    encoder = EncoderCNN(args.embed_size, args.model_type, args.mode)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(
        torch.load(args.model_path + "_" + args.model_type + "/encoder.pt"))
    encoder.eval()
    decoder.load_state_dict(
        torch.load(args.model_path + "_" + args.model_type + "/decoder.pt"))
    decoder.eval()

    # Prepare an image
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)

    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)
    print(sampled_ids)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    return (sentence.split("<start> ")[1].split(" <end>")[0]
            [:-2].capitalize().replace(" , ", ", "))
Code example #26
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare an image
    image = load_image(args.image, transform)
    image_tensor = image.to(device)
    
    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()          # (1, max_seq_length) -> (max_seq_length)
    
    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)
    
    # Print out the image and the generated caption
    print(sentence)
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
Code example #27
def main(args):
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    #load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    #build models
    encoder = EncoderCNN(args.embed_size).eval()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    #load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    #prepare an image
    image = load_image(args.image, transform)
    image_tensor = image.to(device)

    #generate a caption from image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()

    #convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    #print image and generated caption
    print(sentence)
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
Code example #28
def predict(path):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
    with open('data/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(300).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(300, 512, len(vocab), 1)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load('models/encoder-5-3000.ckpt'))
    decoder.load_state_dict(torch.load('models/decoder-5-3000.ckpt'))

    # Prepare an image
    image = load_image(path, transform)
    image_tensor = image.to(device) # 1,3,w,h
    
    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()          # (1, max_seq_length) -> (max_seq_length)
    
    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption[1:-1])
    print(sentence)
    return sentence
Code example #29
File: sample.py Project: yazici/Animation2Code
def main(args):
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    # encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers, args.max_length)
    # encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    # encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare an image
    image = torch.from_numpy(
        np.load(args.image_dir + '/' + args.image + '.npy'))
    image_tensor = image.to(device)

    # Generate a caption from the image
    # feature = encoder(image_tensor)
    print(image_tensor.shape)
    sampled_ids = decoder.sample(image_tensor)
    sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out the image and the generated caption
    print(sentence)
Code example #30
    def __init__(self,
                 encoder_path,
                 decoder_path,
                 vocab_path,
                 general_predictor_path,
                 embed_size=256,
                 hidden_size=512,
                 num_layers=1,
                 num_general_categories=3):
        # load vocabulary
        with open(vocab_path, 'rb') as f:
            vocab = pickle.load(f)
        # check device
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Build models
        # eval mode (batch-norm uses moving mean/variance)
        encoder = EncoderCNN(embed_size).eval()
        decoder = DecoderRNN(embed_size, hidden_size, len(vocab),
                             num_layers).eval()
        general_categ_predictor = CategoryPredictor(
            embed_size, num_general_categories).eval()

        encoder = encoder.to(device)
        decoder = decoder.to(device)
        general_categ_predictor = general_categ_predictor.to(device)

        # load model parameters
        encoder.load_state_dict(torch.load(encoder_path, map_location=device))
        decoder.load_state_dict(torch.load(decoder_path, map_location=device))
        general_categ_predictor.load_state_dict(
            torch.load(general_predictor_path, map_location=device))

        self.device = device
        self.vocab = vocab
        self.encoder = encoder
        self.decoder = decoder
        self.general_categ_predictor = general_categ_predictor
Code example #31
def main(args):
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    encoder = EncoderCNN(args.embed_size).eval()
    decoder = DecoderRNN(len(vocab), args.embed_size, args.hidden_size,
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    image = load_image(args.image, transform)
    image_tensor = image.to(device)

    feature = encoder(image_tensor)
    sample_ids = decoder.sample(feature)
    sample_ids = sample_ids[0].cpu().numpy()

    sample_caption = []
    for word_id in sample_ids:
        word = vocab.idx2word[word_id]
        sample_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sample_caption)

    print(sentence)
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
Code example #32
File: training.py Project: zafartahirov/Coursera
# The snippet begins mid-call; the data loader was presumably built like this
# (transform_train and mode='train' are assumptions):
data_loader = get_loader(transform=transform_train,
                         mode='train',
                         batch_size=batch_size,
                         vocab_threshold=vocab_threshold,
                         vocab_from_file=vocab_from_file,
                         cocoapi_loc=COCOPATH)

# The size of the vocabulary.
vocab_size = len(data_loader.dataset.vocab)

# Initialize the encoder and decoder. 
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

# Move models to GPU if CUDA is available. 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
decoder.to(device)

# Define the loss function. 
criterion = nn.CrossEntropyLoss().cuda() if torch.cuda.is_available() else nn.CrossEntropyLoss()

# TODO #3: Specify the learnable parameters of the model.
params = list(decoder.parameters()) +\
         list(encoder.embed.parameters())  # We don't want to retrain the resnet

# TODO #4: Define the optimizer.
optimizer = torch.optim.RMSprop(params)

# Set the total number of training steps per epoch.
total_step = math.ceil(len(data_loader.dataset.caption_lengths) / data_loader.batch_sampler.batch_size)

################################################################
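
The snippet stops after computing total_step. A typical training step for this encoder/decoder pair, consistent with the loss and optimizer defined above (a sketch, not the notebook's exact loop; num_epochs is assumed):

for epoch in range(1, num_epochs + 1):
    for step in range(1, total_step + 1):
        # Fetch a batch of preprocessed images and tokenized captions.
        images, captions = next(iter(data_loader))
        images, captions = images.to(device), captions.to(device)

        # Forward pass: image features -> word scores for every position.
        features = encoder(images)
        outputs = decoder(features, captions)

        # Cross-entropy over the flattened (batch * seq_len, vocab) scores.
        loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))

        # Backward pass and parameter update.
        decoder.zero_grad()
        encoder.zero_grad()
        loss.backward()
        optimizer.step()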