def directory_to_sentence_matrix(directory):
    sentence_dict = {}
    for j in range(1, 7):
        filename = directory + "/" + str(j) + ".jpg"

        # Load model checkpoint (reloaded for every image)
        checkpoint = torch.load(
            '/home/yerlan/HackNU/a-PyTorch-Tutorial-to-Image-Captioning/BEST_checkpoint_coco_5_cap_per_img_5_min_word_freq.pth.tar'
        )
        decoder = checkpoint['decoder']
        decoder = decoder.to(device)
        decoder.eval()
        encoder = checkpoint['encoder']
        encoder = encoder.to(device)
        encoder.eval()

        # Load word map (word2ix)
        with open(
                '/home/yerlan/HackNU/a-PyTorch-Tutorial-to-Image-Captioning/WORDMAP_coco_5_cap_per_img_5_min_word_freq.json',
                'r') as t:
            word_map = json.load(t)
        rev_word_map = {v: k for k, v in word_map.items()}  # ix2word

        sentence_array = []
        # Encode, decode with attention and beam search, using beam sizes 1..5
        for i in range(1, 6):
            seq, alphas = caption.caption_image_beam_search(
                encoder, decoder, filename, word_map, i)
            alphas = torch.FloatTensor(alphas)
            # Turn the best sequence into a sentence
            sentence_array.append(
                caption.return_sentence(filename, seq, alphas, rev_word_map))
        sentence_dict[j] = sentence_array
    return sentence_dict
def capting():
    try:
        img_obj = request.files['picture']
    except Exception:
        report(traceback.format_exc())
        logging.exception('Error with image upload')
        return 'Error with image upload', 500

    try:
        beam_arg = request.args['beam_size']
        # beam = request.files['beam_size']
        assert 0 < int(beam_arg) < 10
        beam = int(beam_arg)
    except Exception:
        report(traceback.format_exc())
        logging.exception('Invalid beam input')
        beam = 5  # fall back to the default beam size

    seq, alphas = caption.caption_image_beam_search(encoder,
                                                    decoder,
                                                    img_obj,
                                                    word_map,
                                                    beam_size=beam)
    # seq is a list of word indices
    try:
        words = [rev_word_map[ind] for ind in seq]
    except Exception:
        report(traceback.format_exc())
        return 'cannot get word from seq', 500
    return jsonify(words)
def caption(image_path, args):
    if not os.path.exists(args['model']) or not os.path.exists(args['word_map']):
        print('Pretrained model files not found.\n', args)
        return None

    # Load model
    checkpoint = torch.load(args['model'], map_location=torch.device('cpu'))
    decoder = checkpoint['decoder']
    decoder = decoder.to(device)
    decoder.eval()
    encoder = checkpoint['encoder']
    encoder = encoder.to(device)
    encoder.eval()

    # Load word map (word2idx)
    with open(args['word_map'], 'r') as j:
        word_map = json.load(j)
    # idx2word
    rev_word_map = {v: k for k, v in word_map.items()}

    seq, alphas = caption_image_beam_search(encoder, decoder, image_path,
                                            word_map, args['beam'])

    # Build the caption word by word, skipping <start> and stopping at <end>
    sampled_caption = []
    for ind in seq[1:]:
        word = rev_word_map[ind]
        if word == '<end>':
            break
        sampled_caption.append(word)
    sentence = ' '.join(sampled_caption)
    return sentence
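# Minimal, self-contained sketch (not from the original sources) of the index-to-word
# decoding used in caption() above: skip <start>, stop at <end>. The word map below is
# a made-up toy vocabulary, for illustration only.
def _decode_indices_example():
    word_map = {'<start>': 0, 'a': 1, 'dog': 2, 'runs': 3, '<end>': 4, '<pad>': 5}
    rev_word_map = {v: k for k, v in word_map.items()}
    seq = [0, 1, 2, 3, 4]      # e.g. the output of caption_image_beam_search
    words = []
    for ind in seq[1:]:        # skip <start>
        word = rev_word_map[ind]
        if word == '<end>':    # stop at <end>
            break
        words.append(word)
    return ' '.join(words)     # -> 'a dog runs'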
def getDescription(self, img_path, beam_size=5):
    # Encode, decode with attention and beam search
    seq, alphas = caption_image_beam_search(self.encoder, self.decoder,
                                            img_path, self.word_map,
                                            beam_size)
    alphas = torch.FloatTensor(alphas)
    # Final predicted sentence, as a list of words
    words = [self.rev_word_map[ind] for ind in seq]
    return words
def main():
    print("Initializing...")
    config = Config()

    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    with open(config.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    print("Vocabulary loaded")

    # Build models
    encoder = Encoder().eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = Decoder(vocab_size=len(vocab),
                      use_glove=False,
                      use_bert=config.bert_model,
                      vocab=vocab,
                      device=device,
                      BertTokenizer=tokenizer,
                      BertModel=BertModel)
    encoder = encoder.to(device)
    decoder = decoder.to(device)
    print("Model built")

    encoder_path = config.encoder_path
    decoder_path = config.decoder_path
    encoder.load_state_dict(torch.load(encoder_path), strict=False)
    decoder.load_state_dict(torch.load(decoder_path), strict=False)
    print("Model loaded")

    images = get_val_images(config.validation_path)
    print(f"Length of images: {len(images)}")
    print("Validation file loaded")

    results_data = []
    curr_id = 0
    for index, image_data in enumerate(images):
        try:
            print(f"Index: {index}")
            image_path = config.val_img_path + image_data['file_name']
            image = load_image(image_path, transform=transform)
            image_tensor = image.to(device)
            caption_idx, _ = caption_image_beam_search(encoder=encoder,
                                                       decoder=decoder,
                                                       word_map=vocab.word2idx,
                                                       image=image_tensor,
                                                       device=device)
            print(f"Caption index: {caption_idx}")
        except Exception as e:
            print(e)
def captionGen():
    response = request.files
    image = np.array(Image.open(response['image']))
    seq, alphas = caption_image_beam_search(encoder, decoder, image, word_map, 5)
    words = [rev_word_map[ind] for ind in seq]
    # Join the words, dropping the <start> and <end> tokens
    output = ''
    for i in words[1:-1]:
        output += ' {}'.format(i)
    return output
def predict():
    # beam = None
    try:
        img_obj = request.files['picture']
    except Exception:
        report(traceback.format_exc())
        logging.exception('Error with image upload')
        return 'Error with image upload', 500

    try:
        beam_arg = request.args['beam_size']
        # beam = request.files['beam_size']
        assert 0 < int(beam_arg) < 10
        beam = int(beam_arg)
    except Exception:
        report(traceback.format_exc())
        logging.exception('Invalid beam input')
        beam = 5

    try:
        translate_api = request.args['translate_api']
    except Exception:
        report(traceback.format_exc())
        logging.exception(
            'No translator API specified, using the one in the conf file')

    start = time.time()
    seq, alphas = caption.caption_image_beam_search(encoder,
                                                    decoder,
                                                    img_obj,
                                                    word_map,
                                                    beam_size=beam)
    end = time.time()
    cap_elapse = end - start
    print('caption used', cap_elapse, 'seconds')

    # seq is a list of word indices
    try:
        words = [rev_word_map[ind] for ind in seq]
    except Exception:
        report(traceback.format_exc())
        return 'cannot get word from seq', 500

    # words is a list of strings
    try:
        start = time.time()
        r = translate(words, translate_api)
        end = time.time()
        trans_elapse = end - start
        print('translate used:', trans_elapse)
    except Exception:
        report(traceback.format_exc())
        return 'translate failed', 500

    if r.status_code == 500:
        return 'translation server gave 500'
    return r.text
def run_samples(encoder, decoder, fs, n, path_prefix, word_map, rev_word_map):
    # Pick n random files (np.random.choice samples with replacement by default)
    all_chosen = np.random.choice(len(fs), n)
    for i in all_chosen:
        f = fs[i]
        # Encode, decode with attention and beam search
        seq, alphas = caption_image_beam_search(encoder, decoder, f, word_map, 5)
        alphas = torch.FloatTensor(alphas)
        # Visualize caption and attention of best sequence
        visualize_att(f, seq, alphas, rev_word_map,
                      f'{path_prefix}_{i}_result.png')
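# Illustrative, self-contained sketch (not from the original source): because
# np.random.choice samples with replacement by default, run_samples() above may pick
# the same file more than once; passing replace=False yields n distinct indices.
import numpy as np

def _distinct_sample_example(fs, n):
    idx = np.random.choice(len(fs), n, replace=False)  # n distinct indices into fs
    return [fs[i] for i in idx]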
def hello_world():
    print(request.files['file'])
    file = request.files['file']
    if file:
        filename = secure_filename(file.filename)
        file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
        img_path = './uploads' + '/' + filename  # use the sanitized name the file was saved under
        # print(img_path)
        seq, alphas = caption_image_beam_search(encoder, decoder, img_path,
                                                word_map, 1)
        words = [rev_word_map[ind] for ind in seq]
        # Drop the <start> and <end> tokens and join into a sentence
        words = words[1:]
        words = words[:len(words) - 1]
        words = ' '.join(words)
        print(words)
        with open("./templates/result.html", "w") as f:
            f.write(words)
        return words
def main_image_caption(image_file, beam_size=5, dont_smooth=''):
    """
    Generate an image caption (entry point script).
    :param image_file: URI of a local file, or the URL of an image
    :return: dict with the English caption and its Chinese translation
    """
    logging.info('get a new query\t{}'.format(image_file))
    res = {'en': '', 'zh': ''}

    # Encode, decode with attention and beam search
    seq, alphas = caption.caption_image_beam_search(encoder, decoder, image_file,
                                                    word_map, beam_size)
    alphas = torch.FloatTensor(alphas)

    # Decode the image2text result and translate it into Chinese
    words_l = [rev_word_map[ind] for ind in seq]
    en_words = ' '.join(words_l[1:-1])
    bdt = baidu_translate.BaiDuTranslate()
    zh_word_dict = bdt.translate(en_words, 'en', 'zh')
    zh_words = zh_word_dict['trans_result'][0]['dst']

    res['en'] = en_words
    res['zh'] = zh_words
    logging.info('return a caption\t{}'.format(json.dumps(res, ensure_ascii=False)))
    return res
def infer_caption_by_master(img_path, json_path, model, vocab_path, prediction_path, id2class_path):
    """
    :param img_path:
    :param json_path:
    :param model:
    :param vocab_path:
    :param prediction_path:
    :param id2class_path:
    :return:
    """
    # The model argument is overridden with a hard-coded checkpoint path
    model = '/home/dexter/show_attend_tell/BEST_checkpoint_coco_5_cap_per_img_5_min_word_freq.pth.tar'

    # Load model
    checkpoint = torch.load(model)
    decoder = checkpoint['decoder']
    decoder = decoder.to(device)
    decoder.eval()
    encoder = checkpoint['encoder']
    encoder = encoder.to(device)
    encoder.eval()

    word_map = '/home/dexter/show_attend_tell/caption data/WORDMAP_coco_5_cap_per_img_5_min_word_freq.json'
    # Load word map (word2ix)
    with open(word_map, 'r') as j:
        word_map = json.load(j)
    rev_word_map = {v: k for k, v in word_map.items()}  # ix2word

    annotation_path = json_path
    with open(annotation_path) as json_file:
        data = json.load(json_file)
    images = data['images']

    # Prediction for every class
    prediction = []
    # Prediction split by class
    prediction_class = {}

    img_num = len(images)
    img_gray_num = 0
    for i, img in enumerate(images, 1):
        image_id = img['id']
        path = img_path + img['file_name']
        _, _, sentence = caption.caption_image_beam_search(rev_word_map, encoder, decoder,
                                                           path, word_map, beam_size=3)
        entry = {}
        entry['image_id'] = image_id
        entry['caption'] = sentence
        prediction.append(entry)
        if i % 100 == 0:
            print("Inferred on {}/{} images on test set".format(i, img_num))
    return prediction
def validate(val_loader, encoder, decoder, criterion):
    """
    Performs one epoch's validation.

    :param val_loader: DataLoader for validation data
    :param encoder: encoder model
    :param decoder: decoder model
    :param criterion: loss layer
    :return: BLEU-4 score
    """
    decoder.eval()  # eval mode (no dropout or batchnorm)
    if encoder is not None:
        encoder.eval()

    batch_time = AverageMeter()
    losses = AverageMeter()
    top5accs = AverageMeter()
    start = time.time()

    references = list()  # references (true captions) for calculating BLEU-4 score
    hypotheses = list()  # hypotheses (predictions)

    # Batches
    for i, (img, caps, caplen) in enumerate(val_loader):
        seq, alphas = caption_image_beam_search(encoder, decoder, img, word_map,
                                                beam_size)

        if i % (args.log_step / 10) == 0:
            print('Validation: [{0}/{1}]\t'.format(i, len(val_loader)))

        # References
        # caps = caps[sort_ind]  # because images were sorted in the decoder
        img_caps = caps[0].tolist()
        # img_captions = list(
        #     map(lambda c: [w for w in c if w not in {word_map['<start>'], word_map['<pad>']}],
        #         img_caps))  # remove <start> and pads
        img_captions = list(
            map(
                lambda c: [
                    w for w in c if w not in {
                        word_map['<start>'], word_map['<end>'], word_map['<pad>']
                    }
                ], [img_caps]))  # remove <start>, <end> and pads
        references.append(img_captions)

        # Hypotheses
        hypotheses.append([
            w for w in seq
            if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}
        ])

        assert len(references) == len(hypotheses)

    # Calculate BLEU-4 scores
    bleu4 = corpus_bleu(references, hypotheses, emulate_multibleu=True)
    print('\n * BLEU-4 - {bleu}\n'.format(bleu=bleu4))
    return bleu4
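# Minimal, self-contained sketch (hypothetical tokens, not from the original source) of the
# nesting corpus_bleu expects in validate() above: `references` holds one *list of reference
# token lists* per image, `hypotheses` holds one token list per image. The original code
# compares word indices instead of strings, which works the same way.
from nltk.translate.bleu_score import corpus_bleu

def _bleu_shapes_example():
    references = [[['a', 'dog', 'runs', 'on', 'grass'],
                   ['a', 'dog', 'is', 'running']]]      # one image, two reference captions
    hypotheses = [['a', 'dog', 'runs', 'on', 'grass']]  # one predicted caption per image
    return corpus_bleu(references, hypotheses)          # default weights give BLEU-4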