def evaluate_model(epoch_size, num_epochs, device):
    print("Starting evaluate_model(epoch_size=%d, num_epochs=%d, device=%s)" %
          (epoch_size, num_epochs, device))
    start = time.time()
    vgg16 = load_face_model(
        "../../caffemodel2pytorch/gender.caffemodel.pt").to(device)
    img_folder = "../../imdb_crop"
    mat = scipy.io.loadmat("../../imdb/imdb.mat")
    genders = mat['imdb'][0][0][3][0]
    full_paths = mat['imdb'][0][0][2][0]
    print("There are %d data points in total" % len(genders))
    path_idx = 0
    num_correct = 0
    total = 0
    for epoch in range(num_epochs):
        if path_idx >= len(genders):
            print("Stopping at epoch %d since no more data points" % epoch)
            break
        epoch_start = time.time()
        print("Starting epoch %d" % epoch)
        img_batch = []
        genders_batch = []
        while len(img_batch) < epoch_size:
            if path_idx >= len(genders):
                break
            path = os.path.join(img_folder, full_paths[path_idx][0])
            img = Image.open(path)
            # skip grayscale images and samples whose gender label is NaN
            if img.mode == "L" or math.isnan(genders[path_idx]):
                path_idx += 1
                continue
            tensor = preprocess(img)
            img_batch.append(tensor.unsqueeze(0))
            genders_batch.append(int(round(genders[path_idx])))
            path_idx += 1
        # the Caffe-converted model expects pixel values in [0, 255]
        img_batch = torch.cat(img_batch, dim=0).to(device) * 255.
        genders_batch = torch.tensor(genders_batch, device=device,
                                     dtype=torch.int64)
        probs = torch.nn.functional.softmax(vgg16(img_batch), dim=1)
        preds = probs.argmax(dim=1)
        num_correct += torch.sum(torch.eq(preds, genders_batch)).item()
        total += preds.size(0)
        epoch_end = time.time()
        print("Epoch %d took %s" %
              (epoch, utils.sec2str(int(epoch_end - epoch_start))))
        print("acc: ", float(num_correct) / float(total))
    end = time.time()
    print(utils.sec2str(int(end - start)))
def get_features(epoch_size, num_epochs, vgg16, device):
    print("Starting get_features(epoch_size=%d, num_epochs=%d, device=%s)" %
          (epoch_size, num_epochs, device))
    start = time.time()
    img_folder = "../../imdb_crop"
    mat = scipy.io.loadmat("../../imdb/imdb.mat")
    genders = mat['imdb'][0][0][3][0]
    full_paths = mat['imdb'][0][0][2][0]
    print("There are %d data points in total" % len(genders))
    data = []
    labels = []
    path_idx = 0
    for epoch in range(num_epochs):
        if path_idx >= len(genders):
            print("Stopping at epoch %d since no more data points" % epoch)
            break
        epoch_start = time.time()
        print("Starting epoch %d" % epoch)
        img_batch = []
        genders_batch = []
        while len(img_batch) < epoch_size:
            if path_idx >= len(genders):
                break
            path = os.path.join(img_folder, full_paths[path_idx][0])
            img = Image.open(path)
            # skip grayscale images and samples whose gender label is NaN
            if img.mode == "L" or math.isnan(genders[path_idx]):
                path_idx += 1
                continue
            tensor = preprocess(img)
            img_batch.append(tensor.unsqueeze(0))
            genders_batch.append(int(round(genders[path_idx])))
            path_idx += 1
        # the Caffe-converted model expects pixel values in [0, 255]
        img_batch = torch.cat(img_batch, dim=0).to(device) * 255.
        genders_batch = torch.tensor(genders_batch, device=device,
                                     dtype=torch.int64)
        data.append(vgg16(img_batch))
        labels.append(genders_batch)
        epoch_end = time.time()
        print("Epoch %d took %s" %
              (epoch, utils.sec2str(int(epoch_end - epoch_start))))
    end = time.time()
    print(utils.sec2str(int(end - start)))
    return torch.cat(data, dim=0), torch.cat(labels, dim=0)
def fetchAP(cursor, table_name, bssid, essid=None):
    '''Return AP records matching bssid (and optionally essid) from the local database'''
    bssid = mac2dec(bssid)
    query = 'SELECT bssid, essid, sec, key, wps \
             FROM {} \
             WHERE bssid = ?'.format(table_name)
    if essid:
        query += ' AND essid = ?'
        cursor.execute(query, (bssid, essid))
    else:
        cursor.execute(query, (bssid,))
    r = cursor.fetchall()
    entries = []
    for k in r:
        entry = {
            'time': strftime("%Y-%m-%d %H:%M:%S", gmtime()),
            'bssid': dec2mac(k[0]),
            'essid': k[1],
            'sec': sec2str(k[2]),
            'key': k[3] if k[2] else '<empty>',
            'wps': pin2str(k[4])
        }
        entries.append(entry)
    return entries
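# fetchAP assumes mac2dec/dec2mac helpers that round-trip a BSSID between its
# string form and the integer form stored in the database. Their real
# implementations are not shown here; this is only a plausible sketch under
# the assumption that BSSIDs are stored as 48-bit integers.
def mac2dec(mac):
    """'AA:BB:CC:DD:EE:FF' -> integer (assumed storage format)."""
    return int(mac.replace(':', '').replace('-', ''), 16)


def dec2mac(dec):
    """integer -> 'AA:BB:CC:DD:EE:FF' (assumed storage format)."""
    s = '%012X' % dec
    return ':'.join(s[i:i + 2] for i in range(0, 12, 2))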
def retrieve_c2i(dset, v_dset, capenc, vocab, args):
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    begin = time.time()
    print("-" * 50)
    print("retrieving nearest image to: '{}'".format(args.caption), flush=True)
    cap = vocab.return_idx([args.caption])
    length = [torch.sum(torch.ne(cap, vocab.padidx)).item()]
    with torch.no_grad():
        cap = cap.to(device)
        cap = capenc(cap, length)
        cap = cap.cpu().numpy()
    im = dset.embedded["image"]
    nd = im.shape[0]
    d = im.shape[1]
    cpu_index = faiss.IndexFlatIP(d)
    print("# images: {}, dimension: {}".format(nd, d), flush=True)
    # cap2im
    cpu_index.add(im)
    D, I = cpu_index.search(cap, 5)
    nnidx = I[0, 0]
    nnim_id = dset.embedded["img_id"][nnidx]
    img = v_dset.coco.loadImgs(nnim_id)[0]
    nnim = io.imread(img['coco_url'])
    plt.title("nearest neighbor of '{}'".format(args.caption))
    plt.axis('off')
    plt.imshow(nnim)
    plt.show(block=False)
    print("retrieval time {}".format(sec2str(time.time() - begin)), flush=True)
    print("-" * 50)
    plt.show()
    return
def retrieve_i2c(dset, v_dset, imenc, vocab, args):
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    begin = time.time()
    print("-" * 50)
    print("retrieving nearest caption to: '{}'".format(args.image_path),
          flush=True)
    im = Image.open(args.image_path)
    transform = transforms.Compose([
        transforms.Resize((args.imsize, args.imsize)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    im = transform(im).unsqueeze(0)
    with torch.no_grad():
        im = im.to(device)
        im = imenc(im)
        im = im.cpu().numpy()
    cap = dset.embedded["caption"]
    nd = cap.shape[0]
    d = cap.shape[1]
    cpu_index = faiss.IndexFlatIP(d)
    print("# captions: {}, dimension: {}".format(nd, d), flush=True)
    # im2cap
    cpu_index.add(cap)
    D, I = cpu_index.search(im, 5)
    nnidx = I[0, 0]
    nnann_id = dset.embedded["ann_id"][nnidx]
    anns = v_dset.coco.loadAnns(nnann_id)
    print("retrieval time {}".format(sec2str(time.time() - begin)), flush=True)
    v_dset.coco.showAnns(anns)
    print("-" * 50)
    return
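# All of the retrieval functions above follow the same faiss pattern: build an
# exact inner-product index over one modality, then query it with embeddings
# from the other. A self-contained toy illustration with random data (not the
# project's embeddings; faiss expects contiguous float32 numpy arrays):
import numpy as np
import faiss

d = 4                                           # embedding dimension
db = np.random.rand(10, d).astype("float32")    # 10 "image" embeddings
q = np.random.rand(1, d).astype("float32")      # one "caption" embedding
index = faiss.IndexFlatIP(d)                    # exact inner-product search
index.add(db)
D, I = index.search(q, 5)                       # top-5 scores and row indices
print(I[0, 0])                                  # row index of the nearest neighbor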
def train(epoch, loader, imenc, capenc, optimizer, lossfunc, vocab, args):
    begin = time.time()
    maxit = int(len(loader.dataset) / args.batch_size)
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    cumloss = 0
    for it, data in enumerate(loader):
        # data holds image, target, index, img_id
        image = data["image"]
        caption = data["caption"]
        img_id = data["img_id"]
        target = vocab.return_idx(caption)
        lengths = target.ne(vocab.padidx).sum(dim=1)
        optimizer.zero_grad()
        image = image.to(device)
        target = target.to(device)
        im_emb = imenc(image)
        cap_emb = capenc(target, lengths)
        lossval = lossfunc(im_emb, cap_emb)
        lossval.backward()
        optimizer.step()
        cumloss += lossval.item()
        if it % args.log_every == args.log_every - 1:
            print("epoch {} | {} | {:06d}/{:06d} iterations | loss: {:.08f}".format(
                epoch, sec2str(time.time() - begin), it + 1, maxit,
                cumloss / args.log_every), flush=True)
            cumloss = 0
    return imenc, capenc, optimizer
def dumpScriptNames(self, os):
    """ write list of scripts to logger """
    store = Store.of(self)
    pks = store.find(buildset_script,
                     buildset_script.buildset_id == self.id)
    scripts = pks.order_by(Asc(buildset_script.idx))
    counter_len = counter_length(scripts.count())
    arr = [['idx ', 'name', ' duration']]

    def scriptPath(_pack):
        """ collect the chain of parent scripts, outermost first """
        res = []
        cur = _pack.script
        while cur.parent:
            res.insert(0, cur.parent)
            cur = cur.parent
        return res

    def new(depth, pack_name, pack_idx, dur=None):
        """ build one table row, indenting the name by depth """
        if dur is None:
            dur = " -"
        arr2 = []
        arr2.append(pack_idx)
        arr2.append(" " * (depth * 3) + pack_name)
        arr2.append(" %s" % (str(dur).split('.')[0],))
        return arr2

    last_path = []
    for pack in scripts:
        path = scriptPath(pack)
        depth = len(path)
        if last_path != path and depth:
            arr.append(new(depth - 1, path[-1].name,
                           ' '.rjust(counter_len, ' ')))
            last_path = path
        # look up the duration of the last successful run of this script
        prevrun = store.find(
            build_script_status,
            build_script_status.buildset_script_id == pack.id,
            build_script_status.exit_code == 0
        ).order_by(Desc(build_script_status.id)).first()
        dur = " -"
        if prevrun and prevrun.end_time and prevrun.start_time:
            dur = prevrun.end_time - prevrun.start_time
            dur = utils.sec2str(dur.seconds)
        arr.append(new(depth, pack.script.name,
                       str(pack.idx).rjust(counter_len, '0'), dur))
    os(table_layout(arr, True, " ", False))
def main():
    args = get_arguments()
    SETTING = Dict(yaml.safe_load(
        open(os.path.join('arguments', args.arg + '.yaml'), encoding='utf8')))
    print(args)
    args.device = list(map(str, args.device))
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(args.device)

    # image transformer
    transform = transforms.Compose([
        transforms.Resize((SETTING.imsize, SETTING.imsize)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    if args.dataset == 'coco':
        val_dset = CocoDset(root=SETTING.root_path,
                            img_dir='val2017',
                            ann_dir='annotations/captions_val2017.json',
                            transform=transform)
    val_loader = DataLoader(val_dset,
                            batch_size=SETTING.batch_size,
                            shuffle=False,
                            num_workers=SETTING.n_cpu,
                            collate_fn=collater)
    vocab = Vocabulary(max_len=SETTING.max_len)
    vocab.load_vocab(args.vocab_path)
    imenc = ImageEncoder(SETTING.out_size, SETTING.cnn_type)
    capenc = CaptionEncoder(len(vocab), SETTING.emb_size, SETTING.out_size,
                            SETTING.rnn_type)
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    imenc = imenc.to(device)
    capenc = capenc.to(device)

    assert args.checkpoint is not None
    print("loading model and optimizer checkpoint from {} ...".format(
        args.checkpoint), flush=True)
    ckpt = torch.load(args.checkpoint, map_location=device)
    imenc.load_state_dict(ckpt["encoder_state"])
    capenc.load_state_dict(ckpt["decoder_state"])

    begin = time.time()
    dset = EmbedDset(val_loader, imenc, capenc, vocab, args)
    print("database created | {} ".format(sec2str(time.time() - begin)),
          flush=True)

    savedir = os.path.join("out", args.config_name)
    if not os.path.exists(savedir):
        os.makedirs(savedir, 0o777)
    image = dset.embedded["image"]
    caption = dset.embedded["caption"]
    n_i = image.shape[0]
    n_c = caption.shape[0]
    all = np.concatenate([image, caption], axis=0)
    emb_file = os.path.join(savedir, "embedding_{}.npy".format(n_i))
    save_file = os.path.join(savedir, "{}.npy".format(SETTING.method))
    vis_file = os.path.join(savedir, "{}.png".format(SETTING.method))
    np.save(emb_file, all)
    print("saved embeddings to {}".format(emb_file), flush=True)
    dimension_reduction(emb_file, save_file, method=SETTING.method)
    plot_embeddings(save_file, n_i, vis_file, method=SETTING.method)
def validate(epoch, loader, imenc, capenc, vocab, args, SETTING):
    begin = time.time()
    print("begin validation for epoch {}".format(epoch), flush=True)
    dset = EmbedDset(loader, imenc, capenc, vocab, args)
    print("val dataset created | {} ".format(sec2str(time.time() - begin)),
          flush=True)
    im = dset.embedded["image"]
    cap = dset.embedded["caption"]
    nd = im.shape[0]
    nq = cap.shape[0]
    d = im.shape[1]
    cpu_index = faiss.IndexFlatIP(d)
    print("# images: {}, # captions: {}, dimension: {}".format(nd, nq, d),
          flush=True)

    # im2cap
    cpu_index.add(cap)
    # score every image-caption combination and sort:
    # D holds similarities, I holds caption indices per image
    D, I = cpu_index.search(im, nq)
    data = {}
    allrank = []
    # TODO: Make more efficient, do not hardcode 5
    cap_per_image = 5
    # find the rank of each ground-truth caption (there are 5 per image)
    for i in range(cap_per_image):
        gt = (np.arange(nd) * cap_per_image).reshape(-1, 1) + i
        rank = np.where(I == gt)[1]
        allrank.append(rank)
    allrank = np.stack(allrank)
    # best (minimal) rank among the 5 ground-truth captions of each image
    allrank = np.amin(allrank, 0)
    # fraction of images whose best caption ranks below each cutoff
    for rank in [1, 5, 10, 20]:
        data["i2c_recall@{}".format(rank)] = 100 * np.sum(
            allrank < rank) / len(allrank)
    data["i2c_median@r"] = np.median(allrank) + 1
    data["i2c_mean@r"] = np.mean(allrank)

    # cap2im
    cpu_index.reset()
    cpu_index.add(im)
    D, I = cpu_index.search(cap, nd)
    # TODO: Make more efficient, do not hardcode 5
    gt = np.arange(nq).reshape(-1, 1) // cap_per_image
    allrank = np.where(I == gt)[1]
    for rank in [1, 5, 10, 20]:
        data["c2i_recall@{}".format(rank)] = 100 * np.sum(
            allrank < rank) / len(allrank)
    data["c2i_median@r"] = np.median(allrank) + 1
    data["c2i_mean@r"] = np.mean(allrank)

    print("-" * 50)
    print("results of cross-modal retrieval")
    for key, val in data.items():
        print("{}: {}".format(key, val), flush=True)
    print("-" * 50)
    return data
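# A toy check of the rank bookkeeping in validate(), with 2 images and 5
# captions each (the similarity orderings below are made up for illustration):
import numpy as np

I = np.array([[7, 8, 3, 0, 1, 2, 4, 5, 6, 9],    # captions ranked for image 0
              [9, 5, 8, 2, 0, 1, 3, 4, 6, 7]])   # captions ranked for image 1
nd, cap_per_image = 2, 5
allrank = []
for i in range(cap_per_image):
    gt = (np.arange(nd) * cap_per_image).reshape(-1, 1) + i
    allrank.append(np.where(I == gt)[1])         # rank of that ground truth
allrank = np.amin(np.stack(allrank), 0)          # best rank per image: [2, 0]
print(100 * np.sum(allrank < 1) / len(allrank))  # i2c_recall@1 -> 50.0
print(100 * np.sum(allrank < 5) / len(allrank))  # i2c_recall@5 -> 100.0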
def main():
    args = get_arguments()
    SETTING = Dict(yaml.safe_load(
        open(os.path.join('arguments', args.arg + '.yaml'), encoding='utf8')))
    print(args)
    args.device = list(map(str, args.device))
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(args.device)

    transform = transforms.Compose([
        transforms.Resize((SETTING.imsize, SETTING.imsize)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    if args.dataset == 'coco':
        val_dset = CocoDset(root=SETTING.root_path,
                            img_dir='val2017',
                            ann_dir='annotations/captions_val2017.json',
                            transform=transform)
    val_loader = DataLoader(val_dset,
                            batch_size=SETTING.batch_size,
                            shuffle=False,
                            num_workers=SETTING.n_cpu,
                            collate_fn=collater)
    vocab = Vocabulary(max_len=SETTING.max_len)
    vocab.load_vocab(args.vocab_path)
    imenc = ImageEncoder(SETTING.out_size, SETTING.cnn_type)
    capenc = CaptionEncoder(len(vocab), SETTING.emb_size, SETTING.out_size,
                            SETTING.rnn_type)
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    imenc = imenc.to(device)
    capenc = capenc.to(device)

    assert SETTING.checkpoint is not None
    print("loading model and optimizer checkpoint from {} ...".format(
        SETTING.checkpoint), flush=True)
    ckpt = torch.load(SETTING.checkpoint)
    imenc.load_state_dict(ckpt["encoder_state"])
    capenc.load_state_dict(ckpt["decoder_state"])

    begin = time.time()
    dset = EmbedDset(val_loader, imenc, capenc, vocab, args)
    print("database created | {} ".format(sec2str(time.time() - begin)),
          flush=True)

    retrieve_i2c(dset, val_dset, args.image_path, imenc, transform)
    retrieve_c2i(dset, val_dset, args.output_dir, args.caption, capenc, vocab)
def validate(epoch, loader, imenc, capenc, vocab, args):
    begin = time.time()
    print("begin validation for epoch {}".format(epoch), flush=True)
    dset = EmbedDataset(loader, imenc, capenc, vocab, args)
    print("val dataset created | {} ".format(sec2str(time.time() - begin)),
          flush=True)
    im = dset.embedded["image"]
    cap = dset.embedded["caption"]
    img_ids = dset.embedded["img_id"]
    ann_ids = dset.embedded["ann_id"]
    # print(len(img_ids))  # 5000
    # print(len(ann_ids))  # 25000
    nd = im.shape[0]
    nq = cap.shape[0]
    d = im.shape[1]
    cpu_index = faiss.IndexFlatIP(d)
    print("# images: {}, # captions: {}, dimension: {}".format(nd, nq, d),
          flush=True)

    # im2cap
    cpu_index.add(cap)
    D, I = cpu_index.search(im, nq)
    data = {}
    allrank = []
    for i in range(5):
        gt = (np.arange(nd) * 5).reshape(-1, 1) + i
        rank = np.where(I == gt)[1]
        allrank.append(rank)
    allrank = np.stack(allrank)
    allrank = np.amin(allrank, 0)
    for rank in [1, 5, 10, 20]:
        data["i2c_recall@{}".format(rank)] = 100 * np.sum(
            allrank < rank) / len(allrank)
    data["i2c_median@r"] = np.median(allrank) + 1

    # cap2im
    cpu_index.reset()
    cpu_index.add(im)
    D, I = cpu_index.search(cap, nd)
    gt = np.arange(nq).reshape(-1, 1) // 5
    allrank = np.where(I == gt)[1]
    for rank in [1, 5, 10, 20]:
        data["c2i_recall@{}".format(rank)] = 100 * np.sum(
            allrank < rank) / len(allrank)
    data["c2i_median@r"] = np.median(allrank) + 1

    print("-" * 50)
    print("results of cross-modal retrieval")
    for key, val in data.items():
        print("{}: {}".format(key, val), flush=True)
    print("-" * 50)
    return data
def dimension_reduction(numpyfile, dstfile, method="PCA"):
    all = np.load(numpyfile)
    begin = time.time()
    print("conducting {} on data...".format(method), flush=True)
    if method == "T-SNE":
        all = TSNE(n_components=2).fit_transform(all)
    elif method == "PCA":
        all = PCA(n_components=2).fit_transform(all)
    else:
        raise NotImplementedError()
    print("done | {} ".format(sec2str(time.time() - begin)), flush=True)
    np.save(dstfile, all)
    print("saved {} embeddings to {}".format(method, dstfile), flush=True)
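# dimension_reduction presumably relies on these module-level imports (an
# assumption; the names match the scikit-learn estimators used above), and is
# called as in main(). The file paths below are hypothetical placeholders:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import numpy as np

dimension_reduction("out/example/embedding_5000.npy",
                    "out/example/PCA.npy", method="PCA")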
def retrieve_c2i(dset, v_dset, savedir, caption, capenc, vocab, k=1):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    begin = time.time()
    print("-" * 50)
    print("source caption: '{}'".format(caption), flush=True)
    cap = vocab.return_idx([caption])
    length = torch.tensor([torch.sum(torch.ne(cap, vocab.padidx)).item()
                           ]).to(device, dtype=torch.long)
    with torch.no_grad():
        cap = cap.to(device)
        cap = capenc(cap, length)
    cap = cap.cpu().numpy()
    im = dset.embedded["image"]
    nd = im.shape[0]
    d = im.shape[1]
    cpu_index = faiss.IndexFlatIP(d)
    print("# images: {}, dimension: {}".format(nd, d), flush=True)
    # cap2im
    cpu_index.add(im)
    D, I = cpu_index.search(cap, k)
    print("retrieval time {}".format(sec2str(time.time() - begin)), flush=True)
    nnimid = []
    for i in range(k):
        nnidx = I[0, i]
        nnim_id = dset.embedded["img_id"][nnidx]
        nnimid.append(nnim_id)
    img = v_dset.coco.loadImgs(nnimid)
    print("-" * 50)
    print("{} nearest neighbors of '{}'".format(k, caption))
    if k == 1:
        plt.figure(figsize=(8, 10))
        nnim = io.imread(img[0]['coco_url'])
        plt.imshow(nnim)
        plt.axis('off')
    elif k > 1:
        fig, axs = plt.subplots(1, k, figsize=(8 * k, 10))
        fig.suptitle("retrieved {} nearest neighbors of '{}'".format(
            k, caption))
        for i in range(k):
            nnim = io.imread(img[i]['coco_url'])
            axs[i].imshow(nnim)
            axs[i].axis('off')
    else:
        raise ValueError("k must be a positive integer")
    # plt.show(block=False)
    # plt.show()
    if not os.path.exists(savedir):
        os.makedirs(savedir)
    plt.savefig(os.path.join(savedir, "output.png"))
    print("-" * 50)
def main():
    args = parse_args()
    transform = transforms.Compose([
        transforms.Resize((args.imsize, args.imsize)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    if args.dataset == 'coco':
        val_dset = CocoDataset(root=args.root_path,
                               imgdir='val2017',
                               jsonfile='annotations/captions_val2017.json',
                               transform=transform,
                               mode='all')
    val_loader = DataLoader(val_dset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=args.n_cpu,
                            collate_fn=collater_eval)
    vocab = Vocabulary(max_len=args.max_len)
    vocab.load_vocab(args.vocab_path)
    imenc = ImageEncoder(args.out_size, args.cnn_type)
    capenc = CaptionEncoder(len(vocab), args.emb_size, args.out_size,
                            args.rnn_type)
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    imenc = imenc.to(device)
    capenc = capenc.to(device)
    assert args.checkpoint is not None
    print("loading model and optimizer checkpoint from {} ...".format(
        args.checkpoint), flush=True)
    ckpt = torch.load(args.checkpoint)
    imenc.load_state_dict(ckpt["encoder_state"])
    capenc.load_state_dict(ckpt["decoder_state"])
    begin = time.time()
    dset = EmbedDataset(val_loader, imenc, capenc, vocab, args)
    print("database created | {} ".format(sec2str(time.time() - begin)),
          flush=True)
    retrieve_i2c(dset, val_dset, imenc, vocab, args)
    retrieve_c2i(dset, val_dset, capenc, vocab, args)
def load_vocab(self, textfile):
    before = time.time()
    print("building vocabulary...", flush=True)
    # read every sentence (one per line) from the text file
    with open(textfile, 'r') as f:
        sentences = f.readlines()
    # tokenize into words and punctuation
    sent_proc = list(map(self.text_proc.preprocess, sentences))
    # build the vocabulary dictionary
    self.text_proc.build_vocab(sent_proc, min_freq=self.min_freq)
    self.len = len(self.text_proc.vocab)
    # padding index
    self.padidx = self.text_proc.vocab.stoi["<pad>"]
    print("done building vocabulary, minimum frequency is {} times".format(
        self.min_freq), flush=True)
    print("# of words in vocab: {} | {}".format(
        self.len, sec2str(time.time() - before)), flush=True)
def load_vocab(self):
    time_start = time.time()
    print('building vocabulary...', flush=True)
    self.text_json_df = []
    for i in range(len(self.annotation_file)):
        sentences = self.load_json_text(
            os.path.join(self.annotation_path, self.annotation_file[i],
                         self.caption_all_json[i]))
        # print(sentences[:3])
        sent_proc = list(map(self.text_proc.preprocess, sentences))
        # print(sent_proc[:3])
        print('number of sentences:', len(sent_proc))
        # self.text_proc.build_vocab(sent_proc, min_freq=self.min_freq)
        self.text_proc.build_vocab(sent_proc,
                                   min_freq=self.min_freq,
                                   vectors=torchtext.vocab.GloVe(name='840B',
                                                                 dim=300))
    vocab_proc = self.text_proc.vocab
    # print('top 10 most frequent words:',
    #       self.text_proc.vocab.freqs.most_common(10))
    word_embeddings = self.text_proc.vocab.vectors
    # print('self.text_proc.vocab.vectors.size():',
    #       self.text_proc.vocab.vectors.size())
    self.len = len(self.text_proc.vocab)
    self.padidx = self.text_proc.vocab.stoi['<pad>']
    print("done building vocabulary, minimum frequency is {} times".format(
        self.min_freq), flush=True)
    print("# of words in vocab: {} | {}".format(
        self.len, sec2str(time.time() - time_start)), flush=True)
    print('================================================================================')
    return vocab_proc, word_embeddings
def load_vocab(self, textfile):
    """ build vocabulary from textfile. """
    before = time.time()
    print("building vocabulary...", flush=True)
    with open(textfile, "r") as f:
        sentences = f.readlines()
    sent_proc = list(map(self.text_proc.preprocess, sentences))
    self.text_proc.build_vocab(sent_proc, min_freq=self.min_freq)
    self.len = len(self.text_proc.vocab)
    self.padidx = self.text_proc.vocab.stoi["<pad>"]
    self.bosidx = self.text_proc.vocab.stoi["<bos>"]
    print("done building vocabulary, minimum frequency is {} times".format(
        self.min_freq), flush=True)
    print("# of words in vocab: {} | {}".format(
        self.len, sec2str(time.time() - before)), flush=True)
def retrieve_i2c(dset, v_dset, impath, imenc, transform, k=1):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    im = Image.open(impath)
    print("-" * 50)
    plt.title("source image")
    plt.imshow(np.asarray(im))
    plt.axis('off')
    plt.show(block=False)
    plt.show()
    im = transform(im).unsqueeze(0)
    begin = time.time()
    with torch.no_grad():
        im = im.to(device)
        im = imenc(im)
        im = im.cpu().numpy()
    cap = dset.embedded["caption"]
    nd = cap.shape[0]
    d = cap.shape[1]
    cpu_index = faiss.IndexFlatIP(d)
    print("# captions: {}, dimension: {}".format(nd, d), flush=True)
    # im2cap
    cpu_index.add(cap)
    D, I = cpu_index.search(im, k)
    # flatten the per-image ann_id lists once, outside the loop
    ann_ids = [a for ids in dset.embedded["ann_id"] for a in ids]
    nnann = []
    for i in range(k):
        nnidx = I[0, i]
        nnann.append(ann_ids[nnidx])
    anns = v_dset.coco.loadAnns(nnann)
    print("retrieval time {}".format(sec2str(time.time() - begin)), flush=True)
    print("-" * 50)
    print("{} nearest neighbors of image:".format(k))
    v_dset.coco.showAnns(anns)
    print("-" * 50)
spacy = spacy.load('en_core_web_sm')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

start_now = datetime.datetime.now()
print(start_now.strftime('%Y/%m/%d %H:%M:%S'))

### Test ###
print('----- Test -----')
start1 = time.time()

print('Loading test dataset...')
start2 = time.time()
my_test_dataset = MyDataset_BMN(mode='test')
end2 = sec2str(time.time() - start2)
print('Finished loading test dataset. | {}'.format(end2))

test_batch_size = 1
rnn_model = 'TF'  # ['LSTM', 'GRU', 'Transformer', 'TF']
attn_mode = 'multihead'  # ['simple', 'multihead']
cap_hidden_size = vid_hidden_size = 512
cap_num_layers = vid_num_layers = 2
cap_bidirectional = vid_bidirectional = True
common_size = 256
grd_mode = 'simple'  # ['simple', 'multi']
num_workers = 4

test_data_loader = torch.utils.data.DataLoader(
def BMN_train(train_dataloader, val_dataloader, bmn, criterion, optimizer,
              lr_scheduler, CONFIG, args, device, date_path):
    train_start = time.time()

    # make result directory
    result_path = os.path.join(CONFIG.BMN_result_dir, date_path)
    if os.path.exists(result_path):
        shutil.rmtree(result_path)
    os.makedirs(result_path, exist_ok=True)

    # make checkpoint directory
    checkpoint_path = CONFIG.BMN_checkpoint_dir
    if os.path.exists(checkpoint_path):
        shutil.rmtree(checkpoint_path)
    os.makedirs(checkpoint_path, exist_ok=True)

    CONFIG_df = pd.DataFrame.from_dict(CONFIG, orient='index')
    CONFIG_df.to_csv(os.path.join(result_path, 'config.csv'), header=False)

    best_loss = 1e10
    train_loss_list = []
    train_pem_reg_loss_list = []
    train_pem_cls_loss_list = []
    train_tem_loss_list = []
    val_loss_list = []
    val_pem_reg_loss_list = []
    val_pem_cls_loss_list = []
    val_tem_loss_list = []
    lr_list = []

    for epoch in range(CONFIG.BMN_epoch_num):
        epoch_start = time.time()

        # train
        print('-' * 5, 'train', '-' * 5)
        bmn.train()
        train_loss = 0
        train_pem_reg_loss = 0
        train_pem_cls_loss = 0
        train_tem_loss = 0
        for i, train_data in enumerate(train_dataloader):
            input_data = train_data['video']
            gt_confidence_map = train_data['confidence_map']
            gt_start = train_data['start']
            gt_end = train_data['end']
            input_data = input_data.to(device)
            gt_confidence_map = gt_confidence_map.to(device)
            gt_start = gt_start.to(device)
            gt_end = gt_end.to(device)
            optimizer.zero_grad()
            confidence_map, start, end = bmn(input_data)
            loss = criterion(confidence_map, start, end, gt_confidence_map,
                             gt_start, gt_end)
            loss[0].backward()
            train_loss += loss[0].cpu().detach().numpy()
            train_pem_reg_loss += loss[1].cpu().detach().numpy()
            train_pem_cls_loss += loss[2].cpu().detach().numpy()
            train_tem_loss += loss[3].cpu().detach().numpy()
            optimizer.step()
            if i % 200 == 0:
                print(epoch, i, loss[0])
            # break
        train_loss /= len(train_dataloader)
        train_pem_reg_loss /= len(train_dataloader)
        train_pem_cls_loss /= len(train_dataloader)
        train_tem_loss /= len(train_dataloader)
        train_loss_list.append(train_loss)
        train_pem_reg_loss_list.append(train_pem_reg_loss)
        train_pem_cls_loss_list.append(train_pem_cls_loss)
        train_tem_loss_list.append(train_tem_loss)
        print(sec2str(time.time() - epoch_start))

        # validation
        print('-' * 5, 'validation', '-' * 5)
        bmn.eval()
        val_loss = 0
        val_pem_reg_loss = 0
        val_pem_cls_loss = 0
        val_tem_loss = 0
        with torch.no_grad():
            for i, val_data in enumerate(val_dataloader):
                input_data = val_data['video']
                gt_confidence_map = val_data['confidence_map']
                gt_start = val_data['start']
                gt_end = val_data['end']
                input_data = input_data.to(device)
                gt_confidence_map = gt_confidence_map.to(device)
                gt_start = gt_start.to(device)
                gt_end = gt_end.to(device)
                confidence_map, start, end = bmn(input_data)
                loss = criterion(confidence_map, start, end,
                                 gt_confidence_map, gt_start, gt_end)
                val_loss += loss[0].cpu().detach().numpy()
                val_pem_reg_loss += loss[1].cpu().detach().numpy()
                val_pem_cls_loss += loss[2].cpu().detach().numpy()
                val_tem_loss += loss[3].cpu().detach().numpy()
        val_loss /= len(val_dataloader)
        val_pem_reg_loss /= len(val_dataloader)
        val_pem_cls_loss /= len(val_dataloader)
        val_tem_loss /= len(val_dataloader)
        val_loss_list.append(val_loss)
        val_pem_reg_loss_list.append(val_pem_reg_loss)
        val_pem_cls_loss_list.append(val_pem_cls_loss)
        val_tem_loss_list.append(val_tem_loss)

        save_checkpoint(checkpoint_path, epoch, bmn, optimizer, val_loss,
                        lr_scheduler)
        # keep a separate copy of the best model so far
        if val_loss <= best_loss:
            best_loss = val_loss
            save_checkpoint(result_path, epoch, bmn, optimizer, val_loss,
                            lr_scheduler)

        lr_list.append(optimizer.param_groups[0]['lr'])
        lr_scheduler.step(train_loss)

        epoch_end = time.time() - epoch_start
        print('Epoch: [{}/{}], Time: {}, train_loss: {loss:.4f}, val_loss: {val_loss:.4f}'
              .format(epoch + 1, CONFIG.BMN_epoch_num, sec2str(epoch_end),
                      loss=train_loss, val_loss=val_loss))
        print('train: [pem_reg_loss: {}, pem_cls_loss: {}, tem_loss: {}]'.format(
            train_pem_reg_loss, train_pem_cls_loss, train_tem_loss))

        # save BMN training log
        log_dict = {
            'epoch': list(range(epoch + 1)),
            'learning_rate': lr_list,
            'train_loss': train_loss_list,
            'train_pem_reg_loss': train_pem_reg_loss_list,
            'train_pem_cls_loss': train_pem_cls_loss_list,
            'train_tem_loss': train_tem_loss_list,
            'val_loss': val_loss_list,
            'val_pem_reg_loss': val_pem_reg_loss_list,
            'val_pem_cls_loss': val_pem_cls_loss_list,
            'val_tem_loss': val_tem_loss_list,
        }
        log_df = pd.DataFrame.from_dict(log_dict).set_index('epoch')
        log_df.to_csv(os.path.join(result_path, 'log.csv'), mode='w')

        # save figure of the loss log
        plt.figure()
        plt.plot(train_loss_list, label='train')
        plt.plot(val_loss_list, label='val')
        plt.yscale('log')
        plt.legend()
        plt.savefig(os.path.join(result_path, 'loss.png'))
        plt.close()

    train_end = time.time() - train_start
    print('finished train: {}'.format(sec2str(train_end)))
    # weights and biases
    if not args.no_wandb:
        wandb.init(
            config=CONFIG,
            project='two-stage-Temporal Moment Retrieval',
            job_type='training',
        )

    # date path
    date_path = date
    # config_name = str(args.config)[14:-5]
    # date_path = os.path.join(date, config_name)

    BMN_main(CONFIG, args, device, date)


if __name__ == '__main__':
    start_main = time.time()
    start_now = datetime.datetime.now()
    # date = start_now.strftime('%Y-%m-%d/%H')
    # date = start_now.strftime('%Y-%m-%d')
    date = start_now.strftime('%Y-%m')
    print(start_now.strftime('%Y/%m/%d %H:%M:%S'))

    main(date)

    end_main = sec2str(time.time() - start_main)
    end_now = datetime.datetime.now()
    print('Finished main.py! | {} | {}'.format(
        end_main, end_now.strftime('%Y/%m/%d %H:%M:%S')))
    print('=' * 70)
def main():
    args = parse_args()
    transform = transforms.Compose([
        transforms.Resize((args.imsize, args.imsize)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    if args.dataset == "coco":
        val_dset = CocoDataset(
            root=args.root_path,
            split="val",
            transform=transform,
        )
    val_loader = DataLoader(
        val_dset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.n_cpu,
        collate_fn=collater,
    )
    vocab = Vocabulary(max_len=args.max_len)
    vocab.load_vocab(args.vocab_path)
    imenc = ImageEncoder(args.out_size, args.cnn_type)
    capenc = CaptionEncoder(len(vocab), args.emb_size, args.out_size,
                            args.rnn_type)
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    imenc = imenc.to(device)
    capenc = capenc.to(device)
    assert args.checkpoint is not None
    print("loading model and optimizer checkpoint from {} ...".format(
        args.checkpoint), flush=True)
    ckpt = torch.load(args.checkpoint, map_location=device)
    imenc.load_state_dict(ckpt["encoder_state"])
    capenc.load_state_dict(ckpt["decoder_state"])
    begin = time.time()
    dset = EmbedDataset(val_loader, imenc, capenc, vocab, args)
    print("database created | {} ".format(sec2str(time.time() - begin)),
          flush=True)
    savedir = os.path.join("out", args.config_name)
    if not os.path.exists(savedir):
        os.makedirs(savedir, 0o777)
    image = dset.embedded["image"]
    caption = dset.embedded["caption"]
    n_i = image.shape[0]
    n_c = caption.shape[0]
    all = np.concatenate([image, caption], axis=0)
    emb_file = os.path.join(savedir, "embedding_{}.npy".format(n_i))
    save_file = os.path.join(savedir, "{}.npy".format(args.method))
    vis_file = os.path.join(savedir, "{}.png".format(args.method))
    np.save(emb_file, all)
    print("saved embeddings to {}".format(emb_file), flush=True)
    dimension_reduction(emb_file, save_file, method=args.method)
    plot_embeddings(save_file, n_i, vis_file, method=args.method)
    def __len__(self):
        return len(self.video_list)


if __name__ == '__main__':
    start = time.time()
    my_dataset = ActivityNet_Captions_BMN_Dataset(mode='test')
    print(len(my_dataset))
    my_dataset = Charades_STA_BMN_Dataset(mode='test')
    print(len(my_dataset))
    my_dataset = TACoS_BMN_Dataset(mode='test')
    print(len(my_dataset))
    my_dataset = BMN_Dataset(mode='test')
    print(len(my_dataset))
    print(sec2str(time.time() - start))

    data_loader = torch.utils.data.DataLoader(my_dataset,
                                              batch_size=4,
                                              shuffle=False,
                                              drop_last=True,
                                              num_workers=0,
                                              collate_fn=BMN_collate_fn)
    print('=' * 70)
    print(len(data_loader))
    for i, data in enumerate(data_loader):
        if i == 1:
            print(data['video_id'])
            print(data['video'].size())
            print(data['video_length'].size())
            print(data['start'].size())
            # print(data['start'])
from utils import sec2str, model_state_dict, collate_fn
import spacy

spacy = spacy.load('en_core_web_sm')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

start_now = datetime.datetime.now()
print(start_now.strftime('%Y/%m/%d %H:%M:%S'))

start0 = time.time()
print('Setting dataset...')
MyFile()
MyTestFile()
MakeDataset()
end0 = sec2str(time.time() - start0)
print('Finished setting dataset. | {}'.format(end0))
print('================================================================================')

print('----- Train & Validation -----')
start1 = time.time()

print('Loading train dataset...')
start2 = time.time()
train_dataset = MyDataset_BMN(mode='train')
end2 = sec2str(time.time() - start2)
print('Finished loading train dataset. | {}'.format(end2))

print('Loading validation dataset...')
start3 = time.time()
val_dataset = MyDataset_BMN(mode='val')
def train(epoch, loader, imenc, capenc, optimizer, lossfunc, vocab, args,
          SETTING):
    begin = time.time()
    # maximum number of iterations per epoch
    maxit = int(len(loader.dataset) / SETTING.batch_size)
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    cumloss = 0
    for it, data in enumerate(loader):
        # data holds image, target, index, img_id
        # image: batch_size x img_size
        image = data["image"]
        # caption: batch_size entries with 5 sentences each
        caption = data["caption"]
        # choose 1 caption at random out of the 5
        caption = [i[np.random.randint(0, len(i))] for i in caption]
        img_id = data["img_id"]
        # convert caption sentences to token ids
        target = vocab.return_idx(caption)
        # length of each sentence, excluding padding
        lengths = target.ne(vocab.padidx).sum(dim=1)
        optimizer.zero_grad()
        image = image.to(device)
        target = target.to(device)
        lengths = lengths.to(device)
        im_emb = imenc(image)
        cap_emb = capenc(target, lengths)
        lossval = lossfunc(im_emb, cap_emb)
        lossval.backward()
        # clip gradient norm
        if SETTING.grad_clip > 0:
            clip_grad_norm_(imenc.parameters(), SETTING.grad_clip)
            clip_grad_norm_(capenc.parameters(), SETTING.grad_clip)
        optimizer.step()
        cumloss += lossval.item()
        if it % SETTING.log_every == SETTING.log_every - 1:
            print("epoch {} | {} | {:06d}/{:06d} iterations | loss: {:.08f}".format(
                epoch, sec2str(time.time() - begin), it + 1, maxit,
                cumloss / SETTING.log_every), flush=True)
            cumloss = 0
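# Nearly every snippet above formats an elapsed time with sec2str, whose
# implementation is not shown. Below is only a plausible sketch of the
# time-formatting variant; note that fetchAP further up uses an unrelated
# sec2str that renders Wi-Fi security flags, not durations.
def sec2str(sec):
    """Render a duration in seconds as H:MM:SS (assumed behavior)."""
    sec = int(sec)
    h, rem = divmod(sec, 3600)
    m, s = divmod(rem, 60)
    return "{:d}:{:02d}:{:02d}".format(h, m, s)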