else:
    restore = int(args.restore)
tasks = tasks.split(',')
tasks = [t.strip() for t in tasks]
batch_sizes = batch_sizes.split(',')
batch_sizes = [int(b.strip()) for b in batch_sizes]
if len(tasks) != len(batch_sizes):
    raise Exception('Number of tasks provided does not match the number of batch sizes provided.')
n_gpus = int(args.n_gpus)
n_tasks = len(tasks) * n_jobs
shared_model = omninet.OmniNet(gpu_id=0)
if restore != -1:
    shared_model.restore(model_save_path, restore)
else:
    restore = 0
shared_model = shared_model.to(0)
shared_model.share_memory()
counters = [Counter(restore) for i in range(len(tasks))]
barrier = mp.Barrier(n_tasks)
start = int(restore / n_jobs)
# Declare training processes for multi-GPU hogwild training
processes = []
for i in range(n_tasks):
    # If more than one GPU is used, use the first GPU only for model sharing
    if n_gpus > 1:
        # Assumed completion of the truncated excerpt: spread the workers over
        # GPUs 1..n_gpus-1, keeping GPU 0 reserved for the shared model.
        gpu_id = (i % (n_gpus - 1)) + 1
    else:
        gpu_id = 0
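# ---------------------------------------------------------------------------
# The block above assumes a process-safe `Counter` helper whose definition is
# not shown in this excerpt. Below is a minimal sketch of how such a shared
# step counter is commonly built on multiprocessing primitives; the class
# name and the `increment()` method match the calls above, but the body is an
# illustrative assumption, not the repo's actual implementation.
import torch.multiprocessing as mp


class Counter:
    """Global training-step counter shared across Hogwild worker processes."""

    def __init__(self, initial=0):
        self.val = mp.Value('i', initial)  # integer living in shared memory
        self.lock = mp.Lock()              # serializes the read-modify-write

    def increment(self):
        with self.lock:
            self.val.value += 1
            return self.val.value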
def train(shared_model, task, batch_size, train_steps, gpu_id, start, restore, counter,
          barrier=None, save_interval=None, eval_interval=None, log=True):
    log_dir = 'logs/%s' % task
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    if log:
        summary_writer = SummaryWriter(log_dir)
    # Create local model (each worker seeds its own RNG)
    torch.manual_seed(int(random.random() * 1000))
    if gpu_id > 0:
        model = omninet.OmniNet(gpu_id=gpu_id)
        model = model.cuda(gpu_id)
    else:
        # For GPU 0, use the shared model always
        model = shared_model
    if task == 'caption':
        DL, val_dl = dl.coco_cap_batchgen(caption_dir=caption_dir, image_dir=coco_images,
                                          num_workers=8, batch_size=batch_size)
        optimizer = ScheduledOptim(
            Adam(filter(lambda x: x.requires_grad, shared_model.parameters()),
                 betas=(0.9, 0.98), eps=1e-09),
            512, 16000, restore, init_lr=0.02)
    elif task == 'vqa':
        DL, val_dl = dl.vqa_batchgen(vqa_dir, coco_images, num_workers=8, batch_size=batch_size)
        optimizer = ScheduledOptim(
            Adam(filter(lambda x: x.requires_grad, shared_model.parameters()),
                 betas=(0.9, 0.98), eps=1e-09),
            512, 16000, restore, max_lr=0.0001, init_lr=0.02)
    elif task == 'hmdb':
        DL, val_dl = dl.hmdb_batchgen(hmdb_data_dir, hmdb_process_dir, num_workers=8,
                                      batch_size=batch_size,
                                      test_batch_size=int(batch_size / 4), clip_len=16)
        optimizer = ScheduledOptim(
            Adam(filter(lambda x: x.requires_grad, shared_model.parameters()),
                 betas=(0.9, 0.98), eps=1e-09),
            512, 16000, restore, max_lr=0.0001, init_lr=0.02)
    elif task == 'penn':
        DL, val_dl, test_dl = dl.penn_dataloader(penn_data_dir, batch_size=batch_size,
                                                 test_batch_size=int(batch_size / 2),
                                                 num_workers=4, vocab_file='conf/penn_vocab.json')
        optimizer = ScheduledOptim(
            Adam(filter(lambda x: x.requires_grad, shared_model.parameters()),
                 betas=(0.9, 0.98), eps=1e-09),
            512, 16000, restore, init_lr=0.02)
    model = model.train()

    for i in range(start, train_steps):
        model.zero_grad()
        if barrier is not None:
            barrier.wait()
        # Workers on other GPUs sync their local replica with the shared weights
        if gpu_id > 0:
            with torch.cuda.device(gpu_id):
                model.load_state_dict(shared_model.state_dict())
        # Calculate loss
        step = counter.increment()
        if task == 'caption':
            if log and eval_interval is not None and i % eval_interval == 0:
                model = model.eval()
                val_loss = 0
                val_acc = 0
                print('-' * 100)
                print('Evaluation step')
                for b in tqdm(val_dl):
                    imgs = b['img']
                    if gpu_id >= 0:
                        imgs = imgs.cuda(device=gpu_id)
                    captions = b['cap']
                    # In val mode we do not pass the targets for prediction;
                    # we use them only for loss calculation.
                    _, loss, acc = r.image_caption(model, imgs, targets=captions,
                                                   mode='val', return_str_preds=True)
                    val_loss += float(loss.detach().cpu().numpy())
                    val_acc += acc
                val_loss /= len(val_dl)
                val_acc = val_acc / len(val_dl)
                summary_writer.add_scalar('Val_loss', val_loss, step)
                print('Step %d, COCO validation loss: %f, Accuracy %f %%' % (step, val_loss, val_acc))
                print('-' * 100)
                model = model.train()
            batch = next(DL)
            if gpu_id >= 0:
                imgs = batch['img'].cuda(device=gpu_id)
            else:
                imgs = batch['img']
            captions = batch['cap']
            _, loss, acc = r.image_caption(model, imgs, targets=captions)
            loss.backward()
            loss = loss.detach()
            if log:
                summary_writer.add_scalar('Loss', loss, step)
                print('Step %d, Caption Loss: %f, Accuracy: %f %%' % (step, loss, acc))
        elif task == 'vqa':
            if log and eval_interval is not None and i % eval_interval == 0:
                model = model.eval()
                val_loss = 0
                val_acc = 0
                print('-' * 100)
                print('Evaluation step')
                for b in tqdm(val_dl):
                    imgs = b['img']
                    answers = b['ans']
                    if gpu_id >= 0:
                        imgs = imgs.cuda(device=gpu_id)
                        answers = answers.cuda(device=gpu_id)
                    questions = b['ques']
                    # In val mode we do not pass the targets for prediction;
                    # we use them only for loss calculation.
                    pred, loss, acc = r.vqa(model, imgs, questions, targets=answers,
                                            mode='val', return_str_preds=True)
                    val_loss += float(loss.detach().cpu().numpy())
                    val_acc += acc
                val_loss /= len(val_dl)
                val_acc = val_acc / len(val_dl)
                summary_writer.add_scalar('Val_loss', val_loss, step)
                print('Step %d, VQA validation loss: %f, Accuracy %f %%' % (step, val_loss, val_acc))
                print('-' * 100)
                model = model.train()
                continue
            batch = next(DL)
            if gpu_id >= 0:
                imgs = batch['img'].cuda(device=gpu_id)
                answers = batch['ans'].cuda(device=gpu_id)
            else:
                imgs = batch['img']
                answers = batch['ans']
            questions = batch['ques']
            _, loss, acc = r.vqa(model, imgs, questions, targets=answers)
            loss.backward()
            loss = loss.detach()
            if log:
                summary_writer.add_scalar('Loss', loss, step)
                print('Step %d, VQA Loss: %f, Accuracy: %f %%' % (step, loss, acc))
        elif task == 'hmdb':
            if log and eval_interval is not None and i % eval_interval == 0:
                model = model.eval()
                val_loss = 0
                val_acc = 0
                print('-' * 100)
                print('Evaluation step')
                for b in tqdm(val_dl):
                    vid, labels = b
                    if gpu_id >= 0:
                        vid = vid.cuda(device=gpu_id)
                        labels = labels.cuda(device=gpu_id)
                    _, loss, acc = r.hmdb(model, vid, targets=labels, mode='val')
                    val_loss += float(loss.detach().cpu().numpy())
                    val_acc += acc
                val_loss /= len(val_dl)
                val_acc = val_acc / len(val_dl)
                summary_writer.add_scalar('Val_loss', val_loss, step)
                print('Step %d, HMDB validation loss: %f, Accuracy %f %%' % (step, val_loss, val_acc))
                print('-' * 100)
                model = model.train()
                continue
            vid, labels = next(DL)
            if gpu_id >= 0:
                vid = vid.cuda(device=gpu_id)
                labels = labels.cuda(device=gpu_id)
            _, loss, acc = r.hmdb(model, vid, targets=labels, return_str_preds=True)
            loss.backward()
            loss = loss.detach()
            if log:
                summary_writer.add_scalar('Loss', loss, step)
                print('Step %d, HMDB Loss: %f, Accuracy: %f %%' % (step, loss, acc))
        elif task == 'penn':
            if log and eval_interval is not None and i % eval_interval == 0:
                model = model.eval()
                val_loss = 0
                val_acc = 0
                print('-' * 100)
                print('Evaluation step')
                for b in tqdm(test_dl):
                    en = b['text']
                    targets = b['tokens']
                    pad_id = b['pad_id']
                    pad_mask = b['pad_mask']
                    if gpu_id >= 0:
                        targets = targets.to(gpu_id)
                        pad_mask = pad_mask.to(gpu_id)
                    _, loss, acc = r.penn(model, en, target_pad_mask=pad_mask, pad_id=pad_id,
                                          targets=targets, mode='val', return_str_preds=True)
                    loss = loss.detach()
                    val_loss += float(loss.cpu().numpy())
                    val_acc += acc
                # Average over the loader that was actually iterated above
                val_loss /= len(test_dl)
                val_acc = val_acc / len(test_dl)
                summary_writer.add_scalar('Val_loss', val_loss, step)
                print('Step %d, PENN validation loss: %f, Accuracy %f %%' % (step, val_loss, val_acc))
                print('-' * 100)
                model = model.train()
            batch = next(DL)
            en = batch['text']
            targets = batch['tokens']
            pad_id = batch['pad_id']
            pad_mask = batch['pad_mask']
            if gpu_id >= 0:
                targets = targets.to(gpu_id)
                pad_mask = pad_mask.to(gpu_id)
            _, loss, acc = r.penn(model, en, pad_id=pad_id, targets=targets,
                                  target_pad_mask=pad_mask)
            loss.backward()
            loss = loss.detach()
            if log:
                summary_writer.add_scalar('Loss', loss, step)
                print('Step %d, PENN Loss: %f, Accuracy: %f %%' % (step, loss, acc))
        # End calculate loss
        # Workers on other GPUs push their gradients back to the shared model
        if gpu_id > 0:
            ensure_shared_grads(model, shared_model, gpu_id)
        optimizer.step()
        # Save model
        if save_interval is not None and (i + 1) % save_interval == 0:
            shared_model.save(model_save_path, step)
        sys.stdout.flush()
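# ---------------------------------------------------------------------------
# train() relies on two helpers that this excerpt does not define:
# `ensure_shared_grads`, which moves a worker's gradients onto the shared
# model before optimizer.step(), and `ScheduledOptim`, the optimizer wrapper
# constructed above with (d_model=512, warmup=16000). The sketches below show
# one plausible implementation of each, following the classic A3C
# gradient-copy pattern and the Transformer inverse-square-root warmup
# schedule; they are assumptions for illustration, not the repo's actual code.

def ensure_shared_grads(model, shared_model, gpu_id):
    """Copy gradients from the local GPU replica onto the shared model."""
    for param, shared_param in zip(model.parameters(), shared_model.parameters()):
        if param.grad is not None:
            # The shared model lives on GPU 0, so move each gradient there.
            shared_param._grad = param.grad.to(0)


class ScheduledOptim:
    """Noam-style warmup schedule wrapped around an inner optimizer."""

    def __init__(self, optimizer, d_model, n_warmup_steps, n_current_steps=0,
                 init_lr=0.02, max_lr=None):
        self.optimizer = optimizer
        self.d_model = d_model
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = n_current_steps
        self.init_lr = init_lr
        self.max_lr = max_lr

    def step(self):
        # lr rises roughly linearly for n_warmup_steps, then decays as step^-0.5
        self.n_current_steps += 1
        scale = min(self.n_current_steps ** -0.5,
                    self.n_current_steps * self.n_warmup_steps ** -1.5)
        lr = self.init_lr * self.d_model ** -0.5 * scale
        if self.max_lr is not None:
            lr = min(lr, self.max_lr)
        for group in self.optimizer.param_groups:
            group['lr'] = lr
        self.optimizer.step()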
def vision_and_language_prediction(cfg, task, image=None, text=None, video=None):
    model_file = cfg.OMNINET.MODEL
    verbose = cfg.OMNINET.VERBOSE
    penn_vocab_file = os.path.join(cfg.OMNINET.BASE, 'conf/penn_vocab.json')
    vqa_vocab_file = os.path.join(cfg.OMNINET.BASE, 'conf/vqa_vocab.pkl')
    hmdb_labels_file = os.path.join(cfg.OMNINET.BASE, 'conf/hmdblabels.txt')
    # if not verbose:
    #     sys.stdout = open(os.devnull, 'w')
    # Load OmniNet model
    model = omninet.OmniNet(gpu_id=0)
    model.restore_file(model_file)
    model = model.to(0)
    model = model.eval()
    model.reset(1)
    if image is not None:
        image = extract_pixels_from_image(image)
        print(f'Image encoding input tensor shape is {image.size()}')
        print(f'Image encoding input tensor size is {get_tensor_size(image):.3f}')
        image = image.to(0)
        image_start = time.time()
        model.encode_images(image)
        print(f'Encode image took {time.time() - image_start:.2f}s')
    if text is not None:
        text_start = time.time()
        model.encode_englishtexts([text])
        print(f'Encode text took {time.time() - text_start:.2f}s')
    if video is not None:
        video = extract_frames_from_video(video, cfg.OMNINET.EXTRACT_FREQUENCY,
                                          cfg.OMNINET.VIDEO_RESIZE_HEIGHT,
                                          cfg.OMNINET.VIDEO_RESIZE_WIDTH,
                                          cfg.OMNINET.CROP_SIZE, cfg.OMNINET.CLIP_LEN)
        # print(f'Video encoding input tensor shape is {video.size()}')
        print(f'Video encoding input tensor size is {get_tensor_size(video):.3f}')
        video = video.to(0)
        video_start = time.time()
        model.encode_videos(video)
        print(f'Encode videos took {time.time() - video_start:.2f}s')
    # if not verbose:
    #     sys.stdout = sys.__stdout__

    result = ""
    start = time.time()
    if task == 'caption':
        prediction = model.decode_greedy('IMAGE_CAPTION', num_steps=100)
        prediction = prediction.argmax(-1)
        prediction = model.english_language_perph.decode_tokens(prediction)
        result += f'Caption Prediction: {prediction[0]}'
    elif task == 'hmdb':
        prediction = model.decode_greedy('HMDB', num_steps=1)
        prediction = prediction.argmax(-1).cpu().tolist()[0][0]
        with open(hmdb_labels_file, 'r') as f:
            lines = f.readlines()
        id_to_label = dict()
        for l in lines:
            label_id, label = l.strip().split(' ')
            id_to_label[label_id] = label
        prediction = id_to_label[str(prediction)]
        result += f'Action recognition prediction: {prediction}'
    elif task == 'vqa':
        prediction = model.decode_greedy('VQA', num_steps=1)
        prediction = prediction.argmax(-1).cpu().tolist()[0][0]
        with open(vqa_vocab_file, 'rb') as f:
            ans_to_id, id_to_ans = pickle.loads(f.read())
        prediction = id_to_ans[prediction]
        result += f'VQA Prediction: {prediction}'
    elif task == 'penn':
        if text is None:
            raise Exception('No text has been provided. POS tagging cannot proceed.')
        prediction = model.decode_greedy('PENN', num_steps=len(text.split(' ')))
        prediction = prediction.argmax(-1).cpu().tolist()[0]
        with open(penn_vocab_file, 'r') as f:
            data = json.loads(f.read())
        id_to_tag = data['id_to_tag']
        penn_text = ''
        for p in prediction:
            penn_text = '%s %s' % (penn_text, id_to_tag[str(p)])
        result += f'POS tagging Prediction: {penn_text}'
    latency = time.time() - start
    print(f'{task} inference took {latency:.2f}s')
    return result, latency
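# ---------------------------------------------------------------------------
# A hypothetical invocation of vision_and_language_prediction(). The function
# only reads a handful of cfg attributes, so a stdlib SimpleNamespace can
# stand in for whatever config object the project actually uses; the paths
# and values below are made-up examples. Video tasks would additionally need
# the cfg.OMNINET video fields referenced above (EXTRACT_FREQUENCY, etc.).
from types import SimpleNamespace

if __name__ == '__main__':
    cfg = SimpleNamespace(OMNINET=SimpleNamespace(
        MODEL='checkpoints/omninet.pth',  # assumed checkpoint location
        BASE='.',                         # directory holding conf/ vocab files
        VERBOSE=True,
    ))
    # Image captioning on a single image; 'sample.jpg' is a placeholder path.
    result, latency = vision_and_language_prediction(cfg, 'caption', image='sample.jpg')
    print(result, f'({latency:.2f}s)')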