Example #1
    # The head of this snippet is truncated in the source; the fallback below
    # is an assumption consistent with the later `restore != -1` check.
    if args.restore is None:
        restore = -1
    else:
        restore = int(args.restore)
    tasks = [t.strip() for t in tasks.split(',')]
    batch_sizes = [int(b.strip()) for b in batch_sizes.split(',')]

    if len(tasks) != len(batch_sizes):
        raise ValueError(
            'Number of tasks provided does not match the number of batch sizes provided.'
        )

    n_gpus = int(args.n_gpus)
    n_tasks = len(tasks) * n_jobs

    shared_model = omninet.OmniNet(gpu_id=0)
    if restore != -1:
        shared_model.restore(model_save_path, restore)
    else:
        restore = 0

    shared_model = shared_model.to(0)
    shared_model.share_memory()
    counters = [Counter(restore) for _ in range(len(tasks))]
    barrier = mp.Barrier(n_tasks)
    start = int(restore / n_jobs)
    # Declare training processes for multi-gpu hogwild training
    processes = []
    for i in range(n_tasks):
        # If more than one GPU is used, reserve GPU 0 for the shared model
        # and place the workers on the remaining GPUs.
        if n_gpus > 1:
            gpu_id = (i % (n_gpus - 1)) + 1
        else:
            gpu_id = 0
        # The tail of this loop is truncated in the source; the spawn/join
        # below is an assumption, not verbatim source. `train_steps` is taken
        # to be parsed from the CLI earlier; the argument order matches
        # train()'s signature in Example #2.
        p = mp.Process(target=train,
                       args=(shared_model, tasks[i % len(tasks)],
                             batch_sizes[i % len(tasks)], train_steps,
                             gpu_id, start, restore,
                             counters[i % len(tasks)], barrier))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
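Both examples depend on a shared `Counter` (`Counter(restore)` above, `counter.increment()` in the training loop of Example #2). The class itself is defined elsewhere in the repository; below is a minimal sketch of a process-safe counter with that interface, assuming `torch.multiprocessing` is imported as `mp` like in the snippet above.

class Counter:
    # Hypothetical reconstruction: a process-safe step counter backed by a
    # shared multiprocessing value; increment() returns the new count.
    def __init__(self, start=0):
        self.val = mp.Value('i', start)

    def increment(self):
        with self.val.get_lock():
            self.val.value += 1
            return self.val.value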
Example #2
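The `train` function below relies on several module-level names. Repo-specific modules (`omninet`, the dataloader module `dl`, the routine module `r`, `ScheduledOptim`, `ensure_shared_grads`, and path constants such as `caption_dir` and `model_save_path`) are defined elsewhere in the repository; a plausible set of the remaining standard imports is sketched here, with `torch.utils.tensorboard` assumed for `SummaryWriter` (the original may use `tensorboardX` instead).

import os
import random
import sys

import torch
from torch.optim import Adam
from torch.utils.tensorboard import SummaryWriter  # assumed; may be tensorboardX
from tqdm import tqdm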
def train(shared_model,
          task,
          batch_size,
          train_steps,
          gpu_id,
          start,
          restore,
          counter,
          barrier=None,
          save_interval=None,
          eval_interval=None,
          log=True):
    log_dir = 'logs/%s' % task
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    if log:
        summary_writer = SummaryWriter(log_dir)
    # Create local model

    torch.manual_seed(int(random.random() * 1000))
    if gpu_id > 0:
        model = omninet.OmniNet(gpu_id=gpu_id)
        model = model.cuda(gpu_id)
    else:
        #For GPU 0, use the shared model always
        model = shared_model

    if task == 'caption':
        DL, val_dl = dl.coco_cap_batchgen(caption_dir=caption_dir,
                                          image_dir=coco_images,
                                          num_workers=8,
                                          batch_size=batch_size)

        optimizer = ScheduledOptim(Adam(filter(lambda x: x.requires_grad,
                                               shared_model.parameters()),
                                        betas=(0.9, 0.98),
                                        eps=1e-09),
                                   512,
                                   16000,
                                   restore,
                                   init_lr=0.02)
    elif task == 'vqa':
        DL, val_dl = dl.vqa_batchgen(vqa_dir,
                                     coco_images,
                                     num_workers=8,
                                     batch_size=batch_size)
        optimizer = ScheduledOptim(Adam(filter(lambda x: x.requires_grad,
                                               shared_model.parameters()),
                                        betas=(0.9, 0.98),
                                        eps=1e-09),
                                   512,
                                   16000,
                                   restore,
                                   max_lr=0.0001,
                                   init_lr=0.02)
    elif task == 'hmdb':
        DL, val_dl = dl.hmdb_batchgen(hmdb_data_dir,
                                      hmdb_process_dir,
                                      num_workers=8,
                                      batch_size=batch_size,
                                      test_batch_size=int(batch_size / 4),
                                      clip_len=16)
        optimizer = ScheduledOptim(Adam(filter(lambda x: x.requires_grad,
                                               shared_model.parameters()),
                                        betas=(0.9, 0.98),
                                        eps=1e-09),
                                   512,
                                   16000,
                                   restore,
                                   max_lr=0.0001,
                                   init_lr=0.02)
    elif task == 'penn':
        DL, val_dl, test_dl = dl.penn_dataloader(
            penn_data_dir,
            batch_size=batch_size,
            test_batch_size=int(batch_size / 2),
            num_workers=4,
            vocab_file='conf/penn_vocab.json')
        optimizer = ScheduledOptim(Adam(filter(lambda x: x.requires_grad,
                                               shared_model.parameters()),
                                        betas=(0.9, 0.98),
                                        eps=1e-09),
                                   512,
                                   16000,
                                   restore,
                                   init_lr=0.02)
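    # Every branch wraps Adam in ScheduledOptim with the values (512, 16000,
    # restore), which match the Transformer warmup schedule
    # lr = d_model**-0.5 * min(step**-0.5, step * warmup**-1.5) with
    # d_model=512 and 16000 warmup steps, `restore` resuming the step count.
    # (Inferred from the argument values; the wrapper's exact formula lives
    # in its own module.)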

    else:
        raise ValueError('Unknown task: %s' % task)

    model = model.train()

    for i in range(start, train_steps):
        model.zero_grad()
        if barrier is not None:
            barrier.wait()
        if gpu_id > 0:
            with torch.cuda.device(gpu_id):
                model.load_state_dict(shared_model.state_dict())

        # Calculate loss
        step = counter.increment()
        if task == 'caption':
            if log and eval_interval is not None and i % eval_interval == 0:
                model = model.eval()
                val_loss = 0
                val_acc = 0
                print('-' * 100)
                print('Evaluation step')
                for b in tqdm(val_dl):
                    imgs = b['img']
                    if gpu_id >= 0:
                        imgs = imgs.cuda(device=gpu_id)
                    captions = b['cap']
                    # In val mode the targets are not passed for prediction;
                    # they are used only for loss calculation.
                    _, loss, acc = r.image_caption(model,
                                                   imgs,
                                                   targets=captions,
                                                   mode='val',
                                                   return_str_preds=True)
                    val_loss += float(loss.detach().cpu().numpy())
                    val_acc += acc
                val_loss /= len(val_dl)
                val_acc = (val_acc / len(val_dl))
                summary_writer.add_scalar('Val_loss', val_loss, step)
                print('Step %d, COCO validation loss: %f, Accuracy %f %%' %
                      (step, val_loss, val_acc))
                print('-' * 100)
                model = model.train()
            batch = next(DL)
            if gpu_id >= 0:
                imgs = batch['img'].cuda(device=gpu_id)
            else:
                imgs = batch['img']
            captions = batch['cap']
            _, loss, acc = r.image_caption(model, imgs, targets=captions)
            loss.backward()
            loss = loss.detach()
            if log:
                summary_writer.add_scalar('Loss', loss, step)
            print('Step %d, Caption Loss: %f, Accuracy:  %f %%' %
                  (step, loss, acc))

        elif task == 'vqa':
            if log and eval_interval is not None and i % eval_interval == 0:
                model = model.eval()
                val_loss = 0
                val_acc = 0
                print('-' * 100)
                print('Evaluation step')
                for b in tqdm(val_dl):
                    imgs = b['img']
                    answers = b['ans']
                    if gpu_id >= 0:
                        imgs = imgs.cuda(device=gpu_id)
                        answers = answers.cuda(device=gpu_id)
                    questions = b['ques']
                    # In val mode the targets are not passed for prediction;
                    # they are used only for loss calculation.
                    pred, loss, acc = r.vqa(model,
                                            imgs,
                                            questions,
                                            targets=answers,
                                            mode='val',
                                            return_str_preds=True)
                    val_loss += float(loss.detach().cpu().numpy())
                    val_acc += acc
                val_loss /= len(val_dl)
                val_acc = (val_acc / len(val_dl))
                summary_writer.add_scalar('Val_loss', val_loss, step)
                print('Step %d, VQA validation loss: %f, Accuracy %f %%' %
                      (step, val_loss, val_acc))
                print('-' * 100)
                model = model.train()
                continue
            batch = next(DL)
            if gpu_id >= 0:
                imgs = batch['img'].cuda(device=gpu_id)
                answers = batch['ans'].cuda(device=gpu_id)
            else:
                imgs = batch['img']
                answers = batch['ans']
            questions = batch['ques']
            _, loss, acc = r.vqa(model, imgs, questions, targets=answers)
            loss.backward()
            loss = loss.detach()
            if log:
                summary_writer.add_scalar('Loss', loss, step)
            print('Step %d, VQA Loss: %f, Accuracy:  %f %%' %
                  (step, loss, acc))
        elif task == 'hmdb':
            if log and eval_interval is not None and i % eval_interval == 0:
                model = model.eval()
                val_loss = 0
                val_acc = 0
                print('-' * 100)
                print('Evaluation step')
                for b in tqdm(val_dl):
                    vid, labels = b
                    if gpu_id >= 0:
                        vid = vid.cuda(device=gpu_id)
                        labels = labels.cuda(device=gpu_id)
                    _, loss, acc = r.hmdb(model,
                                          vid,
                                          targets=labels,
                                          mode='val')
                    val_loss += float(loss.detach().cpu().numpy())
                    val_acc += acc
                val_loss /= len(val_dl)
                val_acc = (val_acc / len(val_dl))
                summary_writer.add_scalar('Val_loss', val_loss, step)
                print('Step %d, HMDB validation loss: %f, Accuracy %f %%' %
                      (step, val_loss, val_acc))
                print('-' * 100)
                model = model.train()
                continue
            vid, labels = next(DL)
            if gpu_id >= 0:
                vid = vid.cuda(device=gpu_id)
                labels = labels.cuda(device=gpu_id)
            _, loss, acc = r.hmdb(model,
                                  vid,
                                  targets=labels,
                                  return_str_preds=True)
            loss.backward()
            loss = loss.detach()
            if log:
                summary_writer.add_scalar('Loss', loss, step)
            print('Step %d, HMDB Loss: %f, Accuracy:  %f %%' %
                  (step, loss, acc))

        elif task == 'penn':
            if log and eval_interval is not None and i % eval_interval == 0:
                model = model.eval()
                val_loss = 0
                val_acc = 0
                print('-' * 100)
                print('Evaluation step')
                for b in tqdm(test_dl):
                    en = b['text']
                    targets = b['tokens']
                    pad_id = b['pad_id']
                    pad_mask = b['pad_mask']
                    if gpu_id >= 0:
                        targets = targets.to(gpu_id)
                        pad_mask = pad_mask.to(gpu_id)
                    _, loss, acc = r.penn(model,
                                          en,
                                          target_pad_mask=pad_mask,
                                          pad_id=pad_id,
                                          targets=targets,
                                          mode='val',
                                          return_str_preds=True)
                    loss = loss.detach()
                    val_loss += float(loss.cpu().numpy())
                    val_acc += acc
                val_loss /= len(test_dl)
                val_acc = val_acc / len(test_dl)
                summary_writer.add_scalar('Val_loss', val_loss, step)
                print('Step %d, PENN validation loss: %f, Accuracy %f %%' %
                      (step, val_loss, val_acc))
                print('-' * 100)
                model = model.train()
            batch = next(DL)
            en = batch['text']
            targets = batch['tokens']
            pad_id = batch['pad_id']
            pad_mask = batch['pad_mask']
            if gpu_id >= 0:
                targets = targets.to(gpu_id)
                pad_mask = pad_mask.to(gpu_id)
            _, loss, acc = r.penn(model,
                                  en,
                                  pad_id=pad_id,
                                  targets=targets,
                                  target_pad_mask=pad_mask)
            loss.backward()
            loss = loss.detach()
            if log:
                summary_writer.add_scalar('Loss', loss, step)
            print('Step %d, PENN Loss: %f, Accuracy:  %f %%' %
                  (step, loss, acc))

        # End Calculate loss
        if gpu_id > 0:
            ensure_shared_grads(model, shared_model, gpu_id)
        optimizer.step()
        # Save model
        if save_interval is not None and (i + 1) % save_interval == 0:
            shared_model.save(model_save_path, step)
        sys.stdout.flush()
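The training loop hands worker gradients to the shared model through `ensure_shared_grads`, which is defined elsewhere in the repository. Below is a minimal sketch of the usual hogwild hand-off, under the assumption that local gradients simply need to be copied onto the shared model's parameters on GPU 0.

def ensure_shared_grads(model, shared_model, gpu_id):
    # Sketch (assumed, not the repo's exact helper): copy each worker
    # gradient onto the matching parameter of the shared model on GPU 0.
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if param.grad is not None:
            shared_param._grad = param.grad.to(0)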
Example #3
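Besides repo-specific helpers (`omninet`, `extract_pixels_from_image`, `extract_frames_from_video`, `get_tensor_size`), the function below needs only standard-library imports:

import json
import os
import pickle
import time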
def vision_and_language_prediction(cfg,
                                   task,
                                   image=None,
                                   text=None,
                                   video=None):

    model_file = cfg.OMNINET.MODEL
    verbose = cfg.OMNINET.VERBOSE
    penn_vocab_file = os.path.join(cfg.OMNINET.BASE, 'conf/penn_vocab.json')
    vqa_vocab_file = os.path.join(cfg.OMNINET.BASE, 'conf/vqa_vocab.pkl')
    hmdb_labels_file = os.path.join(cfg.OMNINET.BASE, 'conf/hmdblabels.txt')

    # if not verbose:
    #     sys.stdout = open(os.devnull, 'w')

    #Load Omninet model
    model = omninet.OmniNet(gpu_id=0)
    model.restore_file(model_file)
    model = model.to(0)
    model = model.eval()
    model.reset(1)

    if image is not None:
        image = extract_pixels_from_image(image)
        print(f'Image encoding input tensor shape is {image.size()}')
        print(
            f'Image encoding input tensor size is {get_tensor_size(image):.3f}'
        )
        image = image.to(0)

        image_start = time.time()
        model.encode_images(image)
        print(f'Encode image took {time.time() - image_start:.2f}s')

    if text is not None:
        text_start = time.time()
        model.encode_englishtexts([text])
        print(f'Encode text took {time.time() - text_start:.2f}s')

    if video is not None:
        video = extract_frames_from_video(video, cfg.OMNINET.EXTRACT_FREQUENCY,
                                          cfg.OMNINET.VIDEO_RESIZE_HEIGHT,
                                          cfg.OMNINET.VIDEO_RESIZE_WIDTH,
                                          cfg.OMNINET.CROP_SIZE,
                                          cfg.OMNINET.CLIP_LEN)
        # print(f'Video encoding input tensor shape is {video.size()}')
        print(
            f'Video encoding input tensor size is {get_tensor_size(video):.3f}'
        )
        video = video.to(0)

        video_start = time.time()
        model.encode_videos(video)
        print(f'Encode videos took {time.time() - video_start:.2f}s')

    # if not verbose:
    #     sys.stdout = sys.__stdout__

    result = ""
    start = time.time()
    if task == 'caption':
        prediction = model.decode_greedy('IMAGE_CAPTION', num_steps=100)
        prediction = prediction.argmax(-1)
        prediction = model.english_language_perph.decode_tokens(prediction)
        result += f'Caption Prediction: {prediction[0]}'

    elif task == 'hmdb':
        prediction = model.decode_greedy('HMDB', num_steps=1)
        prediction = prediction.argmax(-1).cpu().tolist()[0][0]
        with open(hmdb_labels_file, 'r') as f:
            lines = f.readlines()
        id_to_label = dict()
        for l in lines:
            # Strip the trailing newline before splitting so the label is clean.
            label_id, label = l.strip().split(' ')
            id_to_label[label_id] = label
        prediction = id_to_label[str(prediction)]
        result += f'Action recognition prediction: {prediction}'

    elif task == 'vqa':
        prediction = model.decode_greedy('VQA', num_steps=1)
        prediction = prediction.argmax(-1).cpu().tolist()[0][0]
        with open(vqa_vocab_file, 'rb') as f:
            ans_to_id, id_to_ans = pickle.loads(f.read())
        prediction = id_to_ans[prediction]
        result += f'VQA Prediction: {prediction}'

    elif task == 'penn':
        if text is None:
            raise ValueError(
                'No text has been provided. POS tagging cannot proceed.')
        prediction = model.decode_greedy('PENN',
                                         num_steps=len(text.split(' ')))
        prediction = prediction.argmax(-1).cpu().tolist()[0]
        with open(penn_vocab_file, 'r') as f:
            data = json.loads(f.read())
        id_to_tag = data['id_to_tag']
        penn_text = ' '.join(id_to_tag[str(p)] for p in prediction)
        result += f'POS tagging Prediction: {penn_text}'
    latency = time.time() - start
    print(f'{task} inference took {latency:.2f}s')

    return result, latency
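A hypothetical invocation sketch: `load_config` and the file paths are placeholders, and `cfg` is assumed to be the same config object whose `cfg.OMNINET.*` fields are read above.

# Hypothetical usage; `load_config` and 'sample.jpg' are placeholders.
cfg = load_config('conf/omninet.yaml')
result, latency = vision_and_language_prediction(cfg,
                                                 'vqa',
                                                 image='sample.jpg',
                                                 text='What color is the car?')
print(result)
print(f'latency: {latency:.2f}s')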