Example #1
def init_model(args):
    print('load vocab from ' + args.vocab)
    vocab = torch.load(args.vocab)

    if args.share_vocab:
        if args.model == 'CrosslingualBase':
            print('using CrosslingualBase model')
            model = CrosslingualBase(args, vocab)
        elif args.model == 'CrosslingualConv':
            print('using CrosslingualConv model')
            model = CrosslingualConv(args, vocab)
        else:
            print('model does not exist')
            exit(0)
    else:
        if args.model == 'Baseline':
            print('using Baseline model')
            model = Baseline(args, vocab)
        else:
            print('model does not exist')
            exit(0)

    # initialize model
    if args.uniform_init:
        model.uniform_init(args.uniform_init)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.999), eps=1e-8, weight_decay=args.weight_decay, amsgrad=True)
    #optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, nesterov=True)

    eval_loss_func = train_loss_func = nn.CrossEntropyLoss(weight=torch.FloatTensor([3.0, 1.0]))
    if args.cuda:
        model = model.cuda()
        eval_loss_func = eval_loss_func.cuda()
        train_loss_func = train_loss_func.cuda()
    return model, vocab, optimizer, eval_loss_func, train_loss_func
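
The training loss above weights class 0 three times as heavily as class 1. A self-contained sketch of how such a class-weighted cross-entropy behaves (the logits and labels below are illustrative, not from the original project):

import torch
import torch.nn as nn

# Illustrative 2-class example: errors on class 0 count three times as much as errors on class 1.
loss_fn = nn.CrossEntropyLoss(weight=torch.tensor([3.0, 1.0]))

logits = torch.tensor([[0.2, 1.5], [2.0, -1.0]])  # batch of 2 samples, 2 classes
targets = torch.tensor([0, 1])                    # ground-truth labels
print(loss_fn(logits, targets))                   # weighted mean cross-entropy
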
Example #2
def main_merge():
    global args, best_corr

    args.store_name = '{}_merged'.format(args.model)
    args.store_name = args.store_name + datetime.now().strftime('_%m-%d_%H-%M')
    args.start_epoch = 0

    check_rootfolders(args)

    model = Baseline(args.img_feat_size, args.au_feat_size)

    model = torch.nn.DataParallel(model).cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    if args.use_multistep:
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, args.step_milestones, args.step_decay)
    # ckpt structure {epoch, state_dict, optimizer, best_corr}
    if args.resume and os.path.isfile(args.resume):
        print('Load checkpoint:', args.resume)
        ckpt = torch.load(args.resume)
        args.start_epoch = ckpt['epoch']
        best_corr = ckpt['best_corr']
        model.load_state_dict(ckpt['state_dict'])
        optimizer.load_state_dict(ckpt['optimizer'])
        print('Loaded ckpt at epoch:', args.start_epoch)

    # initialize datasets
    train_loader = torch.utils.data.DataLoader(dataset=EEV_Dataset(
        csv_path=[args.train_csv, args.val_csv],
        vidmap_path=[args.train_vidmap, args.val_vidmap],
        image_feat_path=args.image_features,
        audio_feat_path=args.audio_features,
        mode='merge'),
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=False,
                                               drop_last=True)

    log_training = open(
        os.path.join(args.root_log, args.store_name, 'log.csv'), 'w')
    with open(os.path.join(args.root_log, args.store_name, 'args.txt'),
              'w') as f:
        f.write(str(args))

    tb_writer = SummaryWriter(
        log_dir=os.path.join(args.root_log, args.store_name))
    for epoch in range(args.start_epoch, args.epochs):
        train(train_loader, model, optimizer, epoch, log_training, tb_writer)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'best_corr': 0.0,
            }, False)
        if args.use_multistep:
            scheduler.step()
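
save_checkpoint is called above but not defined in this excerpt. A common implementation (an assumption, not the project's own helper) writes the latest state and keeps a copy of the best one:

import os
import shutil
import torch

def save_checkpoint(state, is_best, ckpt_dir='checkpoints', name='ckpt.pth.tar'):
    # Save the latest checkpoint and, when is_best is set, keep a separate copy of it.
    os.makedirs(ckpt_dir, exist_ok=True)
    path = os.path.join(ckpt_dir, name)
    torch.save(state, path)
    if is_best:
        shutil.copyfile(path, os.path.join(ckpt_dir, 'best_' + name))
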
Example #3
def main():
    args = config()
    # unpack frequently used args into local variables
    device = args.device
    batch_size = args.batch_size
    num_workers = args.num_worker
    torch.backends.cudnn.benchmark = True

    # data loaders - SignalDataset already moves data to CUDA
    # datasets: dict with train / eval / test splits
    datasets = {}
    dataloaders = {}

    for k in ['train', 'eval', 'test']:
        datasets[k] = SignalDataset(k, args.data_dir)
        # shuffle the train/eval splits, keep the test split in order
        dataloaders[k] = DataLoader(datasets[k],
                                    batch_size,
                                    shuffle=(k != 'test'),
                                    num_workers=num_workers)

    # model load

    if args.ngpu > 1:
        print(f"Model Build....{args.model}")
        model = args.model().to(device)
        model = torch.nn.DataParallel(model)
    else:
        print(f"Model Build....{args.model}")
        model = Baseline().to(device)

    # criterion

    criterion = nn.BCEWithLogitsLoss()
    # criterion = nn.MSELoss()

    # optimizer
    # adam defaults => lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0
    params = split_weight(model)
    #optimizer = optim.Adam(params)
    optimizer = optim.Adamax(params, lr=args.lr)
    # scheduler
    scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

    # Train
    best_model = train.train(dataloaders, model, criterion, optimizer,
                             scheduler, args)

    # Test
    #test_loss, test_pred = test(dataloaders, model, criterion, optimizer, scheduler, args)

    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)
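
split_weight is referenced but not shown. A typical version (a sketch under the usual convention, with an illustrative decay value) puts biases and normalization parameters in a no-weight-decay group:

def split_weight(model, weight_decay=1e-4):
    # Build optimizer param groups: weight decay for weight matrices, none for biases/1-D params.
    decay, no_decay = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if name.endswith('bias') or param.ndim == 1:
            no_decay.append(param)
        else:
            decay.append(param)
    return [
        {'params': decay, 'weight_decay': weight_decay},
        {'params': no_decay, 'weight_decay': 0.0},
    ]
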
Example #4
def main():
    net = Baseline(num_classes=culane.num_classes, deep_base=args['deep_base']).cuda()

    print('load checkpoint \'%s.pth\' for evaluation' % args['checkpoint'])
    pretrained_dict = torch.load(os.path.join(ckpt_path, exp_name, args['checkpoint'] + '_checkpoint.pth'))
    pretrained_dict = {k[7:]: v for k, v in pretrained_dict.items()}
    net.load_state_dict(pretrained_dict)

    net.eval()

    save_dir = os.path.join(ckpt_path, exp_name, 'vis_%s_test' % args['checkpoint'])
    check_mkdir(save_dir)
    log_path = os.path.join(save_dir, str(datetime.datetime.now()) + '.log')

    data_list = [l.strip('\n') for l in open(os.path.join(culane.root, culane.list, 'test_gt.txt'), 'r')]

    loss_record = AverageMeter()
    gt_all, prediction_all = [], []

    for idx in range(len(data_list)):
        print('evaluating %d / %d' % (idx + 1, len(data_list)))

        img = Image.open(culane.root + data_list[idx].split(' ')[0]).convert('RGB')
        gt = Image.open(culane.root + data_list[idx].split(' ')[1])

        img, gt = val_joint_transform(img, gt)

        with torch.no_grad():
            img_var = Variable(img_transform(img).unsqueeze(0)).cuda()
            gt_var = Variable(mask_transform(gt).unsqueeze(0)).cuda()

            prediction = net(img_var)[0]

            loss = criterion(prediction, gt_var)
            loss_record.update(loss.data, 1)

            scoremap = F.softmax(prediction, dim=1).data.squeeze().cpu().numpy()

            prediction = prediction.data.max(1)[1].squeeze().cpu().numpy().astype(np.uint8)
            prediction_all.append(prediction)
            gt_all.append(np.array(gt))

        if args['save_results']:
            check_mkdir(save_dir + data_list[idx].split(' ')[0][:-10])
            out_file = open(os.path.join(save_dir, data_list[idx].split(' ')[0][1:-4] + '.lines.txt'), 'w')
            prob2lines(scoremap, out_file)

    acc, acc_cls, mean_iu, fwavacc = evaluation(prediction_all, gt_all, culane.num_classes)
    log = 'val results: loss %.5f  acc %.5f  acc_cls %.5f  mean_iu %.5f  fwavacc %.5f' % \
              (loss_record.avg, acc, acc_cls, mean_iu, fwavacc)
    print(log)
    open(log_path, 'w').write(log + '\n')
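
AverageMeter (used for loss_record here and for timing in the later examples) is a small running-average helper; a standard implementation, assumed here:

class AverageMeter:
    """Tracks the latest value, running sum, count, and average."""

    def __init__(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
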
Example #5
    def __init__(self, key, seq_length=1):
        super().__init__(key, seq_length)
        self.key = str(key)
        self.train_buffer = []
        self.exec_buffer = []
        self.model_name = 'svm'
        self.model = Baseline(self.model_name)
        self.model_path = os.path.join('model_{}_baseline'.format(seq_length),
                                       self.key)
        self.eval_path = os.path.join(
            'evaluation_{}_baseline'.format(seq_length),
            self.key + '_{}_test.csv'.format(self.model_name))
        if self.model.exist(self.model_path):
            print('Using existing model: {}'.format(self.key))
            self.model.load(self.model_path)
Example #6
def test(args):
    device = torch.device('cuda' if args.cuda else 'cpu')

    pprint(args.__dict__)
    interface = FileInterface(**args.__dict__)
    piqa_model = Baseline(**args.__dict__).to(device)

    processor = SquadProcessor(args.char_vocab_size,
                               args.glove_vocab_size,
                               args.word_vocab_size,
                               elmo=args.elmo)

    bind_model(interface, processor, piqa_model)
    interface.load(args.iteration, session=args.load_dir)

    test_examples = load_squad(interface.test_path, draft=args.draft)
    test_dataset = tuple(
        processor.preprocess(example) for example in test_examples)

    test_sampler = SquadSampler(test_dataset, bucket=True)
    test_loader = DataLoader(test_dataset,
                             batch_size=args.batch_size,
                             sampler=test_sampler,
                             collate_fn=processor.collate)

    print('Inferencing')
    with torch.no_grad():
        piqa_model.eval()
        pred = {}
        for batch_idx, (test_batch, _) in enumerate(
                zip(test_loader, range(args.eval_steps))):
            test_batch = {
                key: val.to(device)
                for key, val in test_batch.items()
            }
            model_output = piqa_model(**test_batch)
            results = processor.postprocess_batch(test_dataset, test_batch,
                                                  model_output)
            if batch_idx % args.dump_period == 0:
                dump = get_dump(test_dataset, test_batch, model_output,
                                results)
                interface.dump(batch_idx, dump)
            for result in results:
                pred[result['id']] = result['pred']

            print('[%d/%d]' % (batch_idx + 1, len(test_loader)))
        interface.pred(pred)
Example #7
def test():
    # Prepare env
    env = create_env()
    h, w, c = env.observation_space.shape

    # Load the top 3 scoring models
    device = torch.device("cpu")
    model_dir = "./policy_grad"
    model_fns = {}
    for fn in os.listdir(model_dir):
        if fn.endswith('.pth'):
            score = fn.split("_")[-1][:-4]
            model_fns[fn] = float(score)
    top_models = heapq.nlargest(3, model_fns, key=model_fns.get)

    models = []
    for fn in top_models:
        path = os.path.join(model_dir, fn)
        model = Baseline(h, w).to(device)
        model.load_state_dict(torch.load(path, map_location='cpu'))
        model.eval()
        models.append(model)

    # Watch race car perform
    state = env.reset().transpose((2, 0, 1))
    state = torch.tensor([state], dtype=torch.float, device=device)
    total_reward = 0
    for t in count():
        # Select and perform an action
        votes = []
        for model in models:
            pi, _ = model(state)
            votes.append(pi.argmax().item())
        action_idx = Counter(votes).most_common(1)[0][0]
        action = index_to_action(action_idx)
        state, reward, done, _ = env.step(action)
        env.render()

        # Update
        state = state.transpose((2, 0, 1))
        state = torch.tensor([state], dtype=torch.float, device=device)
        total_reward += reward
        if done:
            break
    print("Total reward: {}".format(total_reward))
Example #8
def train():
    # Prepare gym
    env = create_env()
    h, w, c = env.observation_space.shape

    # Prepare models
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_dir, fn = "./policy_grad", '{}.pth'
    model = Baseline(h, w).to(device)
    model.train()
    optimizer = optim.RMSprop(model.parameters(),
                              lr=LEARN_RATE,
                              weight_decay=WEIGHT_DECAY)

    # Train
    steps_done = 0
    num_episodes = 2000
    episode_rewards = []

    for i_episode in tqdm(range(num_episodes)):
        # Complete 1 episode
        print("Episode {}".format(i_episode + 1))
        i_rewards, i_states, i_actions, steps_done = generate_episode(
            env, model, device, steps_done, episode_rewards)

        # Update model
        optimize_model(device, model, optimizer, i_rewards, i_actions,
                       i_states)

        # Save model every couple episodes
        if (i_episode + 1) % SAVE_EPI == 0:
            path = os.path.join(model_dir, fn.format(episode_rewards[-1]))
            torch.save(model.state_dict(), path)

    print('Complete')
    np.save('./rewards_policy_grad.npy', episode_rewards)

    env.close()
    plt.ioff()
    plt.show()
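
optimize_model is not part of this excerpt. Given the policy_grad directory name and the (pi, value) output used in the test example above, a REINFORCE-with-baseline update is a plausible shape for it; the following is only a sketch under those assumptions:

import torch
import torch.nn.functional as F

GAMMA = 0.99  # assumed discount factor

def optimize_model(device, model, optimizer, rewards, actions, states):
    # Discounted returns, computed backwards over the episode, then normalized.
    returns, g = [], 0.0
    for r in reversed(rewards):
        g = r + GAMMA * g
        returns.insert(0, g)
    returns = torch.tensor(returns, dtype=torch.float, device=device)
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)

    states = torch.cat(states).to(device)    # assumes each state is a [1, C, H, W] tensor
    actions = torch.tensor(actions, device=device)

    pi, values = model(states)               # assumes the model returns (action logits, state values)
    log_probs = F.log_softmax(pi, dim=1).gather(1, actions.unsqueeze(1)).squeeze(1)
    advantage = returns - values.squeeze(-1).detach()

    policy_loss = -(log_probs * advantage).mean()
    value_loss = F.mse_loss(values.squeeze(-1), returns)

    optimizer.zero_grad()
    (policy_loss + value_loss).backward()
    optimizer.step()
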
Example #9
def main():
    net = Baseline(num_classes=culane.num_classes,
                   deep_base=args['deep_base']).cuda().train()
    net = DataParallelWithCallback(net)

    optimizer = optim.SGD([
        {
            'params': [param for name, param in net.named_parameters()
                       if name.endswith('bias')],
            'lr': 2 * args['base_lr']
        },
        {
            'params': [param for name, param in net.named_parameters()
                       if not name.endswith('bias')],
            'lr': args['base_lr']
        },
    ], momentum=args['momentum'])

    if len(args['checkpoint']) > 0:
        print('training resumes from \'%s\'' % args['checkpoint'])
        net.load_state_dict(
            torch.load(
                os.path.join(ckpt_path, exp_name,
                             args['checkpoint'] + '_checkpoint.pth')))
        optimizer.load_state_dict(
            torch.load(
                os.path.join(ckpt_path, exp_name,
                             args['checkpoint'] + '_checkpoint_optim.pth')))
        optimizer.param_groups[0]['lr'] = 2 * args['base_lr']
        optimizer.param_groups[1]['lr'] = args['base_lr']

    check_mkdir(os.path.join(ckpt_path, exp_name))
    open(log_path, 'w').write(str(args) + '\n\n')

    train(net, optimizer)
Example #10
def train(args):
    start_time = time.time()
    device = torch.device('cuda' if args.cuda else 'cpu')

    pprint(args.__dict__)
    interface = FileInterface(**args.__dict__)
    piqa_model = Baseline(**args.__dict__).to(device)

    loss_model = Loss().to(device)
    optimizer = torch.optim.Adam(p for p in piqa_model.parameters()
                                 if p.requires_grad)

    batch_size = args.batch_size
    char_vocab_size = args.char_vocab_size
    glove_vocab_size = args.glove_vocab_size
    word_vocab_size = args.word_vocab_size
    glove_size = args.glove_size
    elmo = args.elmo
    draft = args.draft

    def preprocess(interface_):
        # get data
        print('Loading train and dev data')
        train_examples = load_squad(interface_.train_path, draft=draft)
        dev_examples = load_squad(interface_.test_path, draft=draft)

        # iff creating processor
        print('Loading GloVe')
        glove_words, glove_emb_mat = load_glove(
            glove_size,
            vocab_size=args.glove_vocab_size - 2,
            glove_dir=interface_.glove_dir,
            draft=draft)

        print('Constructing processor')
        processor = SquadProcessor(char_vocab_size,
                                   glove_vocab_size,
                                   word_vocab_size,
                                   elmo=elmo)
        processor.construct(train_examples, glove_words)

        # data loader
        print('Preprocessing datasets')
        train_dataset = tuple(
            processor.preprocess(example) for example in train_examples)
        dev_dataset = tuple(
            processor.preprocess(example) for example in dev_examples)

        print('Creating data loaders')
        train_sampler = SquadSampler(train_dataset,
                                     max_context_size=256,
                                     max_question_size=32,
                                     bucket=True,
                                     shuffle=True)
        train_loader = DataLoader(train_dataset,
                                  batch_size=batch_size,
                                  collate_fn=processor.collate,
                                  sampler=train_sampler)

        dev_sampler = SquadSampler(dev_dataset, bucket=True)
        dev_loader = DataLoader(dev_dataset,
                                batch_size=batch_size,
                                collate_fn=processor.collate,
                                sampler=dev_sampler)

        if args.preload:
            train_loader = tuple(train_loader)
            dev_loader = tuple(dev_loader)

        out = {
            'glove_emb_mat': glove_emb_mat,
            'processor': processor,
            'train_dataset': train_dataset,
            'dev_dataset': dev_dataset,
            'train_loader': train_loader,
            'dev_loader': dev_loader
        }

        return out

    out = interface.cache(
        preprocess,
        interface_=interface) if args.cache else preprocess(interface)
    glove_emb_mat = out['glove_emb_mat']
    processor = out['processor']
    train_dataset = out['train_dataset']
    dev_dataset = out['dev_dataset']
    train_loader = out['train_loader']
    dev_loader = out['dev_loader']

    print("Initializing model weights")
    piqa_model.load_glove(torch.tensor(glove_emb_mat))

    bind_model(interface, processor, piqa_model, optimizer=optimizer)

    step = 0
    best_report = None

    print('Training')
    piqa_model.train()
    for epoch_idx in range(args.epochs):
        for i, train_batch in enumerate(train_loader):
            train_batch = {
                key: val.to(device)
                for key, val in train_batch.items()
            }
            model_output = piqa_model(step=step, **train_batch)
            train_results = processor.postprocess_batch(
                train_dataset, train_batch, model_output)
            train_loss = loss_model(step=step, **model_output, **train_batch)
            train_f1 = float(
                np.mean([result['f1'] for result in train_results]))
            train_em = float(
                np.mean([result['em'] for result in train_results]))

            # optimize
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()
            step += 1

            # report & eval & save
            if step % args.report_period == 1:
                report = OrderedDict(step=step,
                                     train_loss=train_loss.item(),
                                     train_f1=train_f1,
                                     train_em=train_em,
                                     time=time.time() - start_time)
                interface.report(**report)
                print(', '.join('%s=%.5r' % (s, r) for s, r in report.items()))

            if step % args.eval_save_period == 1:
                with torch.no_grad():
                    piqa_model.eval()
                    loss_model.eval()
                    pred = {}
                    dev_losses, dev_results = [], []
                    for dev_batch, _ in zip(dev_loader,
                                            range(args.eval_steps)):
                        dev_batch = {
                            key: val.to(device)
                            for key, val in dev_batch.items()
                        }
                        model_output = piqa_model(**dev_batch)
                        results = processor.postprocess_batch(
                            dev_dataset, dev_batch, model_output)

                        dev_loss = loss_model(step=step,
                                              **dev_batch,
                                              **model_output)

                        for result in results:
                            pred[result['id']] = result['pred']
                        dev_results.extend(results)
                        dev_losses.append(dev_loss.item())

                    dev_loss = float(np.mean(dev_losses))
                    dev_f1 = float(
                        np.mean([result['f1'] for result in dev_results]))
                    dev_em = float(
                        np.mean([result['em'] for result in dev_results]))

                    report = OrderedDict(step=step,
                                         dev_loss=dev_loss,
                                         dev_f1=dev_f1,
                                         dev_em=dev_em,
                                         time=time.time() - start_time)
                    summary = False
                    if best_report is None or report['dev_f1'] > best_report[
                            'dev_f1']:
                        best_report = report
                        summary = True
                        interface.save(iteration=step)
                        interface.pred(pred)
                    interface.report(summary=summary, **report)
                    print(
                        ', '.join('%s=%.5r' % (s, r)
                                  for s, r in report.items()),
                        '(dev_f1_best=%.5r @%d)' %
                        (best_report['dev_f1'], best_report['step']))
                    piqa_model.train()
                    loss_model.train()

            if step == args.train_steps:
                break
        if step == args.train_steps:
            break
Example #11
    def infer(test_image_data_path, test_meta_data_path):
        # DONOTCHANGE This Line
        test_meta_data = pd.read_csv(test_meta_data_path,
                                     delimiter=',',
                                     header=0)

        device = 0

        models = args.models.split(",")
        model_weights = [float(w) for w in args.model_weights.split(",")]
        nsml_sessionss = args.nsml_sessionss.split(",")
        nsml_checkpoints = args.nsml_checkpoints.split(",")
        loss_types = args.loss_types.split(",")

        transform_random_crop = args.transform_random_crop.split(",")
        transform_random_sized_crop = args.transform_random_sized_crop.split(
            ",")
        transform_norm = args.transform_norm.split(",")
        infer_transform_center_crop = args.infer_transform_center_crop.split(
            ",")

        total_output_probs = None
        for i, model_name in enumerate(models):
            batch_size = batch_size_map[model_name] // 2

            infer_transform_list = []

            if infer_transform_center_crop[i] == "True":
                infer_transform_list.append(transforms.Resize((248, 248)))
                infer_transform_list.append(
                    transforms.CenterCrop((args.input_size, args.input_size)))
                infer_transform_list.append(transforms.ToTensor())
                if transform_norm[i] == "True":
                    infer_transform_list.append(
                        transforms.Normalize(
                            [0.44097832, 0.44847423, 0.42528335],
                            [0.25748107, 0.26744914, 0.30532702]))
            else:
                if transform_random_crop[i] == "True":
                    infer_transform_list.append(transforms.Resize((256, 256)))
                    infer_transform_list.append(
                        transforms.CenterCrop(
                            (args.input_size, args.input_size)))
                elif transform_random_sized_crop[i] == "True":
                    infer_transform_list.append(transforms.Resize((256, 256)))
                    infer_transform_list.append(
                        transforms.CenterCrop(
                            (args.input_size, args.input_size)))
                else:
                    infer_transform_list.append(
                        transforms.Resize((args.input_size, args.input_size)))
                infer_transform_list.append(transforms.ToTensor())
                if transform_norm[i] == "True":
                    infer_transform_list.append(
                        transforms.Normalize(
                            [0.44097832, 0.44847423, 0.42528335],
                            [0.25748107, 0.26744914, 0.30532702]))

            print("transform", infer_transform_list)

            dataloader = DataLoader(
                AIRushDataset(
                    test_image_data_path,
                    test_meta_data,
                    label_path=None,
                    transform=transforms.Compose(infer_transform_list)
                ),  #[transforms.Resize((args.input_size, args.input_size)), transforms.ToTensor()])),
                batch_size=batch_size,
                shuffle=False,
                num_workers=0,
                pin_memory=True)

            if model_name == "Resnet18":
                model = Resnet18(args.output_size)
            elif model_name == "Resnet152":
                model = Resnet152(args.output_size)
            elif model_name == "baseline":
                model = Baseline(args.hidden_size, args.output_size)
            elif model_name.split("-")[0] == "efficientnet":
                model = EfficientNet.from_pretrained(model_name,
                                                     args.output_size)
            else:
                raise Exception("model type is invalid : " + model_name)

            model.to(device)

            def load_fn(dir_name):
                save_state_path = os.path.join(dir_name, 'state_dict.pkl')
                state = torch.load(save_state_path)
                model.load_state_dict(state['model'])
                print("model loaded", dir_name)

            model.eval()

            nsml.load(checkpoint=nsml_checkpoints[i],
                      load_fn=load_fn,
                      session="team_13/airush1/" + nsml_sessionss[i])

            output_probs = None
            for batch_idx, image in enumerate(dataloader):
                image = image.to(device)
                output = model(image).double()

                if loss_types[i] == "cross_entropy":
                    output_prob = F.softmax(output, dim=1)
                else:
                    output_prob = torch.sigmoid(output)

                if output_probs is None:
                    output_probs = to_np(output_prob)
                else:
                    output_probs = np.concatenate(
                        [output_probs, to_np(output_prob)], axis=0)
            if total_output_probs is None:
                total_output_probs = output_probs * model_weights[i]
            else:
                total_output_probs += (output_probs * model_weights[i])

        predict = np.argmax(total_output_probs, axis=1)

        return predict  # the return value should be a numpy array of shape (138343,)
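
to_np is assumed to be a small tensor-to-NumPy helper along these lines:

def to_np(tensor):
    # Detach from the graph, move to CPU, and convert to a NumPy array.
    return tensor.detach().cpu().numpy()
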
Example #12
def main_test():
    print('Running test...')
    torch.multiprocessing.set_sharing_strategy('file_system')
    model = Baseline()
    if args.use_swa:
        model = torch.optim.swa_utils.AveragedModel(model)
    model = torch.nn.DataParallel(model).cuda()
    # ckpt structure {epoch, state_dict, optimizer, best_corr}
    if args.resume and os.path.isfile(args.resume):
        print('Load checkpoint:', args.resume)
        ckpt = torch.load(args.resume)
        args.start_epoch = ckpt['epoch']
        best_corr = ckpt['best_corr']
        model.load_state_dict(ckpt['state_dict'])
        print('Loaded ckpt at epoch:', args.start_epoch)
    else:
        print('No model given. Abort!')
        exit(1)

    test_loader = torch.utils.data.DataLoader(
        dataset=EEV_Dataset(
            csv_path=None,
            vidmap_path=args.test_vidmap,
            image_feat_path=args.image_features,
            audio_feat_path=args.audio_features,
            mode='test',
            test_freq=args.test_freq
        ),
        batch_size=None, shuffle=False,
        num_workers=args.workers, pin_memory=False
    )

    model.eval()
    batch_time = AverageMeter()

    t_start = time.time()

    outputs = []
    with torch.no_grad():
        for i, (img_feat, au_feat, frame_count, vid) in enumerate(test_loader):
            img_feat = torch.stack(img_feat).cuda()
            au_feat = torch.stack(au_feat).cuda()
            assert len(au_feat.size()) == 3, 'bad auf %s' % (vid)
            output = model(img_feat, au_feat) # [Clip S 15]
            # rearrange and remove extra padding in the end
            output = rearrange(output, 'Clip S C -> (Clip S) C')
            output = torch.cat([output, output[-1:]]) # repeat the last frame to avoid missing frames at the end
            if args.train_freq < args.test_freq:
                # print('interpolating:', output.size()[0], frame_count)
                output = interpolate_output(output, args.train_freq, 6)
            # print('Interpolated:', output.size()[0], frame_count)
            # truncate extra frames
            assert output.size(0) >= frame_count, '{}/{}'.format(output.size(0), frame_count)
            output = output[:frame_count]
            outputs.append((vid, frame_count, output.cpu().detach().numpy()))

            # update statistics
            batch_time.update(time.time() - t_start)
            t_start = time.time()

            if i % args.print_freq == 0:
                output = ('Test: [{0}/{1}]\t'
                          'Time: {batch_time.val:.3f} ({batch_time.avg:.3f})'.format(
                    i, len(test_loader), batch_time=batch_time))
                print(output)
    
    time_stamps = [0, 166666, 333333, 500000, 666666, 833333]
    time_step = 1000000 # time starts at 0
    header = 'Video ID,Timestamp (milliseconds),amusement,anger,awe,concentration,confusion,contempt,contentment,disappointment,doubt,elation,interest,pain,sadness,surprise,triumph\n'
   
    final_res = {}
    for vid, frame_count, out in outputs:# videos
        video_time = frame_count // 6 + 1
        # print('video', vid, video_time)
        entry_count = 0
        for t in range(video_time): # seconds
            for i in range(6): # frames
                timestamp = time_step * t + time_stamps[i]
                fcc = t * 6 + i
                if fcc >= frame_count:
                    continue
                # print('Frame count', frame_count)
                frame_output = out[fcc]
                frame_output = [str(x) for x in frame_output]
                temp = '{vid},{timestamp},'.format(vid=vid,timestamp=timestamp) + ','.join(frame_output) + '\n'
                # file.write(temp)
                if vid in final_res:
                    final_res[vid].append(temp)
                else:
                    final_res[vid] = [temp]
                entry_count += 1
        assert entry_count == frame_count
    # fixed for now
    missing = [('WKXrnB7alT8', 2919), ('o0ooW14pIa4', 3733), ('GufMoL_MuNE', 2038),
               ('Uee0Tv1rTz8', 1316), ('ScvvOWtb04Q', 152), ('R9kJlLungmo', 3609),
               ('QMW3GuohzzE', 822), ('fjJYTW2n6rk', 4108), ('rbTIMt0VcLw', 1084),
               ('L9cdaj74kLo', 3678), ('l-ka23gU4NA', 1759)]
    for vid, length in missing:
        video_time = length // 6 + 1
        # print('video', vid, video_time)
        for t in range(video_time): # seconds
            for i in range(6): # frames
                timestamp = time_step * t + time_stamps[i]
                fcc = t * 6 + i
                if fcc >= length:
                    continue
                frame_output = ',0'*15
                temp = '{vid},{timestamp}'.format(vid=vid, timestamp=timestamp) + frame_output + '\n'
                # file.write(temp)
                if vid in final_res:
                    final_res[vid].append(temp)
                else:
                    final_res[vid] = [temp]
    print('Write test outputs...')
    with open('test_output.csv', 'w') as file:
        file.write(header)
        temp_vidmap = [x.strip().split(' ') for x in open(args.test_vidmap)]
        temp_vidmap = [x[0] for x in temp_vidmap]
        for vid in tqdm(temp_vidmap):
            for entry in final_res[vid]:
                file.write(entry)
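
interpolate_output is not shown in this excerpt. Since predictions are produced at train_freq Hz and written out at 6 Hz, a linear-resampling sketch under that assumption could look like:

import torch.nn.functional as F

def interpolate_output(output, src_freq, dst_freq):
    # output: [T, C] predictions at src_freq Hz; return predictions resampled to dst_freq Hz.
    t, c = output.size()
    new_t = int(round(t * dst_freq / src_freq))
    # F.interpolate expects [N, C, T], so transpose, resample along time, transpose back.
    resampled = F.interpolate(output.t().unsqueeze(0), size=new_t,
                              mode='linear', align_corners=True)
    return resampled.squeeze(0).t()
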
Example #13
# model_name = 'se_resnet50'
# model_path = 'se_resnet50-ce0d4300.pth'

model_name = 'AlexNet'
model_path = 'model_mobilefacenet.pth'

# model_name = 'MiniXception'
# model_path = ' '

# model_name = 'ConvNet'
# model_path = ' '

# model_name = 'MixNet'
# model_path = ' '

model = Baseline(model='train', model_name=model_name, model_path=model_path)
#model.load_param('models/model_1_180000.pth')
model = model.cuda()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
#exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

# kd_id = 0
# kd_num = 7
# batch_size = 48
# instance_num = 1
train_data, val_data, trains, vals = make_dataloader(kd_id, kd_num)
train_loader = DataLoader(dataset=train_data,
                          batch_size=batch_size,
                          sampler=RandomSampler(trains, batch_size,
                                                instance_num),
Example #14
    '2_3', '2_4', '2_5', '2_6', '2_7', '3_1', '3_2', '3_3', '3_4', '3_5',
    '3_6', '3_7', '4_1', '4_2', '4_3', '4_4', '4_5', '4_6', '4_7', '5_1',
    '5_2', '5_3', '5_4', '5_5', '5_6', '5_7', '6_1', '6_2', '6_3', '6_4',
    '6_5', '6_6', '6_7', '7_1', '7_2', '7_3', '7_4', '7_5', '7_6', '7_7'
]

transform_test = T.Compose([
    T.Resize([224, 224]),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

model_name = 'AlexNet'
model_path = './models/1_AlexNet_0.18444_14.pth'

model = Baseline(model='test', model_name=model_name)
model.load_param(model_path)
model = model.cuda()
model = model.eval()

records = open('./faces_224/anns/val_ld.txt').read().strip().split('\n')

result_file = open("predictions.txt", 'w')
with torch.no_grad():
    for rec in records:
        rec = rec.strip('\n').split()
        img_path = rec[0]

        landmark = rec[1:]
        landmark = np.array(list(map(float, landmark)), dtype=np.float32)
        landmark = torch.tensor(landmark, dtype=torch.float32).unsqueeze(0)
Example #15
def main_train():
    global args, best_corr

    args.store_name = '{}'.format(args.model)
    args.store_name = args.store_name + datetime.now().strftime(
        '_%m-%d_%H-%M-%S')
    args.start_epoch = 0

    if not args.val_only:
        check_rootfolders(args)
    if args.model == 'Baseline':
        if args.cls_indices:
            model = Baseline(args.img_feat_size,
                             args.au_feat_size,
                             num_classes=len(args.cls_indices))
        else:
            print('Feature size:', args.img_feat_size, args.au_feat_size)
            model = Baseline(args.img_feat_size, args.au_feat_size)
    elif args.model == 'TCFPN':
        model = TCFPN(layers=[48, 64, 96],
                      in_channels=(128),
                      num_classes=15,
                      kernel_size=11)
    elif args.model == 'BaseAu':
        model = Baseline_Au(args.au_feat_size)
    elif args.model == 'BaseImg':
        model = Baseline_Img(args.img_feat_size)
    elif args.model == 'EmoBase':
        model = EmoBase()

    model = torch.nn.DataParallel(model).cuda()

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.learning_rate,
                                 weight_decay=args.weight_decay)
    # optimizer = torch.optim.AdamW(model.parameters(), lr=args.learning_rate)
    # custom optimizer
    if args.use_sam:
        base_optim = torch.optim.Adam
        optimizer = SAM(model.parameters(), base_optim, lr=args.learning_rate)
    # custom lr scheduler
    if args.use_cos_wr:
        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
            optimizer, T_0=args.cos_wr_t0, T_mult=args.cos_wr_t_mult)
    elif args.use_cos:
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, args.cos_t_max)
    elif args.use_multistep:
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, args.step_milestones, args.step_decay)
    # SWA
    if args.use_swa:
        swa_model = torch.optim.swa_utils.AveragedModel(model)
        swa_scheduler = torch.optim.swa_utils.SWALR(optimizer,
                                                    swa_lr=args.learning_rate)

    # ckpt structure {epoch, state_dict, optimizer, best_corr}
    if args.resume and os.path.isfile(args.resume):
        print('Load checkpoint:', args.resume)
        ckpt = torch.load(args.resume)
        args.start_epoch = ckpt['epoch']
        best_corr = ckpt['best_corr']
        model.load_state_dict(ckpt['state_dict'])
        optimizer.load_state_dict(ckpt['optimizer'])
        print('Loaded ckpt at epoch:', args.start_epoch)

    # initialize datasets
    train_loader = torch.utils.data.DataLoader(dataset=EEV_Dataset(
        csv_path=args.train_csv,
        vidmap_path=args.train_vidmap,
        image_feat_path=args.image_features,
        audio_feat_path=args.audio_features,
        mode='train',
        lpfilter=args.lp_filter,
        train_freq=args.train_freq,
        val_freq=args.val_freq,
        cls_indices=args.cls_indices),
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=False,
                                               drop_last=True)

    val_loader = torch.utils.data.DataLoader(dataset=EEV_Dataset(
        csv_path=args.val_csv,
        vidmap_path=args.val_vidmap,
        image_feat_path=args.image_features,
        audio_feat_path=args.audio_features,
        mode='val',
        train_freq=args.train_freq,
        val_freq=args.val_freq,
        cls_indices=args.cls_indices,
        repeat_sample=args.repeat_sample),
                                             batch_size=None,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=False)

    accuracy = correlation

    if args.val_only:
        print('Run validation ...')
        print('start epoch:', args.start_epoch, 'model:', args.resume)
        validate(val_loader, model, accuracy, args.start_epoch, None, None)
        return

    log_training = open(
        os.path.join(args.root_log, args.store_name, 'log.csv'), 'w')
    with open(os.path.join(args.root_log, args.store_name, 'args.txt'),
              'w') as f:
        f.write(str(args))

    tb_writer = SummaryWriter(
        log_dir=os.path.join(args.root_log, args.store_name))
    for epoch in range(args.start_epoch, args.epochs):
        train(train_loader, model, optimizer, epoch, log_training, tb_writer)
        # do lr scheduling after epoch
        if args.use_swa and epoch >= args.swa_start:
            print('swa stepping...')
            swa_model.update_parameters(model)
            swa_scheduler.step()
        elif args.use_cos_wr or args.use_cos or args.use_multistep:
            scheduler.step()

        if (epoch + 1) > 2 and ((epoch + 1) % args.eval_freq == 0 or
                                (epoch + 1) == args.epochs):
            # validate
            if args.use_swa and epoch >= args.swa_start:
                # validate use swa model
                corr = validate(val_loader, swa_model, accuracy, epoch,
                                log_training, tb_writer)
            else:
                corr = validate(val_loader, model, accuracy, epoch,
                                log_training, tb_writer)
            is_best = corr > best_corr
            best_corr = max(corr, best_corr)
            tb_writer.add_scalar('acc/validate_corr_best', best_corr, epoch)
            output_best = 'Best corr: %.4f\n' % (best_corr)
            print(output_best)
            log_training.write(output_best + '\n')
            log_training.flush()

            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'best_corr': best_corr,
                }, is_best)
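
The accuracy = correlation line refers to a metric that is not defined in these excerpts. The EEV task scores predictions with a per-class Pearson correlation, so a sketch of the helper (an assumption about its exact shape) might be:

import numpy as np

def correlation(pred, target):
    # pred, target: [T, C] arrays; mean Pearson correlation over the C emotion classes.
    corrs = []
    for c in range(pred.shape[1]):
        p, t = pred[:, c], target[:, c]
        if p.std() < 1e-8 or t.std() < 1e-8:
            corrs.append(0.0)  # constant signals have undefined correlation
        else:
            corrs.append(np.corrcoef(p, t)[0, 1])
    return float(np.mean(corrs))
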
Example #16
    SEED = 2019
    seed_everything(SEED)

    device = args.device
    use_gpu = cuda.is_available()

    if use_gpu:
        print("enable gpu use")
    else:
        print("enable cpu for debugging")

    target_size = (args.input_size, args.input_size)

    if args.model == 'base':
        assert args.input_size == 128
        model = Baseline(args.hidden_size, args.num_classes)
    elif args.model == 'resnet18':
        model = Resnet18(args.num_classes, dropout=False)
    elif args.model == 'resnet50':
        model = Resnet50(args.num_classes, dropout=False)
    elif args.model == 'efficient':
        model = EfficientNet.from_pretrained('efficientnet-b0')
        in_features = model._fc.in_features
        model._fc = nn.Linear(in_features, args.num_classes)
    elif args.model == 'densenet201':
        model = models.densenet201(pretrained=True)
        model.classifier = nn.Linear(1920, args.num_classes)
    elif args.model == 'resnext50':
        model = Resnext50(args.num_classes, dropout=False)
    elif args.model == 'resnext101':
        model = Resnext101(args.num_classes, dropout=False)
Example #17
def prepare(args):
    resume_from_checkpoint = args.resume_from_checkpoint

    prepare_start_time = time.time()
    logger.info('global', 'Start preparing.')
    check_config_dir()
    logger.info('setting', config_info(), time_report=False)

    model = Baseline(num_classes=Config.nr_class)
    logger.info('setting', model_summary(model), time_report=False)
    logger.info('setting', str(model), time_report=False)

    train_transforms = transforms.Compose([
        transforms.Resize(Config.input_shape),
        transforms.RandomApply([
            transforms.ColorJitter(
                brightness=0.3, contrast=0.3, saturation=0.3, hue=0)
        ],
                               p=0.5),
        transforms.RandomHorizontalFlip(),
        transforms.Pad(10),
        transforms.RandomCrop(Config.input_shape),
        transforms.ToTensor(),
        transforms.RandomErasing(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    test_transforms = transforms.Compose([
        transforms.Resize(Config.input_shape),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    trainset = Veri776_train(transforms=train_transforms, need_attr=True)
    testset = Veri776_test(transforms=test_transforms, need_attr=True)

    pksampler = PKSampler(trainset, p=Config.P, k=Config.K)
    train_loader = torch.utils.data.DataLoader(trainset,
                                               batch_size=Config.batch_size,
                                               sampler=pksampler,
                                               num_workers=Config.nr_worker,
                                               pin_memory=True)
    test_loader = torch.utils.data.DataLoader(
        testset,
        batch_size=Config.batch_size,
        sampler=torch.utils.data.SequentialSampler(testset),
        num_workers=Config.nr_worker,
        pin_memory=True)

    weight_decay_setting = parm_list_with_Wdecay(model)
    optimizer = torch.optim.Adam(weight_decay_setting, lr=Config.lr)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,
                                                  lr_lambda=lr_multi_func)

    losses = {}
    losses['cross_entropy_loss'] = torch.nn.CrossEntropyLoss()
    losses['type_ce_loss'] = torch.nn.CrossEntropyLoss()
    losses['color_ce_loss'] = torch.nn.CrossEntropyLoss()
    losses['triplet_hard_loss'] = triplet_hard_loss(
        margin=Config.triplet_margin)

    for k in losses.keys():
        losses[k] = losses[k].cuda()

    start_epoch = 0
    if resume_from_checkpoint and os.path.exists(Config.checkpoint_path):
        checkpoint = load_checkpoint()
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])

    # continue training from the epoch after the checkpoint, or simply start from 1
    start_epoch += 1

    ret = {
        'start_epoch': start_epoch,
        'model': model,
        'train_loader': train_loader,
        'test_loader': test_loader,
        'optimizer': optimizer,
        'scheduler': scheduler,
        'losses': losses
    }

    prepare_end_time = time.time()
    time_spent = sec2min_sec(prepare_start_time, prepare_end_time)
    logger.info(
        'global', 'Finish preparing, time spend: {}mins {}s.'.format(
            time_spent[0], time_spent[1]))

    return ret
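
triplet_hard_loss is instantiated with a margin above but not defined here. A batch-hard triplet loss in the style commonly used for re-identification (a generic sketch, not the project's exact class) is:

import torch
import torch.nn.functional as F

def batch_hard_triplet_loss(embeddings, labels, margin=0.3):
    # embeddings: [N, D], labels: [N]; pick the hardest positive and negative per anchor.
    dist = torch.cdist(embeddings, embeddings, p=2)
    same = labels.unsqueeze(0) == labels.unsqueeze(1)

    hardest_pos = (dist * same.float()).max(dim=1).values                  # farthest same-identity sample
    hardest_neg = dist.masked_fill(same, float('inf')).min(dim=1).values   # closest other-identity sample

    return F.relu(hardest_pos - hardest_neg + margin).mean()
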
Example #18
def embed(args):
    device = torch.device('cuda' if args.cuda else 'cpu')

    pprint(args.__dict__)
    interface = FileInterface(**args.__dict__)
    piqa_model = Baseline(**args.__dict__).to(device)

    processor = SquadProcessor(args.char_vocab_size,
                               args.glove_vocab_size,
                               args.word_vocab_size,
                               elmo=args.elmo)

    bind_model(interface, processor, piqa_model)
    interface.load(args.iteration, session=args.load_dir)

    test_examples = load_squad(interface.test_path, draft=args.draft)
    test_dataset = tuple(
        processor.preprocess(example) for example in test_examples)

    test_sampler = SquadSampler(test_dataset, bucket=True)
    test_loader = DataLoader(test_dataset,
                             batch_size=args.batch_size,
                             sampler=test_sampler,
                             collate_fn=processor.collate)

    print('Saving embeddings')
    with torch.no_grad():
        piqa_model.eval()
        for batch_idx, (test_batch, _) in enumerate(
                zip(test_loader, range(args.eval_steps))):
            test_batch = {
                key: val.to(device)
                for key, val in test_batch.items()
            }

            if args.mode == 'embed' or args.mode == 'embed_context':

                context_output = piqa_model.get_context(**test_batch)
                context_results = processor.postprocess_context_batch(
                    test_dataset,
                    test_batch,
                    context_output,
                    emb_type=args.emb_type)

                for id_, phrases, matrix in context_results:
                    interface.context_emb(id_,
                                          phrases,
                                          matrix,
                                          emb_type=args.emb_type)

            if args.mode == 'embed' or args.mode == 'embed_question':

                question_output = piqa_model.get_question(**test_batch)
                question_results = processor.postprocess_question_batch(
                    test_dataset,
                    test_batch,
                    question_output,
                    emb_type=args.emb_type)

                for id_, emb in question_results:
                    interface.question_emb(id_, emb, emb_type=args.emb_type)

            print('[%d/%d]' % (batch_idx + 1, len(test_loader)))
Example #19
def main_train(config, checkpoint_dir=None):
    global args, best_corr
    best_corr = 0.0

    args.store_name = '{}'.format(args.model)
    args.store_name = args.store_name + datetime.now().strftime('_%m-%d_%H-%M-%S')
    args.start_epoch = 0

    # check_rootfolders(args)
    if args.model == 'Baseline':
        model = Baseline()
    elif args.model == 'TCFPN':
        model = TCFPN(layers=[48, 64, 96], in_channels=(2048 + 128), num_classes=15, kernel_size=11)
    
    model = torch.nn.DataParallel(model).cuda()

    if config['optimizer'] == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    elif config['optimizer'] == 'adamw':
        optimizer = torch.optim.AdamW(model.parameters(), lr=config['lr'])
    
    # custom optimizer
    if args.use_sam:
        base_optim = torch.optim.Adam
        optimizer = SAM(model.parameters(), base_optim, lr=config['lr'])
    # custom lr scheduler
    if args.use_cos_wr:
        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=args.cos_wr_t0,T_mult=args.cos_wr_t_mult)
    elif args.use_cos:
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.cos_t_max)
    # SWA
    if args.use_swa:
        swa_model = torch.optim.swa_utils.AveragedModel(model)
        swa_scheduler = torch.optim.swa_utils.SWALR(optimizer, swa_lr=config['lr'])

    # ckpt structure {epoch, state_dict, optimizer, best_corr}
    # if args.resume and os.path.isfile(args.resume):
    #     print('Load checkpoint:', args.resume)
    #     ckpt = torch.load(args.resume)
    #     args.start_epoch = ckpt['epoch']
    #     best_corr = ckpt['best_corr']
    #     model.load_state_dict(ckpt['state_dict'])
    #     optimizer.load_state_dict(ckpt['optimizer'])
    #     print('Loaded ckpt at epoch:', args.start_epoch)
    if checkpoint_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"))
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)


    # initialize datasets
    train_loader = torch.utils.data.DataLoader(
        dataset=EEV_Dataset(
            csv_path=args.train_csv,
            vidmap_path=args.train_vidmap,
            image_feat_path=args.image_features,
            audio_feat_path=args.audio_features,
            mode='train', lpfilter=args.lp_filter
        ),
        batch_size=config['batch_size'], shuffle=True,
        num_workers=args.workers, pin_memory=False,
        drop_last=True
    )

    val_loader = torch.utils.data.DataLoader(
        dataset=EEV_Dataset(
            csv_path=args.val_csv,
            vidmap_path=args.val_vidmap,
            image_feat_path=args.image_features,
            audio_feat_path=args.audio_features,
            mode='val'
        ),
        batch_size=None, shuffle=False,
        num_workers=args.workers, pin_memory=False
    )

    accuracy = correlation
    # with open(os.path.join(args.root_log, args.store_name, 'args.txt'), 'w') as f:
    #     f.write(str(args))
    
    # tb_writer = SummaryWriter(log_dir=os.path.join(args.root_log, args.store_name))

    for epoch in range(args.start_epoch, args.epochs):
        # train
        train(train_loader, model, optimizer, epoch, None, None)
        # do lr scheduling after epoch
        if args.use_swa and epoch >= args.swa_start:
            print('swa stepping...')
            swa_model.update_parameters(model)
            swa_scheduler.step()
        elif args.use_cos_wr:
            print('cos warm restart (T0:{} Tm:{}) stepping...'.format(args.cos_wr_t0, args.cos_wr_t_mult))
            scheduler.step()
        elif args.use_cos:
            print('cos (Tmax:{}) stepping...'.format(args.cos_t_max))
            scheduler.step()
        
        # validate
        if args.use_swa and epoch >= args.swa_start:
            # validate use swa model
            corr, loss = validate(val_loader, swa_model, accuracy, epoch, None, None)
        else:
            corr, loss = validate(val_loader, model, accuracy, epoch, None, None)
        is_best = corr > best_corr
        best_corr = max(corr, best_corr)
        # tb_writer.add_scalar('acc/validate_corr_best', best_corr, epoch)
        # output_best = 'Best corr: %.4f\n' % (best_corr)
        # print(output_best)
        # save_checkpoint({
        #     'epoch': epoch + 1,
        #     'state_dict': model.state_dict(),
        #     'optimizer': optimizer.state_dict(),
        #     'best_corr': best_corr,
        # }, is_best)
        with tune.checkpoint_dir(epoch) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            if is_best:
                path = os.path.join(checkpoint_dir, "checkpoint_best")
            torch.save((model.state_dict(), optimizer.state_dict()), path)
        tune.report(loss=loss, accuracy=corr, best_corr=best_corr)
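
This variant of main_train is written as a Ray Tune trainable (tune.checkpoint_dir, tune.report). A minimal launch sketch with an illustrative search space (the ranges and resource settings are assumptions):

from ray import tune

if __name__ == '__main__':
    analysis = tune.run(
        main_train,
        config={
            'optimizer': tune.choice(['adam', 'adamw']),
            'lr': tune.loguniform(1e-5, 1e-3),
            'batch_size': tune.choice([32, 64, 128]),
        },
        num_samples=8,                        # number of sampled configurations
        resources_per_trial={'cpu': 4, 'gpu': 1},
        metric='best_corr',
        mode='max',
    )
    print('Best config:', analysis.best_config)
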
Example #20
    torch.manual_seed(args.seed)
    device = args.device

    if args.resnet:
        assert args.input_size == 224
        #model = Resnet(args.output_size)
        print('!!!!!!!!!!!!!!!!efficientnet load!!!!!!!!!!!!!!!!')
        model_name = 'efficientnet-b0'
        print(model_name)

        model = EfficientNet.from_name(model_name)

        #model = EfficientNet.from_pretrained(model_name, num_classes=350)
        #summary(model,input_size=(3,224,224))
    else:
        model = Baseline(args.hidden_size, args.output_size)
    optimizer = optim.Adam(model.parameters(), args.learning_rate)
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                        patience=1,
                                                        verbose=True)
    criterion = nn.CrossEntropyLoss()  #multi-class classification task

    model = model.to(device)
    model.train()

    # DONOTCHANGE: They are reserved for nsml
    bind_model(model)
    # below the nsml load
    nsml.load(checkpoint='15', session='team_62/airush1/40')
    nsml.save('stillgoing')
Example #21
        args.distributed = args.world_size > 1

    if args.distributed:
        # FOR DISTRIBUTED:  Set the device according to local_rank.
        torch.cuda.set_device(args.local_rank)

        # FOR DISTRIBUTED:  Initialize the backend.  torch.distributed.launch will provide
        # environment variables, and requires that you use init_method=`env://`.
        dist.init_process_group(backend='nccl',
                                init_method='env://')

    # Data loading code
    train_loader = get_dataloader(args, data_path=args.data_dir, data_name='train', batch_size=args.batch_size, num_workers=args.num_workers, distributed=args.distributed)
    val_loader = get_dataloader(args, data_path=args.data_dir, data_name='valid', batch_size=args.batch_size, num_workers=args.num_workers)
    # create model
    if not args.baseline:
        model = CCMModel(args, train_loader.dataset).to(device)
    else:
        model = Baseline(args).to(device)
        criterion = baseline_criterion
    optimizer = optim.Adam(model.parameters(), args.lr)
    if args.distributed:
        model = DDP(model)

    recorder = None
    if args.local_rank == 0:
        writer = SummaryWriter(f'{args.log_dir}/{args.project}_{"b" if args.baseline else "c"}_{args.timestamp}')
        recorder = Recorder(args, writer, train_loader.dataset.idx2word)

    train()