Example #1
    def distribute_run(self):
        strategy = tf.distribute.MirroredStrategy()
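        # MirroredStrategy replicates the model across the visible GPUs, so the global
        # batch size is the per-replica batch size times strategy.num_replicas_in_sync.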
        train_global_batch = self.args.train_batch * strategy.num_replicas_in_sync
        val_global_batch = self.args.val_batch * strategy.num_replicas_in_sync
        train_data, train_batch_num, val_data, val_batch_num = get_datasets(
            name=self.args.dataset,
            train_batch=train_global_batch,
            val_batch=val_global_batch)
        with strategy.scope():
            model = get_net(arch=self.args.arch,
                            num_layers=self.args.num_layers,
                            num_experts=self.args.num_experts,
                            num_classes=self.args.num_classes)
            model.build(input_shape=(None, 32, 32, 3))
            model.summary()

            optimizer = tf.keras.optimizers.SGD(learning_rate=self.args.lr,
                                                momentum=0.9,
                                                decay=0.0001,
                                                nesterov=True)

            dis_trainer = DisTrainer(strategy=strategy,
                                     model=model,
                                     optimizer=optimizer,
                                     epochs=self.args.epochs,
                                     val_data=val_data,
                                     train_batch=self.args.train_batch,
                                     val_batch=self.args.val_batch,
                                     train_data=train_data,
                                     log_dir=self.log_dir,
                                     model_save_path=self.model_save_path,
                                     train_batch_num=train_batch_num,
                                     val_batch_num=val_batch_num)

            dis_trainer(resume=self.args.resume, val=self.args.val)
Example #2
def train_batch(epoch, net, opt, crit, batch_size):
    train_set = get_datasets('/home/fernand/math/data', 'train')
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=6,
                                               collate_fn=collate_data,
                                               drop_last=True)
    pbar = tqdm(iter(train_loader))
    moving_loss = 0.0
    for questions, questions_len, answers, answers_len, answer_mappings in pbar:
        questions, questions_len = questions.to(DEVICE), questions_len.to(
            DEVICE)
        answers, answers_len = answers.to(DEVICE), answers_len.to(DEVICE)
        answer_mappings = answer_mappings.to(DEVICE)
        loss = net.train_batch(questions, questions_len, answers, answers_len,
                               answer_mappings, opt, crit)
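        # Track an exponential moving average of the loss (alpha = 0.05) for the progress bar.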
        if moving_loss == 0.0:
            moving_loss = loss
        else:
            moving_loss = 0.95 * moving_loss + 0.05 * loss
        nn.utils.clip_grad_value_(net.parameters(), 0.1)
        pbar.set_description('Epoch: {}; Loss: {:.5f}'.format(
            epoch + 1, moving_loss))
    for d in train_set.datasets:
        d.close()
Example #3
    def run(self):
        train_data, train_batch_num, val_data, val_batch_num = get_datasets(
            name=self.args.dataset,
            train_batch=self.args.train_batch,
            val_batch=self.args.val_batch)
        model = get_net(arch=self.args.arch,
                        num_layers=self.args.num_layers,
                        num_experts=self.args.num_experts,
                        num_classes=self.args.num_classes)
        model.build(input_shape=(None, 32, 32, 3))
        model.summary()

        optimizer = tf.keras.optimizers.SGD(learning_rate=self.args.lr,
                                            momentum=0.9,
                                            decay=0.0001,
                                            nesterov=True)

        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          epochs=self.args.epochs,
                          val_data=val_data,
                          train_batch=self.args.train_batch,
                          val_batch=self.args.val_batch,
                          train_data=train_data,
                          log_dir=self.log_dir,
                          model_save_path=self.model_save_path,
                          train_batch_num=train_batch_num,
                          val_batch_num=val_batch_num)

        trainer(resume=self.args.resume, val=self.args.val)
Example #4
def main(arch: str,
         image_folder: str,
         from_scratch: bool = False,
         batch_size: Optional[int] = None,
         from_model: Optional[str] = None,
         grad_accu: int = 1,
         num_gpus: int = 1,
         epochs: int = 100,
         lr: float = 4e-4):
    if arch.startswith("BiT"):
        base_model = BIT_MODELS[arch](head_size=-1)
        if not from_scratch and not from_model:
            print("Loading pretrained model...")
            base_model.load_from(np.load(f"cache/pretrained/{arch}.npz"))
        net_final_size = base_model.width_factor * 2048
    else:
        raise ValueError(f"arch '{arch}' not supported")
    train_ds, valid_ds = get_datasets(image_folder, val_ratio=0.05)

    model = SelfSupervisedLearner(base_model,
                                  train_ds,
                                  valid_ds,
                                  epochs,
                                  lr,
                                  num_gpus=num_gpus,
                                  batch_size=batch_size if batch_size else 4,
                                  image_size=IMAGE_SIZE,
                                  projection_size=256,
                                  projection_hidden_size=4096,
                                  net_final_size=net_final_size,
                                  moving_average_decay=0.99)

    trainer = pl.Trainer(
        accelerator='ddp' if num_gpus > 1 else None,
        amp_backend="apex",
        amp_level='O2',
        precision=16,
        gpus=[1],  # num_gpus,
        val_check_interval=0.5,
        # gradient_clip_val=10,
        max_epochs=epochs,
    )

    trainer.fit(model)
Example #5
def main():
    logging.basicConfig(level=logging.INFO)

    in_features = 28 * 28
    hidden_dim = 1024
    out_features = 10
    batch_size = 128
    num_epochs = 10

    mlp = MLP.new(in_features, hidden_dim, out_features)
    # mlp = Linear.new(in_features, hidden_dim)
    rng = mlp.initialize()

    dloss_fn = jax.value_and_grad(loss_fn, has_aux=True)
    dloss_fn = jax.jit(dloss_fn)
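    # With has_aux=True, value_and_grad returns ((loss, aux), grads); jit compiles the
    # combined forward/backward computation once and reuses it for every batch.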

    train_ds, val_ds = get_datasets()
    train_dl = NumpyLoader(
        train_ds,
        batch_size=batch_size,
        shuffle=True,
    )
    val_dl = NumpyLoader(val_ds, batch_size=batch_size)

    for epoch in range(num_epochs):

        for x, y in train_dl:
            x, y = np.array(x), np.array(y)

            (loss, (mlp, acc)), grads = dloss_fn(mlp, x, y)
            # # print('grad:', grad)
            print('train loss:', loss)
            print('train acc:', acc)

            mlp = jax.tree_multimap(sgd, mlp, grads)

        for x, y in val_dl:
            x, y = np.array(x), np.array(y)
            # out = mlp(x)

            loss, (mlp, acc) = loss_fn(mlp, x, y)

            print('val loss:', loss)
            print('val acc:', acc)
Example #6
def train_one(epoch, net, opt, crit):
    train_set = get_datasets('/home/fernand/math/data', 'train')
    train_loader = torch.utils.data.DataLoader(train_set,
                                               shuffle=True,
                                               num_workers=6)
    pbar = tqdm(iter(train_loader))
    moving_loss = 0.0
    for question, answer in pbar:
        question, answer = question.to(DEVICE), answer.to(DEVICE)
        loss = net.train(question, answer, opt, crit)
        if moving_loss == 0.0:
            moving_loss = loss
        else:
            moving_loss = 0.9999 * moving_loss + 0.0001 * loss
        nn.utils.clip_grad_value_(net.parameters(), 0.1)
        pbar.set_description('Epoch: {}; Loss: {:.5f}'.format(
            epoch + 1, moving_loss))
    for d in train_set.datasets:
        d.close()
Example #7
def main():
    datasets = get_datasets()

    min_points = 5
    eps = [20, 17, 11, 4]
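    # One eps value per dataset below; min_points is kept fixed across all of them.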

    for i, dataset in enumerate(datasets):
        # Plot kdist plot to determine EPS param
        kdist_data = get_kdist_data(dataset, min_points)
        plot_data(kdist_data)

        # Get dbscan object
        dbscan = DBSCAN(min_points, eps[i])

        labels = dbscan.fit(dataset)

        print_labels(labels)

        plot_labeled_data(dataset, labels)
Example #8
def main():
    with tf.Graph().as_default():
        test_sets = dataset.get_datasets(main_path, EPIWidth, disp_precision,
                                         'test')

        images_placeholder_v = tf.placeholder(tf.float32,
                                              shape=(None, 9, EPIWidth, 1))
        images_placeholder_u = tf.placeholder(tf.float32,
                                              shape=(None, 9, EPIWidth, 1))
        prop_placeholder = tf.placeholder('float')
        phase_train = tf.placeholder(tf.bool, name='phase_train')

        logits = network.inference_ds(images_placeholder_u,
                                      images_placeholder_v, prop_placeholder,
                                      phase_train, EPIWidth, disp_precision)

        eval_op = network.evaluation(logits)

        saver = tf.train.Saver(tf.global_variables())

        gpu_option = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
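        # Limit this process to roughly half of the GPU memory so the device can be shared.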

        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_option))

        sess.run(tf.global_variables_initializer())

        ckpt = tf.train.get_checkpoint_state(checkpoint_path)
        if ckpt:
            #            saver.restore(sess, checkpoint_path + '/model.ckpt')  # restore weights trained on another platform
            saver.restore(sess, ckpt.model_checkpoint_path)  # restore the locally trained checkpoint
            print("restore from checkpoint!")
        else:
            print("no checkpoint found!")

        print('Training Data Eval:')
        do_eval_true(sess, eval_op, logits, images_placeholder_u,
                     images_placeholder_v, prop_placeholder, phase_train,
                     test_sets)
Example #9
    def __init__(self, args):

        # Training configurations
        self.method = args.method
        self.dataset = args.dataset
        self.dim = args.dim
        self.lr_init = args.lr_init
        self.gamma_m = args.gamma_m
        self.gamma_s = args.gamma_s
        self.batch_size = args.batch_size
        self.val_batch_size = self.batch_size // 2
        self.iteration = args.iteration
        self.evaluation = args.evaluation
        self.show_iter = 1000
        self.update_epoch = args.update_epoch
        self.balanced = args.balanced
        self.instances = args.instances
        self.inter_test = args.intertest
        self.cm = args.cm
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self.n_class = args.batch_size // args.instances
        self.classes = args.classes
        self.pretrained = args.pretrained
        self.model_save_interval = args.model_save_interval


        self.file_name = '{}_{}_{}'.format(
            self.method,
            self.dataset,
            self.iteration,
        )
        print('========================================')
        print(json.dumps(vars(args), indent=2))
        print(self.file_name)

        # Paths

        self.root_dir = os.path.join('/', 'data')
        self.data_dir = os.path.join(self.root_dir, self.dataset)
        self.model_dir = self._get_path('./trained_model')
        self.plot_dir = self._get_path('./plot_model')
        self.code_dir = self._get_path(os.path.join('codes', self.dataset))
        self.fig_dir = self._get_path(os.path.join('fig', self.dataset, self.file_name))

        # Preparing data
        self.transforms = get_transform()
        self.datasets = get_datasets(dataset=self.dataset, data_dir=self.data_dir, transforms=self.transforms)

        self.data_loaders = get_data_loaders(
            datasets=self.datasets,
            batch_size=self.batch_size,
            val_batch_size=self.val_batch_size,
            n_instance=self.instances,
            balanced=self.balanced,
            #cm=self.cm_sampler if self.cm else None
        )
        self.dataset_sizes = {x: len(self.datasets[x]) for x in ['train', 'test']}


        self.mean = (torch.zeros((self.classes, self.classes)).add(1.5) - 1.0 * torch.eye(self.classes)).to(self.device)
        self.std = torch.zeros((self.classes, self.classes)).add(0.15).to(self.device)
        self.last_delta_mean = torch.zeros((self.classes, self.classes)).to(self.device)
        self.last_delta_std = torch.zeros((self.classes, self.classes)).to(self.device)

        self.ndmodel = nd.NDfdml(n_class=self.n_class,
                                 batch_size=self.batch_size,
                                 instances=self.instances,
                                 pretrained=self.pretrained).to(self.device)
        
        
        optimizer_c = optim.SGD(
            [
                {'params': self.ndmodel.googlelayer.parameters()},
                {'params': self.ndmodel.embedding_layer.parameters(), 'lr': self.lr_init * 10, 'momentum': 0.9}
            ],
            lr=self.lr_init, momentum=0.9
        )


        self.scheduler = lr_scheduler.StepLR(optimizer_c, step_size=4000, gamma=0.9)
Example #10
def main(args):
    checks()

    macn = BatchMACN(
        image_shape=[FLAGS.im_h, FLAGS.im_w, FLAGS.ch_i],
        vin_config=VINConfig(k=FLAGS.k, ch_h=FLAGS.ch_h, ch_q=FLAGS.ch_q),
        access_config={
            "memory_size": FLAGS.memory_size, 
            "word_size": FLAGS.word_size, 
            "num_reads": FLAGS.num_read_heads, 
            "num_writes": FLAGS.num_write_heads
        }, 
        controller_config={
            "hidden_size": FLAGS.hidden_size
        }, 
        batch_size=FLAGS.batch_size,
        seq_length=FLAGS.seq_length        
    )

    # y = [batch, labels]
    y = tf.placeholder(tf.int64, shape=[None, None], name='y') # labels : actions {0,1,2,3}

    # Training
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=macn.logits, name='cross_entropy')
    loss = tf.reduce_sum(cross_entropy, name='cross_entropy_mean')
    train_step = tf.train.RMSPropOptimizer(FLAGS.learning_rate, epsilon=1e-6, centered=True).minimize(loss)

    # Reporting
    y_ = tf.argmax(macn.prob_actions, axis=-1) # predicted action
    nb_errors = tf.reduce_sum(tf.to_float(tf.not_equal(y_, y))) # Number of wrongly selected actions

    def train_on_episode_batch(batch_images, batch_labels):
        _, _loss, _nb_err = sess.run([train_step, loss, nb_errors], feed_dict={macn.X : batch_images, y : batch_labels})
        return _loss, _nb_err
        
    def test_on_episode_batch(batch_images, batch_labels):
        return sess.run([loss, nb_errors], feed_dict={macn.X : batch_images, y : batch_labels})

    trainset, testset = get_datasets(FLAGS.dataset, test_percent=0.1)
    
    # Start training 
    saver = tf.train.Saver()
    with tf.Session() as sess:
        if loadfile_exists(FLAGS.load):
            saver.restore(sess, FLAGS.load)
            print("Weights reloaded")
        else:
            sess.run(tf.global_variables_initializer())
       
        print("Start training...")
        for epoch in range(1, FLAGS.epochs + 1):
            start_time = time.time()

            mean_loss, mean_accuracy = compute_on_dataset(sess, trainset, train_on_episode_batch)
            
            print('Epoch: {:3d} ({:.1f} s):'.format(epoch, time.time() - start_time))
            print('\t Train Loss: {:.5f} \t Train accuracy: {:.2f}%'.format(mean_loss, 100*(mean_accuracy)))

            saver.save(sess, FLAGS.save)
        print('Training finished.')
        


        print('Testing...')
        mean_loss, mean_accuracy = compute_on_dataset(sess, testset, test_on_episode_batch)
        print('Test Accuracy: {:.2f}%'.format(100*(mean_accuracy)))
Example #11
def main(
        arch: str, image_folder: str, from_scratch: bool = False,
        batch_size: Optional[int] = None,
        from_model: Optional[str] = None,
        grad_accu: int = 1,
        num_gpus: int = 1, epochs: int = 100, lr: float = 4e-4):
    pl.seed_everything(int(os.environ.get("SEED", 738)))
    if arch.startswith("BiT"):
        base_model = BIT_MODELS[arch](head_size=-1)
        if not from_scratch and not from_model:
            print("Loading pretrained model...")
            base_model.load_from(np.load(f"cache/pretrained/{arch}.npz"))
        net_final_size = base_model.width_factor * 2048
    else:
        raise ValueError(f"arch '{arch}' not supported")
    train_ds, valid_ds = get_datasets(image_folder, val_ratio=0.05)

    model = SelfSupervisedLearner(
        base_model,
        train_ds,
        valid_ds,
        epochs,
        lr,
        num_gpus=num_gpus,
        batch_size=batch_size if batch_size else 4,
        image_size=IMAGE_SIZE,
        projection_size=256,
        projection_hidden_size=4096,
        net_final_size=net_final_size,
        moving_average_decay=0.995,
        use_momentum=True
    )

    if from_model:
        print("loading weights...")
        # Load pretrained-weights
        weights = torch.load(from_model)
        model.learner.online_encoder.projector.load_state_dict(
            weights["online_encoder_proj"])
        model.learner.online_encoder.net.load_state_dict(
            weights["online_encoder_net"])
        model.learner.online_predictor.load_state_dict(
            weights["online_predictor"])
        model.learner.target_encoder.net.load_state_dict(
            weights["target_encoder_net"])
        model.learner.target_encoder.projector.load_state_dict(
            weights["target_encoder_proj"])
        del weights

    trainer = pl.Trainer(
        accelerator='ddp' if num_gpus > 1 else None,
        amp_backend="apex", amp_level='O2',
        precision=16,
        gpus=num_gpus,
        val_check_interval=0.5,
        gradient_clip_val=10,
        max_epochs=epochs,
        callbacks=[
            LearningRateMonitor(logging_interval='step'),
            ModelCheckpoint(
                monitor='val_loss',
                filename='byol-{step:06d}-{val_loss:.4f}',
                save_top_k=2)
        ],
        accumulate_grad_batches=grad_accu,
        auto_scale_batch_size='power' if batch_size is None else None,
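        # Lightning's 'power' mode doubles the batch size until it runs out of memory,
        # then keeps the largest size that fit.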
        # automatic_optimization=False
    )

    if batch_size is None:
        trainer.tune(model)

    trainer.fit(model)

    # model = SelfSupervisedLearner.load_from_checkpoint(
    #     "lightning_logs/version_20/checkpoints/byol-step=001135-val_loss=0.03.ckpt",
    #     net=base_model,
    #     train_dataset=train_ds,
    #     valid_dataset=valid_ds,
    #     epochs=epochs,
    #     learning_rate=lr,
    #     augment_fn=torch.nn.Sequential(
    #         T.RandomResizedCrop((IMAGE_SIZE, IMAGE_SIZE)),
    #         RandomApply(
    #             T.ColorJitter(0.8, 0.8, 0.8, 0.2),
    #             p=0.3
    #         ),
    #         T.RandomGrayscale(p=0.2),
    #         T.RandomHorizontalFlip(),
    #         RandomApply(
    #             T.GaussianBlur((3, 3), (1.0, 2.0)),
    #             p=0.2
    #         ),
    #         T.Normalize(
    #             mean=torch.tensor([0.485, 0.456, 0.406]),
    #             std=torch.tensor([0.229, 0.224, 0.225])
    #         )
    #     ),
    #     num_gpus=num_gpus,
    #     batch_size=batch_size if batch_size else 4,
    #     image_size=IMAGE_SIZE,
    #     hidden_layer=-1,
    #     projection_size=256,
    #     projection_hidden_size=4096,
    #     net_final_size=net_final_size,
    #     moving_average_decay=0.99
    # )
    # trainer = pl.Trainer(
    #     resume_from_checkpoint="lightning_logs/version_20/checkpoints/byol-step=001135-val_loss=0.03.ckpt")

    if num_gpus == 1 or torch.distributed.get_rank() == 0:
        torch.save({
            "online_encoder_proj":
            model.learner.online_encoder.projector.state_dict(),
            "online_encoder_net":
            model.learner.online_encoder.net.state_dict(),
            "online_predictor":
            model.learner.online_predictor.state_dict(),
            "target_encoder_net":
            model.learner.target_encoder.net.state_dict(),
            "target_encoder_proj":
            model.learner.target_encoder.projector.state_dict(),
            "config": {
                "arch": arch
            }
        }, f"cache/byol_{arch}.pth")
        print("Model saved")
Example #12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', type=int, default=42, help='Random seed')

    parser.add_argument('-dd',
                        '--data-dir',
                        type=str,
                        default='data',
                        help='Data directory')

    parser.add_argument('-l',
                        '--loss',
                        type=str,
                        default='label_smooth_cross_entropy')
    parser.add_argument('-t1', '--temper1', type=float, default=0.2)
    parser.add_argument('-t2', '--temper2', type=float, default=4.0)
    parser.add_argument('-optim', '--optimizer', type=str, default='adam')

    parser.add_argument('-prep', '--prep_function', type=str, default='none')

    parser.add_argument('--train_on_different_datasets', action='store_true')
    parser.add_argument('--use-current', action='store_true')
    parser.add_argument('--use-extra', action='store_true')
    parser.add_argument('--use-unlabeled', action='store_true')

    parser.add_argument('--fast', action='store_true')
    parser.add_argument('--mixup', action='store_true')
    parser.add_argument('--balance', action='store_true')
    parser.add_argument('--balance-datasets', action='store_true')

    parser.add_argument('--show', action='store_true')
    parser.add_argument('-v', '--verbose', action='store_true')

    parser.add_argument('-m',
                        '--model',
                        type=str,
                        default='efficientnet-b4',
                        help='')
    parser.add_argument('-b',
                        '--batch-size',
                        type=int,
                        default=8,
                        help='Batch Size during training, e.g. -b 64')
    parser.add_argument('-e',
                        '--epochs',
                        type=int,
                        default=100,
                        help='Epoch to run')
    parser.add_argument('-s',
                        '--sizes',
                        default=380,
                        type=int,
                        help='Image size for training & inference')
    parser.add_argument('-f', '--fold', type=int, default=None)
    parser.add_argument('-t', '--transfer', default=None, type=str, help='')
    parser.add_argument('-lr',
                        '--learning_rate',
                        type=float,
                        default=1e-4,
                        help='Initial learning rate')
    parser.add_argument('-a',
                        '--augmentations',
                        default='medium',
                        type=str,
                        help='')
    parser.add_argument('-accum', '--accum-step', type=int, default=1)
    parser.add_argument('-metric', '--metric', type=str, default='accuracy01')

    args = parser.parse_args()

    diff_dataset_train = args.train_on_different_datasets

    data_dir = args.data_dir
    epochs = args.epochs
    batch_size = args.batch_size
    seed = args.seed

    loss_name = args.loss
    optim_name = args.optimizer

    prep_function = args.prep_function

    model_name = args.model
    size = args.sizes
    image_size = (size, size)
    fast = args.fast
    fold = args.fold
    mixup = args.mixup
    balance = args.balance
    balance_datasets = args.balance_datasets
    show_batches = args.show
    verbose = args.verbose
    use_current = args.use_current
    use_extra = args.use_extra
    use_unlabeled = args.use_unlabeled

    learning_rate = args.learning_rate
    augmentations = args.augmentations
    transfer = args.transfer
    accum_step = args.accum_step

    #cosine_loss    accuracy01
    main_metric = args.metric

    print(data_dir)

    num_classes = 5

    assert use_current or use_extra

    print(fold)

    current_time = datetime.now().strftime('%b%d_%H_%M')
    random_name = get_random_name()

    # if folds is None or len(folds) == 0:
    #     folds = [None]

    torch.cuda.empty_cache()
    checkpoint_prefix = f'{model_name}_{size}_{augmentations}'

    if transfer is not None:
        checkpoint_prefix += '_pretrain_from_' + str(transfer)
    else:
        if use_current:
            checkpoint_prefix += '_current'
        if use_extra:
            checkpoint_prefix += '_extra'
        if use_unlabeled:
            checkpoint_prefix += '_unlabeled'
        if fold is not None:
            checkpoint_prefix += f'_fold{fold}'

    directory_prefix = f'{current_time}_{checkpoint_prefix}'
    log_dir = os.path.join('runs', directory_prefix)
    os.makedirs(log_dir, exist_ok=False)

    set_manual_seed(seed)
    model = get_model(model_name)

    if transfer is not None:
        print("Transfering weights from model checkpoint")
        model.load_state_dict(torch.load(transfer)['model_state_dict'])

    model = model.cuda()

    if diff_dataset_train:
        train_on = ['current_train', 'extra_train']
        valid_on = ['unlabeled']
        train_ds, valid_ds, train_sizes = get_datasets_universal(
            train_on=train_on,
            valid_on=valid_on,
            image_size=image_size,
            augmentation=augmentations,
            target_dtype=int,
            prep_function=prep_function)
    else:
        train_ds, valid_ds, train_sizes = get_datasets(
            data_dir=data_dir,
            use_current=use_current,
            use_extra=use_extra,
            image_size=image_size,
            prep_function=prep_function,
            augmentation=augmentations,
            target_dtype=int,
            fold=fold,
            folds=5)

    train_loader, valid_loader = get_dataloaders(train_ds,
                                                 valid_ds,
                                                 batch_size=batch_size,
                                                 train_sizes=train_sizes,
                                                 num_workers=6,
                                                 balance=balance,
                                                 balance_datasets=balance_datasets,
                                                 balance_unlabeled=False)

    loaders = collections.OrderedDict()
    loaders["train"] = train_loader
    loaders["valid"] = valid_loader

    runner = SupervisedRunner(input_key='image')

    criterions = get_loss(loss_name)
    # criterions_tempered = TemperedLogLoss()
    # optimizer = catalyst.contrib.nn.optimizers.radam.RAdam(model.parameters(), lr = learning_rate)
    optimizer = get_optim(optim_name, model, learning_rate)
    # optimizer = catalyst.contrib.nn.optimizers.Adam(model.parameters(), lr = learning_rate)
    # criterions = nn.CrossEntropyLoss()
    # optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    # scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[25], gamma=0.8)
    # cappa = CappaScoreCallback()

    Q = math.floor(len(train_ds) / batch_size)
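    # T_max is the number of batches per epoch; if the scheduler steps once per batch,
    # the cosine schedule completes one full annealing cycle per epoch.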
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=Q)
    if main_metric != 'accuracy01':
        callbacks = [
            AccuracyCallback(num_classes=num_classes),
            CosineLossCallback(),
            OptimizerCallback(accumulation_steps=accum_step),
            CheckpointCallback(save_n_best=epochs)
        ]
    else:
        callbacks = [
            AccuracyCallback(num_classes=num_classes),
            OptimizerCallback(accumulation_steps=accum_step),
            CheckpointCallback(save_n_best=epochs)
        ]

    # main_metric = 'accuracy01'

    runner.train(
        fp16=True,
        model=model,
        criterion=criterions,
        optimizer=optimizer,
        scheduler=scheduler,
        callbacks=callbacks,
        loaders=loaders,
        logdir=log_dir,
        num_epochs=epochs,
        verbose=verbose,
        main_metric=main_metric,
        minimize_metric=False,
    )
Example #13
import torch
from torch.utils.data import ConcatDataset, DataLoader, RandomSampler

from dataset import get_datasets

# args, get_loss_func and get_metric_func are assumed to be defined earlier in the original script.

loss_func = get_loss_func(args)
metric_func = get_metric_func(args)

device = torch.device(
    "cuda" if args.gpu and torch.cuda.is_available() else "cpu")
kwargs = {'num_workers': 1, 'pin_memory': True} if args.gpu else {}

print('Device:', device)
# print('Task:', args.task)
print('Setting:', args.setting)
if args.spdz: print('Using SPDZ for FedAvg')
print('Local epochs:', args.local_epochs)

trn_party_datasets, val_party_datasets, tst_party_datasets = get_datasets(
    args)  # each party's loader and the combined loader
assert len(trn_party_datasets) == len(val_party_datasets) and len(
    val_party_datasets) == len(tst_party_datasets)
trn_combined_dataset = ConcatDataset(trn_party_datasets)

num_parties = len(trn_party_datasets)
assert num_parties > 1

if args.dp:
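    # args.dp presumably enables the differentially private setting: each party then
    # samples its batches with replacement instead of shuffling.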
    party_samplers = [
        RandomSampler(trnset, replacement=True)
        for trnset in trn_party_datasets
    ]
    combined_sampler = RandomSampler(trn_combined_dataset, replacement=True)
    trn_party_loaders = [DataLoader(trnset, sampler=sampler, batch_size=args.batch_size, shuffle=False, **kwargs) \
                            for trnset, sampler in zip(trn_party_datasets, party_samplers)]
Example #14
def hp_search(seed,
              data,
              model_type,
              mode,
              device,
              batch_size,
              embedding_dim,
              hidden_dim,
              num_layers,
              bidirectional,
              dropout,
              batch_first,
              epochs,
              lr,
              clip_grad,
              max_norm,
              early_stopping_patience,
              train_frac,
              val_frac,
              test_frac,
              subset_size,
              log_interval,
              no_tb,
              w_loss,
              w_sampling):

    # set seed for reproducibility on cpu or gpu based on availability
    torch.manual_seed(seed) if device == 'cpu' else torch.cuda.manual_seed(seed)

    # set starting time of full training pipeline
    start_time = datetime.now()

    # set device
    device = torch.device(device)
    print(f"Device: {device}")

    # data_path = 'data/bnc/bnc_subset_19_29_vs_50_plus_nfiles_0.csv' if data == 'bnc' else 'data/blogs_kaggle/blogtext.csv'

    if data == 'bnc':
        data_path = 'data/bnc/bnc_subset_19_29_vs_50_plus_nfiles_0.csv'
    elif data == 'bnc_rb':
        data_path = 'data/bnc/bnc_subset_19_29_vs_50_plus_nfiles_0_rand_balanced.csv'
    else:
        data_path = 'data/blogs_kaggle/blogtext.csv'

    print("Starting data preprocessing ... ")
    data_prep_start = datetime.now()

    # Load data and create dataset instances
    train_dataset, val_dataset, test_dataset = get_datasets(subset_size=subset_size,
                                                            file_path=data_path,
                                                            train_frac=train_frac,
                                                            val_frac=val_frac,
                                                            test_frac=test_frac,
                                                            seed=seed,
                                                            data=data)

    print('-' * 91)
    print('BASELINES//VALUE COUNTS')
    print('Train')
    print(train_dataset.df['age_cat'].value_counts(normalize=True))
    print('Validation')
    print(val_dataset.df['age_cat'].value_counts(normalize=True))
    print('-' * 91)

    # Train, val, and test splits
    # train_size = int(train_frac * len(dataset))
    # test_size = len(dataset) - train_size
    # train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

    # get vocab size number of classes
    # vocab_size = train_dataset.vocab_size
    # num_classes = train_dataset.num_classes

    # create dataloaders with pre-specified batch size
    # data_loader = DataLoader(dataset=dataset,
    #                          batch_size=batch_size,
    #                          shuffle=True,
    #                          collate_fn=PadSequence())

    print('-' * 91)
    print(f'Data preprocessing finished. Data prep took {datetime.now() - data_prep_start}.')

    print('######### DATA STATS ###############')
    print(f'Number of classes: {train_dataset.num_classes}')
    print(f'Vocabulary size: {train_dataset.vocab_size}')
    print(f'Training set size: {train_dataset.__len__()}')
    print(f'Validation set size: {val_dataset.__len__()}')
    print(f'Test set size: {test_dataset.__len__()}')
    print('-' * 91)

    # Set hyperparameters for grid search*
    # seeds = [0, 1, 2]
    lrs = [1e-5, 1e-4, 1e-3]
    embedding_dims = [64, 256, 512]
    hidden_dims = [128, 512, 1024]
    nums_layers = [1, 2]
    bidirectionals = [False, True]

    # weighting = [(True, False), (False, True)] # [(w_loss = True, w_sampling = False), (w_loss = False, w_sampling = True)]


    # set holders for best performance metrics and corresponding hyperparameters
    best_metrics = {'loss' : float("inf"),
                    'acc' : float('-inf')}
    best_hps = {'lr' : None,
                'embedding_dim' : None,
                'hidden_dim' : None,
                'num_layers': None,
                'bidirectional' : None}

    best_model = None # TODO: what's the appropriate type for this?

    #TODO: add tqdm's and print statements to these loops for progress monitoring

    best_file_name = None
    best_epoch = None

    # For keeping track of metrics for all configs
    keys = ['lr', 'emb_dim', 'hid_dim', 'n_layers', 'bd', 'val_acc', 'val_loss']
    df = pd.DataFrame(columns=keys)

    best_model_updates = -1

    for lr_ in lrs:
        for emb_dim in embedding_dims:
            for hid_dim in hidden_dims:
                # skip if hidden size not larger than embedding dim
                if not hid_dim > emb_dim:
                    continue

                for n_layers in nums_layers:
                    for bd in bidirectionals:

                        print('-' * 91)
                        print(f"| Current config: lr: {lr_} | emb: {emb_dim} | hid_dim: {hid_dim} | n_layers: {n_layers} "
                              f"| bd: {bd} | ")
                        print('-' * 91)

                        # Create detailed experiment tag for tensorboard summary writer
                        cur_datetime = datetime.now().strftime('%d_%b_%Y_%H_%M_%S')
                        file_name = f'lstm_emb_{emb_dim}_hid_{hid_dim}_l_{n_layers}_' \
                                    f'bd_{bd}_drop_{dropout}_bs_{batch_size}_epochs_{epochs}_' \
                                    f'lr_{lr_}_subset_{subset_size}_train_{train_frac}_val_{val_frac}_' \
                                    f'test_{test_frac}_clip_{clip_grad}_maxnorm_{max_norm}' \
                                    f'es_{early_stopping_patience}_seed_{seed}_device_{device}_dt_{cur_datetime}'

                        if not no_tb:
                            # # Create detailed experiment tag for tensorboard summary writer
                            # cur_datetime = datetime.now().strftime('%d_%b_%Y_%H_%M_%S')
                            log_dir = f'runs/hp_search/{data}/'
                            # file_name = f'lstm_emb_{emb_dim}_hid_{hid_dim}_l_{n_layers}_' \
                            #             f'bd_{bd}_drop_{dropout}_bs_{batch_size}_epochs_{epochs}_' \
                            #             f'lr_{lr_}_subset_{subset_size}_train_{train_frac}_val_{val_frac}_' \
                            #             f'test_{test_frac}_clip_{clip_grad}_maxnorm_{max_norm}' \
                            #             f'es_{early_stopping_patience}_seed_{seed}_device_{device}_dt_{cur_datetime}'

                            # create summary writer instance for logging
                            log_path = log_dir+file_name
                            writer = SummaryWriter(log_path)
                        else:
                            writer = None

                        # train model (in val mode)
                        loss, acc, model, epoch, optimizer = train(mode=mode, data=data, seed=seed, device=device,
                                                                   batch_size=batch_size, embedding_dim=emb_dim,
                                                                   hidden_dim=hid_dim, num_layers=n_layers,
                                                                   bidirectional=bd, dropout=dropout,
                                                                   batch_first=batch_first, epochs=epochs,
                                                                   lr=lr_, clip_grad=clip_grad, max_norm=max_norm,
                                                                   early_stopping_patience=early_stopping_patience,
                                                                   train_frac=train_frac, val_frac=val_frac,
                                                                   test_frac=test_frac, subset_size=subset_size,
                                                                   log_interval=log_interval, writer=writer,
                                                                   train_dataset=train_dataset, val_dataset=val_dataset,
                                                                   test_dataset=test_dataset, no_tb=no_tb, w_loss=w_loss,
                                                                   w_sampling=w_sampling)

                        if not no_tb:
                            # close tensorboard summary writer
                            writer.close()


                        # Update metric logging dataframe
                        df.loc[0 if pd.isnull(df.index.max()) else df.index.max() + 1] = \
                            [lr_, emb_dim, hid_dim, n_layers, bd, acc, loss.item()]

                        # Save metric logging dataframe to csv
                        # cur_datetime = datetime.now().strftime('%d_%b_%Y_%H_%M_%S')
                        df.to_csv(
                            f'output/{data}_lstm_hp_search_metrics.csv', index=False
                        )

                        # update best ...
                        if acc > best_metrics['acc']:

                            best_model_updates += 1

                            # ... metrics
                            best_metrics['acc'] = acc
                            best_metrics['loss'] = loss

                            best_epoch = epoch

                            # ... hyperparams
                            best_hps['lr'] = lr_
                            best_hps['embedding_dim'] = emb_dim
                            best_hps['hidden_dim'] = hid_dim
                            best_hps['num_layers'] = n_layers
                            best_hps['bidirectional'] = bd

                            # ... model
                            best_model = deepcopy(model)

                            # ... optimizer
                            best_optimizer = deepcopy(optimizer)

                            # filename
                            best_file_name = file_name

                            # Delete previous current best model checkpoint file
                            for filename in glob.glob(f"models/{data}/lstm/cur_best_*"):
                                os.remove(filename)

                            # Save best model checkpoint
                            model_dir = f'models/{data}/lstm/'
                            Path(model_dir).mkdir(parents=True, exist_ok=True)
                            model_path = model_dir + 'cur_best_' + best_file_name + '.pt'

                            torch.save({
                                'epoch': best_epoch,
                                'model_state_dict': best_model.state_dict(),
                                'optimizer_state_dict': best_optimizer.state_dict(),
                                'loss': best_metrics['loss'],
                                'acc': best_metrics['acc']
                            }, model_path)

                            print("New current best model found.")
                            print(f'Current best hyperparameters: {best_hps}')
                            print(f'Current best model: {best_model}')
                            print(f'Current best metrics: {best_metrics}')

    # # Save metric logging dataframe to csv
    # df.to_csv(
    #     'output/blog_lstm_hp_search_metrics.csv',
    #     index=False
    # )

    # Save best model checkpoint
    model_dir = f'models/{data}/lstm/'
    Path(model_dir).mkdir(parents=True, exist_ok=True)
    model_path = model_dir + 'best_' + best_file_name + '.pt'

    torch.save({
        'epoch': best_epoch,
        'model_state_dict': best_model.state_dict(),
        'optimizer_state_dict': best_optimizer.state_dict(),
        'loss': best_metrics['loss'],
        'acc': best_metrics['acc']
    }, model_path)

    print("Finished hyperparameter search.")
    print(f'Best hyperparameters: {best_hps}')
    print(f'Best model: {best_model}')
    print(f'Best metrics: {best_metrics}')

    # Delete equivalent cur_best file
    for filename in glob.glob(f"models/{data}/lstm/cur_best_*"):
        os.remove(filename)

    print(f"Best model updates: {best_model_updates}")
Example #15
def train(seed,
          data,
          model_type,
          mode,
          device,
          batch_size,
          embedding_dim,
          hidden_dim,
          num_layers,
          bidirectional,
          dropout,
          batch_first,
          epochs,
          lr,
          clip_grad,
          max_norm,
          early_stopping_patience,
          train_frac,
          val_frac,
          test_frac,
          subset_size,
          log_interval,
          no_tb,
          w_loss,
          w_sampling,
          writer=None,
          train_dataset=None,
          val_dataset=None,
          test_dataset=None):


    if mode == 'train' or mode == 'test':
        # set seed for reproducibility on cpu or gpu based on availability
        torch.manual_seed(seed) if device == 'cpu' else torch.cuda.manual_seed(seed)

        # data_path = 'data/bnc/bnc_subset_19_29_vs_50_plus_nfiles_0.csv' if data == 'bnc' else 'data/blogs_kaggle/blogtext.csv'
        # data_path = 'data/bnc/bnc_subset_19_29_vs_50_plus_nfiles_0_rand_balanced.csv' if data == 'bnc' else 'data/blogs_kaggle/blogtext.csv'
        if data == 'bnc':
            data_path = 'data/bnc/bnc_subset_19_29_vs_50_plus_nfiles_0.csv'
        elif data == 'bnc_rb':
            data_path = 'data/bnc/bnc_subset_19_29_vs_50_plus_nfiles_0_rand_balanced.csv'
        else:
            data_path = 'data/blogs_kaggle/blogtext.csv'

        # set starting time of full training pipeline
        start_time = datetime.now()

        # set device
        device = torch.device(device)
        print(f"Device: {device}")

        print("Starting data preprocessing ... ")
        data_prep_start = datetime.now()

        # Load data and create dataset instances
        train_dataset, val_dataset, test_dataset = get_datasets(subset_size=subset_size,
                                                                file_path=data_path,
                                                                train_frac=train_frac,
                                                                val_frac=val_frac,
                                                                test_frac=test_frac,
                                                                seed=seed,
                                                                data=data,
                                                                model_type=model_type)

        # Train, val, and test splits
        # train_size = int(train_frac * len(dataset))
        # test_size = len(dataset) - train_size
        # train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])


        # # get vocab size number of classes
        # vocab_size = train_dataset.vocab_size
        # num_classes = train_dataset.num_classes

        # create dataloaders with pre-specified batch size
        # data_loader = DataLoader(dataset=dataset,
        #                          batch_size=batch_size,
        #                          shuffle=True,
        #                          collate_fn=PadSequence())


        print('-' * 91)
        print(f'Data preprocessing finished. Data prep took {datetime.now() - data_prep_start}.')

        print(31*'-' + ' DATASET STATS AND BASELINES ' + '-'*31)
        print(f'Number of classes: {train_dataset.num_classes}')
        print(f'Vocabulary size: {train_dataset.vocab_size}')
        print(f'Training set size: {train_dataset.__len__()}')
        print(f'Validation set size: {val_dataset.__len__()}')
        print(f'Test set size: {test_dataset.__len__()}')
        print(91 * '-')
        print('Baselines')
        print('Train')
        print(train_dataset.df['age_cat'].value_counts(normalize=True))
        print('Validation')
        print(val_dataset.df['age_cat'].value_counts(normalize=True))
        print('Test')
        print(test_dataset.df['age_cat'].value_counts(normalize=True))
        print('-' * 91)


    if w_sampling:
        # Apply weighted sampling.

        # Inspired by: https://towardsdatascience.com/address-class-imbalance-easily-with-pytorch-e2d4fa208627

        # TODO: isn't this a bit redundant? Doesn't torch.tensor(train_dataset.df['age_cat'], dtype=torch.long) do the same?
        all_label_ids = torch.tensor([label for label in train_dataset.df['age_cat']], dtype=torch.long)

        # Class weighting
        labels_unique, counts = np.unique(train_dataset.df['age_cat'], return_counts=True)
        print(f'Unique labels: {labels_unique}')

        class_weights = [sum(counts) / c for c in counts] # [#{class_0}, {#class_1}, etc.]

        # Assign weights to each input sample
        sampler_weights = [class_weights[label] for label in train_dataset.df['age_cat']]
        sampler = WeightedRandomSampler(weights=sampler_weights, num_samples=len(train_dataset.df['age_cat']), replacement=True)

        # Note that sampler option is mutually exclusive with shuffle. So shuffle not needed here.
        train_loader = DataLoader(dataset=train_dataset,
                                  batch_size=batch_size,
                                  shuffle=False,
                                  collate_fn=PadSequence(),
                                  sampler=sampler)
    else:

        train_loader = DataLoader(dataset=train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  collate_fn=PadSequence())

    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=batch_size,
                            shuffle=False,
                            collate_fn=PadSequence())

    test_loader = DataLoader(dataset=test_dataset,
                             batch_size=batch_size,
                             shuffle=False,
                             collate_fn=PadSequence())

    if mode == 'train' or mode == 'val':

        if model_type == 'lstm':
            # initialize model
            print("Initializing model ...")
            model = TextClassificationLSTM(batch_size = batch_size,
                                           vocab_size = train_dataset.vocab_size,
                                           embedding_dim = embedding_dim,
                                           hidden_dim = hidden_dim,
                                           num_classes = train_dataset.num_classes,
                                           num_layers = num_layers,
                                           bidirectional = bidirectional,
                                           dropout = dropout,
                                           device = device,
                                           batch_first = batch_first)
        elif model_type == 'bert':
            model = TextClassificationBERT()

    elif mode == 'test':
        if model_type == 'lstm':
            model, _, _, _, _ = load_saved_model(model_class=TextClassificationLSTM, optimizer_class=optim.Adam, lr=lr,
                                                 device=device, batch_size=batch_size, vocab_size=train_dataset.vocab_size,
                                                 embedding_dim=embedding_dim, hidden_dim=hidden_dim,
                                                 num_classes=train_dataset.num_classes, num_layers=num_layers,
                                                 bidirectional=bidirectional, dropout=dropout, batch_first=batch_first)
        elif model_type == 'bert':
            model = TextClassificationBERT()



    # model to device
    model.to(device)


    # Print model architecture and trainable parameters
    print('-' * 91)
    print("MODEL ARCHITECTURE:")
    print(model)
    print('-' * 91)

    if w_loss:

        # Apply frequency-based weighted loss for highly imbalanced data
        n_samples = [train_dataset.df['age_cat'].value_counts()[label] for label in range(train_dataset.num_classes)]

        # Weight option 1
        weights = [1 - (x / sum(n_samples)) for x in n_samples]
        weights = torch.FloatTensor(weights).to(device)


        # OR 2) have the weights sum up to 1??
        # weights = torch.tensor(n_samples, dtype=torch.float32).to(device)
        # weights = weights / weights.sum()
        # weights = 1.0 / weights
        # weights = weights / weights.sum()


        criterion = torch.nn.CrossEntropyLoss(weight=weights)  # combines LogSoftmax and NLL
    else:
        criterion = torch.nn.CrossEntropyLoss()  # combines LogSoftmax and NLL

    if mode == 'train' or mode == 'val':

        # count trainable parameters
        trainable_params = sum(param.numel() for param in model.parameters() if param.requires_grad)
        print(f'The model has {trainable_params} trainable parameters.')

        # set up optimizer and loss criterion
        if model_type == 'lstm':
            optimizer = optim.Adam(params=model.parameters(), lr=lr)
        elif model_type == 'bert':
            optimizer = optim.Adam(model.parameters(), lr=2e-5) #TODO: CHANGE THIS BACK!!!!!!!

        # initialize iterations at zero
        iterations = 0

        # values for model selection
        best_val_loss = torch.tensor(np.inf, device=device)
        best_val_accuracy = torch.tensor(-np.inf, device=device)
        best_epoch = None
        best_model = None

        # Initialize patience for early stopping
        patience = 0

        # metrics for losses
        train_losses = []
        train_accs = []

        # disable tqdm progress bars in train and train_one_epoch if in validation mode
        disable_bars = mode == 'val'

        # for epoch in tqdm(range(epochs), disable=disable_bars):
        for epoch in range(epochs):

            epoch_start_time = datetime.now()
            # epoch_start_time = time.time()
            try:
                # set model to training mode. NB: in the actual training loop later on, this
                # statement goes at the beginning of each epoch.
                model.train()
                iterations, train_losses, train_accs = train_one_epoch(model=model, model_type=model_type,
                                                                       data_loader=train_loader,
                                                                       criterion=criterion,
                                                                       optimizer=optimizer, device=device,
                                                                       start_iteration=iterations,
                                                                       clip_grad=clip_grad, max_norm=max_norm,
                                                                       log_interval=log_interval,
                                                                       losses=train_losses, accs=train_accs, writer=writer,
                                                                       disable_bars=disable_bars,
                                                                       epoch=epoch)

            except KeyboardInterrupt:
                print("Manually stopped current epoch")
                __import__('pdb').set_trace()

            # print("Current epoch training took {}".format(datetime.now() - epoch_start_time))
            val_loss, val_accuracy = evaluate_performance(model=model,
                                                          data_loader=val_loader,
                                                          device=device,
                                                          criterion=criterion,
                                                          writer=writer,
                                                          global_iteration=iterations,
                                                          print_metrics=False,
                                                          data=data)
            # TODO: See this tutorials prettier logging -- https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html
            # print(f"#######################################################################")
            # print(f"Epoch {epoch + 1} finished, validation loss: {val_loss}, val acc: {val_accuracy}")
            # print(f"#######################################################################")

            print('-' * 91)
            print('| end of epoch {:3d} | time: {} | '
                  'valid loss {:8.5f} | valid accuracy {:8.3f} '.format(epoch + 1,
                                                                        (datetime.now() - epoch_start_time),
                                                                        val_loss, val_accuracy))
            print('-' * 91)

            # # update best performance
            # if val_loss < best_val_loss:
            #     best_val_loss = val_loss
            #     best_val_accuracy = val_accuracy
            #     best_model = model
            #     best_epoch = epoch + 1

            # update best performance
            if val_accuracy > best_val_accuracy:
                best_val_loss = val_loss
                best_val_accuracy = val_accuracy
                best_model = deepcopy(model)
                best_optimizer = deepcopy(optimizer)
                best_epoch = epoch + 1
                patience = 0
            else:
                patience += 1
                if patience >= early_stopping_patience:
                    print("EARLY STOPPING")
                    break
        # TODO: See this tutorials prettier logging -- https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html
        print('-' * 91)
        print(f"Done training and validating. Best model from epoch {best_epoch}:")
        print(best_model)
        print('-' * 91)

    if mode == 'val':
        return best_val_loss, best_val_accuracy, best_model, best_epoch, best_optimizer
    elif mode == 'train':
        print("Starting testing...")
        _, _ = evaluate_performance(model=best_model, data_loader=test_loader,
                                                  device=device,
                                                  criterion=criterion,
                                                  set='test',
                                                  data=data,
                                                  plot_cm=True)
    elif mode == 'test':
        print("Starting testing...")
        _, _ = evaluate_performance(model=model, data_loader=test_loader,
                                                  device=device,
                                                  criterion=criterion,
                                                  set='test',
                                                  data=data,
                                                  plot_cm=True)
Exemple #16
0
    parser.add_argument('--interval', type=int, default=1)

    args = parser.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpus

    create_exp_dir(args)
    mp.set_start_method("spawn")

    population = mp.Queue(maxsize=args.B)
    finish_tasks = mp.Queue(maxsize=args.B)
    test_outputs = mp.Queue()
    epoch = mp.Value('i', 0)
    lock = mp.Lock()

    resources = []
    print('Using resources:')
    # cuda:0 is reserved for the Optimizer process below; the Samples workers share the remaining GPUs
    for i in range(1, len(args.gpus.split(','))):
        for j in range(args.num_per_gpu):
            resources.append(f'cuda:{i}')
            print(f'cuda:{i}')

    datasets = get_datasets()

    processes = [Samples(datasets, epoch, lock, population, finish_tasks, resources[i], args)
                 for i in range(len(resources))]
    processes.append(Optimizer(datasets, epoch, lock, population, finish_tasks, 'cuda:0', args))

    for p in processes:
        p.start()
    for p in processes:
        p.join()
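
For reference, a standalone sketch of how the resources list above is populated; the flag values are illustrative, not taken from the example:

# Reproduce the resource allocation above for illustrative values
# --gpus "0,1,2" --num_per_gpu 2 (assumed for the sake of the example).
gpus = "0,1,2"
num_per_gpu = 2

resources = []
# cuda:0 is left free for the Optimizer process; Samples workers share the rest.
for i in range(1, len(gpus.split(','))):
    for _ in range(num_per_gpu):
        resources.append(f'cuda:{i}')

print(resources)  # ['cuda:1', 'cuda:1', 'cuda:2', 'cuda:2']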
Exemple #17
0
    def __init__(self, args):

        # Training configurations
        self.method = args.method
        self.dataset = args.dataset
        self.dim = args.dim
        self.lr = args.lr
        self.batch_size = args.batch_size
        self.val_batch_size = self.batch_size // 2
        self.iteration = args.iteration
        self.evaluation = args.evaluation
        self.show_iter = 1000
        self.update_epoch = 10
        self.balanced = args.balanced
        self.instances = args.instances
        self.cm = args.cm
        self.device = torch.device(
            'cuda:0' if torch.cuda.is_available() else 'cpu')

        self.file_name = '{}_{}_{}'.format(
            self.method,
            self.dataset,
            self.lr,
        )
        print('========================================')
        print(json.dumps(vars(args), indent=2))
        print(self.file_name)

        # Paths

        self.root_dir = os.path.join('/', 'home', 'lyz')
        self.data_dir = os.path.join(self.root_dir, 'datasets', self.dataset)
        self.model_dir = self._get_path('./trained_model')
        self.code_dir = self._get_path(os.path.join('codes', self.dataset))
        self.fig_dir = self._get_path(
            os.path.join('fig', self.dataset, self.file_name))

        # Preparing data
        self.transforms = get_transform()
        self.datasets = get_datasets(dataset=self.dataset,
                                     data_dir=self.data_dir,
                                     transforms=self.transforms)
        self.cm_sampler = ClassMiningSampler(self.datasets['train'],
                                             batch_size=self.batch_size,
                                             n_instance=self.instances,
                                             balanced=self.balanced)
        self.data_loaders = get_data_loaders(
            datasets=self.datasets,
            batch_size=self.batch_size,
            val_batch_size=self.val_batch_size,
            n_instance=self.instances,
            balanced=self.balanced,
            cm=self.cm_sampler if self.cm else None)
        self.dataset_sizes = {
            x: len(self.datasets[x])
            for x in ['train', 'test']
        }

        # Set up model
        self.model = get_model(self.device, self.dim)

        self.optimizer = optim.SGD(
            [{
                'params': self.model.google_net.parameters()
            }, {
                'params': self.model.linear.parameters(),
                'lr': self.lr * 10,
                'momentum': 0.9
            }],
            lr=self.lr,
            momentum=0.9)
        self.scheduler = lr_scheduler.StepLR(self.optimizer,
                                             step_size=2000,
                                             gamma=0.5)
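
The optimizer above trains the freshly added linear layer at 10x the backbone learning rate via two parameter groups; a minimal standalone sketch of that pattern with a toy model (names and sizes are illustrative, not from the example):

import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler

# Toy stand-ins for the pretrained backbone and the new embedding head.
backbone = nn.Linear(16, 8)
head = nn.Linear(8, 4)

base_lr = 1e-3
optimizer = optim.SGD(
    [{'params': backbone.parameters()},                 # uses the default lr below
     {'params': head.parameters(), 'lr': base_lr * 10,
      'momentum': 0.9}],                                # new head learns 10x faster
    lr=base_lr,
    momentum=0.9)

# Halve all learning rates every 2000 scheduler steps, mirroring the StepLR above.
scheduler = lr_scheduler.StepLR(optimizer, step_size=2000, gamma=0.5)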
Exemple #18
0
def main():
    print "initial model generator"
    with tf.Graph().as_default():
        train_sets = dataset.get_datasets(main_path, EPIWidth, disp_precision,
                                          'train')
        test_sets = dataset.get_datasets(main_path, EPIWidth, disp_precision,
                                         'test')

        global_step = tf.Variable(0, trainable=False)

        images_placeholder_v = tf.placeholder(tf.float32,
                                              shape=(None, 9, EPIWidth, 1))
        images_placeholder_u = tf.placeholder(tf.float32,
                                              shape=(None, 9, EPIWidth, 1))
        labels_placeholder = tf.placeholder(tf.int32, shape=None)
        prop_placeholder = tf.placeholder('float')
        phase_train = tf.placeholder(tf.bool, name='phase_train')

        logits = network.inference_ds(images_placeholder_u,
                                      images_placeholder_v, prop_placeholder,
                                      phase_train, disp_precision)

        logits_softmax = network.softmax(logits)

        loss = network.loss(logits_softmax, labels_placeholder)

        train_op = network.training(loss, 1e-4, global_step)

        eval_op = network.evaluation(logits_softmax)  # renamed to avoid shadowing the built-in eval

        summary = tf.summary.merge_all()

        saver = tf.train.Saver(tf.global_variables())

        gpu_option = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)

        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_option))

        summary_writer = tf.summary.FileWriter(summary_path, sess.graph)

        sess.run(tf.global_variables_initializer())

        ckpt = tf.train.get_checkpoint_state(checkpoint_path)
        if ckpt:
            # saver.restore(sess, checkpoint_path + '/model.ckpt')  # reuse a checkpoint trained on another platform
            # saver.restore(sess, ckpt.model_checkpoint_path)  # locally trained checkpoint
            # NOTE: both restore calls are left commented out here, so only the message below runs.
            print("restore from checkpoint!")
        else:
            print("no checkpoint found!")

        start_time = time.time()

        step = 0

        while not train_sets.complete:
            feed_dict = fill_feed_dict(train_sets, images_placeholder_u,
                                       images_placeholder_v,
                                       labels_placeholder, prop_placeholder,
                                       phase_train, 'train')
            _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)

            duration = time.time() - start_time
            if step % 1000 == 0:
                print('Step:%d: loss = %.2f (%.3f sec)' %
                      (step, loss_value, duration))
                summary_str = sess.run(summary, feed_dict=feed_dict)
                summary_writer.add_summary(summary_str, step)
                summary_writer.flush()

            if step % 25000 == 24999:
                print('test Data Eval:')
                do_eval_true(sess, eval_op, logits_softmax, images_placeholder_u,
                             images_placeholder_v, prop_placeholder,
                             phase_train, test_sets)

            if step % 50000 == 49999:
                saver.save(sess,
                           checkpoint_path + '/model.ckpt',
                           global_step=step)
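
For context, a heavily hedged sketch of a feed-dict builder matching the fill_feed_dict call above; the real helper and the dataset's batch accessor are not shown in this example, so everything below is an assumption:

# Hypothetical sketch only: the real fill_feed_dict and the dataset's batch API
# are not part of this example, so every attribute below is an assumption.
def fill_feed_dict(data_set, images_u_ph, images_v_ph, labels_ph, prop_ph,
                   phase_train_ph, mode):
    # Pull one mini-batch of u/v EPI patches and labels from the dataset (assumed accessor).
    images_u, images_v, labels = data_set.next_batch()
    return {
        images_u_ph: images_u,
        images_v_ph: images_v,
        labels_ph: labels,
        prop_ph: 0.5 if mode == 'train' else 1.0,   # assumed dropout keep-probability
        phase_train_ph: mode == 'train',
    }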
Exemple #19
0
import dataset
ds = dataset.get_datasets('/home/fernand/math/data', 'test')
batch = [ds[0], ds[1]]
print(dataset.collate_data(batch))
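
collate_data here is the project's own collate function; purely as an illustration of the role such a function plays for a DataLoader, a generic padding collate might look like this (not the project's implementation):

import torch

def pad_collate(batch):
    # Generic sketch: pad variable-length 1-D tensors to a common length
    # and also return the original lengths, as collate_fn implementations often do.
    lengths = torch.tensor([item.size(0) for item in batch])
    padded = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=0)
    return padded, lengths

# Usage sketch with toy data
example_batch = [torch.tensor([3, 1, 4]), torch.tensor([1, 5])]
print(pad_collate(example_batch))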
Exemple #20
0
def main(arch: str,
         image_folder: str,
         batch_size: Optional[int] = None,
         from_model: Optional[str] = None,
         grad_accu: int = 1,
         steps: Optional[int] = None,
         num_gpus: int = 1,
         epochs: int = 1,
         lr: float = 4e-4):
    pl.seed_everything(int(os.environ.get("SEED", 738)))
    if arch.startswith("BiT"):
        base_model = BIT_MODELS[arch](head_size=-1)
        print("Loading pretrained model...")
        base_model.load_from(np.load(f"cache/pretrained/{arch}.npz"))
        net_final_size = base_model.width_factor * 2048
    else:
        raise ValueError(f"arch '{arch}'' not supported")
    train_ds, valid_ds = get_datasets(image_folder, val_ratio=0.05)

    set_trainable(base_model, False)
    model = FirstStageLearner(base_model,
                              train_ds,
                              valid_ds,
                              epochs,
                              lr,
                              num_gpus=num_gpus,
                              batch_size=batch_size if batch_size else 4,
                              image_size=IMAGE_SIZE,
                              projection_size=256,
                              projection_hidden_size=4096,
                              net_final_size=net_final_size,
                              use_momentum=False)

    # The two branches only differ in the stopping criterion, so build the shared
    # arguments once and pick max_steps or max_epochs accordingly.
    trainer_kwargs = dict(
        accelerator='ddp' if num_gpus > 1 else None,
        amp_backend="apex",
        amp_level='O2',
        precision=16,
        gpus=num_gpus,
        val_check_interval=0.5,
        gradient_clip_val=10,
        accumulate_grad_batches=grad_accu,
        auto_scale_batch_size='power' if batch_size is None else None)
    if steps:
        trainer = pl.Trainer(max_steps=steps, **trainer_kwargs)
    else:
        trainer = pl.Trainer(max_epochs=epochs, **trainer_kwargs)

    if batch_size is None:
        trainer.tune(model)

    trainer.fit(model)

    if num_gpus == 1 or torch.distributed.get_rank() == 0:
        torch.save(
            {
                "online_encoder_proj":
                model.learner.online_encoder.projector.state_dict(),
                "online_encoder_net":
                model.learner.online_encoder.net.state_dict(),
                "online_predictor":
                model.learner.online_predictor.state_dict(),
                "target_encoder_net":
                model.learner.target_encoder.net.state_dict(),
                "target_encoder_proj":
                model.learner.target_encoder.projector.state_dict(),
                "config": {
                    "arch": arch
                }
            }, f"cache/byol_{arch}_warmed_up.pth")
        print("Model saved")