Example 1
def eval_tta3(config, augment, reporter):
    C.get()
    C.get().conf = config
    save_path = augment['save_path']
    cv_id, gr_id = augment["cv_id"], augment["gr_id"]
    gr_ids = augment["gr_ids"]

    # setup - provided augmentation rules
    C.get()['aug'] = policy_decoder(augment, augment['num_policy'],
                                    augment['num_op'])

    # eval
    model = get_model(C.get()['model'], num_class(C.get()['dataset']))
    ckpt = torch.load(save_path)
    if 'model' in ckpt:
        model.load_state_dict(ckpt['model'])
    else:
        model.load_state_dict(ckpt)
    del ckpt
    model.eval()

    loader = get_post_dataloader(C.get()["dataset"],
                                 C.get()['batch'], augment["dataroot"],
                                 augment['cv_ratio_test'], cv_id, gr_id,
                                 gr_ids)

    start_t = time.time()
    metrics = Accumulator()
    loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
    for data, label in loader:
        data = data.cuda()
        label = label.cuda()

        pred = model(data)
        loss = loss_fn(pred, label)  # (N)

        _, pred = pred.topk(1, 1, True, True)
        pred = pred.t()
        correct = pred.eq(label.view(
            1, -1).expand_as(pred)).detach().cpu().numpy()  # (1,N)

        metrics.add_dict({
            'loss': np.sum(loss.detach().cpu().numpy()),
            'correct': np.sum(correct),
            'cnt': len(data)
        })
        del loss, correct, pred, data, label
    del model, loader
    metrics = metrics / 'cnt'
    gpu_secs = (time.time() - start_t) * torch.cuda.device_count()
    reporter(loss=metrics['loss'],
             top1_valid=metrics['correct'],
             elapsed_time=gpu_secs,
             done=True)
    return metrics['correct']
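
Example 1 (and the variants below) pools per-batch statistics with an Accumulator and normalizes by the cnt key at the end. The class itself is not shown in these excerpts; a minimal sketch of the interface the examples rely on (add_dict, item access, division by a key) could look like the following illustrative stand-in, not the project's actual class:

from collections import defaultdict


class MetricAccumulator:
    """Sums metric dicts across batches; illustrative stand-in for Accumulator."""

    def __init__(self):
        self.metrics = defaultdict(float)

    def add_dict(self, d):
        # accumulate each metric over batches
        for k, v in d.items():
            self.metrics[k] += v

    def __getitem__(self, k):
        return self.metrics[k]

    def __truediv__(self, key):
        # divide every metric by the value stored under `key` (e.g. 'cnt')
        out = MetricAccumulator()
        denom = self.metrics[key]
        for k, v in self.metrics.items():
            out.metrics[k] = v / denom if k != key else v
        return out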
Example 2
def eval_tta2(config, augment, reporter):
    C.get()
    C.get().conf = config
    cv_ratio_test, cv_id, save_path = augment['cv_ratio_test'], augment['cv_id'], augment['save_path']
    gr_id = augment["gr_id"]
    num_repeat = 1

    # setup - provided augmentation rules
    C.get()['aug'] = policy_decoder(augment, augment['num_policy'], augment['num_op'])

    # eval
    model = get_model(C.get()['model'], num_class(C.get()['dataset']))
    ckpt = torch.load(save_path)
    if 'model' in ckpt:
        model.load_state_dict(ckpt['model'])
    else:
        model.load_state_dict(ckpt)
    model.eval()

    loaders = []
    for i in range(num_repeat):
        _, tl, validloader, tl2 = get_dataloaders(C.get()['dataset'], C.get()['batch'], augment['dataroot'], cv_ratio_test, split_idx=cv_id, gr_assign=augment["gr_assign"], gr_id=gr_id)
        loaders.append(validloader)
        del tl, tl2

    start_t = time.time()
    metrics = Accumulator()
    loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
    for loader in loaders:
        for data, label in loader:
            data = data.cuda()
            label = label.cuda()

            pred = model(data)
            loss = loss_fn(pred, label) # (N)

            _, pred = pred.topk(1, 1, True, True)
            pred = pred.t()
            correct = pred.eq(label.view(1, -1).expand_as(pred)).detach().cpu().numpy() # (1,N)

            metrics.add_dict({
                'minus_loss': -1 * np.sum(loss.detach().cpu().numpy()),
                'correct': np.sum(correct),
                'cnt': len(data)
            })
            del loss, correct, pred, data, label
    del model
    metrics = metrics / 'cnt'
    gpu_secs = (time.time() - start_t) * torch.cuda.device_count()
    reporter(minus_loss=metrics['minus_loss'], top1_valid=metrics['correct'], elapsed_time=gpu_secs, done=True)
    return metrics['correct']
Example 3
def eval_tta(config, augment, reporter, num_class, get_model, get_dataloaders):
    C.get()
    C.get().conf = config
    cv_ratio_test, cv_fold, save_path = (
        augment["cv_ratio_test"],
        augment["cv_fold"],
        augment["save_path"],
    )

    # setup - provided augmentation rules
    C.get()["aug"] = policy_decoder(augment, augment["num_policy"], augment["num_op"])

    # eval
    model = get_model(C.get()["model"], num_class(C.get()["dataset"]))
    ckpt = torch.load(save_path)
    if "model" in ckpt:
        model.load_state_dict(ckpt["model"])
    else:
        model.load_state_dict(ckpt)
    model.eval()

    loaders = []
    for _ in range(augment["num_policy"]):  # TODO
        _, tl, validloader, tl2 = get_dataloaders(
            C.get()["dataset"],
            C.get()["batch"],
            augment["dataroot"],
            cv_ratio_test,
            split_idx=cv_fold,
        )
        loaders.append(iter(validloader))
        del tl, tl2

    start_t = time.time()
    metrics = Accumulator()
    loss_fn = torch.nn.CrossEntropyLoss(reduction="none")
    try:
        while True:
            losses = []
            corrects = []
            for loader in loaders:
                data, label = next(loader)
                data = data.cuda()
                label = label.cuda()

                pred = model(data)

                loss = loss_fn(pred, label)
                # keep (1, N) rows so np.min below reduces across policies per sample
                losses.append(loss.detach().cpu().numpy().reshape(1, -1))

                _, pred = pred.topk(1, 1, True, True)
                pred = pred.t()
                correct = (
                    pred.eq(label.view(1, -1).expand_as(pred)).detach().cpu().numpy()
                )
                corrects.append(correct)
                del loss, correct, pred, data, label

            losses = np.concatenate(losses)
            losses_min = np.min(losses, axis=0).squeeze()

            corrects = np.concatenate(corrects)
            corrects_max = np.max(corrects, axis=0).squeeze()
            metrics.add_dict(
                {
                    "minus_loss": -1 * np.sum(losses_min),
                    "correct": np.sum(corrects_max),
                    "cnt": len(corrects_max),
                }
            )
            del corrects, corrects_max
    except StopIteration:
        pass

    del model
    metrics = metrics / "cnt"
    gpu_secs = (time.time() - start_t) * torch.cuda.device_count()
    reporter(
        minus_loss=metrics["minus_loss"],
        top1_valid=metrics["correct"],
        elapsed_time=gpu_secs,
        done=True,
    )
    return metrics["correct"]
Example 4
def main():
    w = PyStopwatch()

    parser = ConfigArgumentParser(conflict_handler="resolve")
    parser.add_argument(
        "--dataroot",
        type=str,
        default="/data/private/pretrainedmodels",
        help="torchvision data folder",
    )
    parser.add_argument("--until", type=int, default=5)
    parser.add_argument("--num-op", type=int, default=2)
    parser.add_argument("--num-policy", type=int, default=5)
    parser.add_argument("--num-search", type=int, default=200)
    parser.add_argument("--cv-ratio", type=float, default=0.4)
    parser.add_argument("--decay", type=float, default=-1)
    parser.add_argument("--redis", type=str, default="gpu-cloud-vnode30.dakao.io:23655")
    parser.add_argument("--per-class", action="store_true")
    parser.add_argument("--resume", action="store_true")
    parser.add_argument("--smoke-test", action="store_true")
    args = parser.parse_args()

    if args.decay > 0:
        logger.info("decay=%.4f" % args.decay)
        C.get()["optimizer"]["decay"] = args.decay

    add_filehandler(
        logger,
        os.path.join(
            "models",
            "%s_%s_cv%.1f.log"
            % (C.get()["dataset"], C.get()["model"]["type"], args.cv_ratio),
        ),
    )
    logger.info("configuration...")
    logger.info(json.dumps(C.get().conf, sort_keys=True, indent=4))
    logger.info("initialize ray...")
    ray.init(address=args.redis)

    num_result_per_cv = 10
    cv_num = 5
    copied_c = copy.deepcopy(C.get().conf)

    logger.info(
        "search augmentation policies, dataset=%s model=%s"
        % (C.get()["dataset"], C.get()["model"]["type"])
    )
    logger.info(
        "----- Train without Augmentations cv=%d ratio(test)=%.1f -----"
        % (cv_num, args.cv_ratio)
    )
    w.start(tag="train_no_aug")
    paths = [
        _get_path(
            C.get()["dataset"],
            C.get()["model"]["type"],
            "ratio%.1f_fold%d" % (args.cv_ratio, i),
        )
        for i in range(cv_num)
    ]
    print(paths)
    reqs = [
        train_model.remote(
            copy.deepcopy(copied_c),
            args.dataroot,
            C.get()["aug"],
            args.cv_ratio,
            i,
            save_path=paths[i],
            skip_exist=True,
        )
        for i in range(cv_num)
    ]

    tqdm_epoch = tqdm(range(C.get()["epoch"]))
    is_done = False
    for epoch in tqdm_epoch:
        while True:
            epochs_per_cv = OrderedDict()
            for cv_idx in range(cv_num):
                try:
                    latest_ckpt = torch.load(paths[cv_idx])
                    if "epoch" not in latest_ckpt:
                        epochs_per_cv["cv%d" % (cv_idx + 1)] = C.get()["epoch"]
                        continue
                    epochs_per_cv["cv%d" % (cv_idx + 1)] = latest_ckpt["epoch"]
                except Exception as e:
                    continue
            tqdm_epoch.set_postfix(epochs_per_cv)
            if (
                len(epochs_per_cv) == cv_num
                and min(epochs_per_cv.values()) >= C.get()["epoch"]
            ):
                is_done = True
            if len(epochs_per_cv) == cv_num and min(epochs_per_cv.values()) >= epoch:
                break
            time.sleep(10)
        if is_done:
            break

    logger.info("getting results...")
    pretrain_results = ray.get(reqs)
    for r_model, r_cv, r_dict in pretrain_results:
        logger.info(
            "model=%s cv=%d top1_train=%.4f top1_valid=%.4f"
            % (r_model, r_cv + 1, r_dict["top1_train"], r_dict["top1_valid"])
        )
    logger.info("processed in %.4f secs" % w.pause("train_no_aug"))

    if args.until == 1:
        sys.exit(0)

    logger.info("----- Search Test-Time Augmentation Policies -----")
    w.start(tag="search")

    ops = augment_list(False)
    space = {}
    for i in range(args.num_policy):
        for j in range(args.num_op):
            space["policy_%d_%d" % (i, j)] = hp.choice(
                "policy_%d_%d" % (i, j), list(range(0, len(ops)))
            )
            space["prob_%d_%d" % (i, j)] = hp.uniform("prob_%d_ %d" % (i, j), 0.0, 1.0)
            space["level_%d_%d" % (i, j)] = hp.uniform(
                "level_%d_ %d" % (i, j), 0.0, 1.0
            )

    final_policy_set = []
    total_computation = 0
    reward_attr = "top1_valid"  # top1_valid or minus_loss
    for _ in range(1):  # single pass; increase the range to repeat the whole search
        for cv_fold in range(cv_num):
            name = "search_%s_%s_fold%d_ratio%.1f" % (
                C.get()["dataset"],
                C.get()["model"]["type"],
                cv_fold,
                args.cv_ratio,
            )
            print(name)

            # def train(augs, rpt):
            def train(config, reporter):
                return eval_tta(
                    copy.deepcopy(copied_c), config, reporter, num_class, get_model, get_dataloaders
                )

            register_trainable(name, train)
            algo = HyperOptSearch(
                space, max_concurrent=4 * 20, metric=reward_attr, mode="max"
            )

            results = run(
                train,
                name=name,
                config={
                    "dataroot": args.dataroot,
                    "save_path": paths[cv_fold],
                    "cv_ratio_test": args.cv_ratio,
                    "cv_fold": cv_fold,
                    "num_op": args.num_op,
                    "num_policy": args.num_policy,
                },
                num_samples=4 if args.smoke_test else args.num_search,
                resources_per_trial={"gpu": 1},
                stop={"training_iteration": args.num_policy},
                search_alg=algo,
                scheduler=None,
                verbose=0,
                queue_trials=True,
                resume=args.resume,
                raise_on_failed_trial=False,
            )
            print()
            df = results.results_df

            import pickle

            with open("results.pickle", "wb") as fp:
                pickle.dump(results, fp)
            df.to_csv("df.csv")

            results = df.sort_values(by=reward_attr, ascending=False)
            # results = [x for x in results if x.last_result is not None]
            # results = sorted(results, key=lambda x: x.last_result[reward_attr], reverse=True)

            # calculate computation usage
            for _, result in results.iterrows():
                total_computation += result["elapsed_time"]

            for _, result in results.iloc[:num_result_per_cv].iterrows():
                final_policy = policy_decoder(
                    result, args.num_policy, args.num_op, prefix="config."
                )
                logger.info(
                    "loss=%.12f top1_valid=%.4f %s"
                    % (result["minus_loss"], result["top1_valid"], final_policy)
                )

                final_policy = remove_deplicates(final_policy)
                final_policy_set.extend(final_policy)

    logger.info(json.dumps(final_policy_set))
    logger.info("final_policy=%d" % len(final_policy_set))
    logger.info(
        "processed in %.4f secs, gpu hours=%.4f"
        % (w.pause("search"), total_computation / 3600.0)
    )
    logger.info(
        "----- Train with Augmentations model=%s dataset=%s aug=%s ratio(test)=%.1f -----"
        % (C.get()["model"]["type"], C.get()["dataset"], C.get()["aug"], args.cv_ratio)
    )
    w.start(tag="train_aug")

    num_experiments = 5
    default_path = [
        _get_path(
            C.get()["dataset"],
            C.get()["model"]["type"],
            "ratio%.1f_default%d" % (args.cv_ratio, _),
        )
        for _ in range(num_experiments)
    ]
    augment_path = [
        _get_path(
            C.get()["dataset"],
            C.get()["model"]["type"],
            "ratio%.1f_augment%d" % (args.cv_ratio, _),
        )
        for _ in range(num_experiments)
    ]
    reqs = [
        train_model.remote(
            copy.deepcopy(copied_c),
            args.dataroot,
            C.get()["aug"],
            0.0,
            0,
            save_path=default_path[_],
            skip_exist=True,
        )
        for _ in range(num_experiments)
    ] + [
        train_model.remote(
            copy.deepcopy(copied_c),
            args.dataroot,
            final_policy_set,
            0.0,
            0,
            save_path=augment_path[_],
        )
        for _ in range(num_experiments)
    ]

    tqdm_epoch = tqdm(range(C.get()["epoch"]))
    is_done = False
    for epoch in tqdm_epoch:
        while True:
            epochs = OrderedDict()
            for exp_idx in range(num_experiments):
                try:
                    if os.path.exists(default_path[exp_idx]):
                        latest_ckpt = torch.load(default_path[exp_idx])
                        epochs["default_exp%d" % (exp_idx + 1)] = latest_ckpt["epoch"]
                except Exception:
                    pass
                try:
                    if os.path.exists(augment_path[exp_idx]):
                        latest_ckpt = torch.load(augment_path[exp_idx])
                        epochs["augment_exp%d" % (exp_idx + 1)] = latest_ckpt["epoch"]
                except Exception:
                    pass

            tqdm_epoch.set_postfix(epochs)
            if (
                len(epochs) == num_experiments * 2
                and min(epochs.values()) >= C.get()["epoch"]
            ):
                is_done = True
            if len(epochs) == num_experiments * 2 and min(epochs.values()) >= epoch:
                break
            time.sleep(10)
        if is_done:
            break

    logger.info("getting results...")
    final_results = ray.get(reqs)

    for train_mode in ["default", "augment"]:
        avg = 0.0
        for _ in range(num_experiments):
            r_model, r_cv, r_dict = final_results.pop(0)
            logger.info(
                "[%s] top1_train=%.4f top1_test=%.4f"
                % (train_mode, r_dict["top1_train"], r_dict["top1_test"])
            )
            avg += r_dict["top1_test"]
        avg /= num_experiments
        logger.info(
            "[%s] top1_test average=%.4f (#experiments=%d)"
            % (train_mode, avg, num_experiments)
        )
    logger.info("processed in %.4f secs" % w.pause("train_aug"))

    logger.info(w)
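
Before being added to final_policy_set, each decoded policy passes through remove_deplicates (spelled this way in the codebase), which is not shown in these excerpts. A plausible sketch of that step, dropping sub-policies whose sequence of operation names has already been kept; illustrative only:

def drop_duplicate_policies(policies):
    seen = set()
    unique = []
    for sub_policy in policies:
        key = '_'.join(op_name for op_name, _, _ in sub_policy)
        if key in seen:
            continue
        seen.add(key)
        unique.append(sub_policy)
    return unique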
Example 5
                                      queue_trials=True,
                                      resume=args.resume,
                                      raise_on_failed_trial=False)  # these parameters are deprecated
            #results = run(exp_config, search_alg=algo, scheduler=None, verbose=0, queue_trials=True, resume=args.resume, raise_on_failed_trial=False)

            results = [x for x in results if x.last_result is not None]
            results = sorted(results,
                             key=lambda x: x.last_result[reward_attr],
                             reverse=True)

            # calculate computation usage
            for result in results:
                total_computation += result.last_result['elapsed_time']

            for result in results[:num_result_per_cv]:
                final_policy = policy_decoder(result.config, args.num_policy,
                                              args.num_op)
                logger.info('loss=%.12f top1_valid=%.4f %s' %
                            (result.last_result['minus_loss'],
                             result.last_result['top1_valid'], final_policy))

                final_policy = remove_deplicates(final_policy)
                final_policy_set.extend(final_policy)

    logger.info(json.dumps(final_policy_set))
    logger.info('final_policy=%d' % len(final_policy_set))
    w.pause('search')
    print(w)
    #logger.info('processed in %.4f secs, gpu hours=%.4f' % (w.pause('search'), total_computation / 3600.))
    logger.info(
        '----- Train with Augmentations model=%s dataset=%s aug=%s ratio(test)=%.1f -----'
        % (C.get()['model']['type'], C.get()['dataset'], C.get()['aug'],
Example 6
def eval_tta(config, augment):
    C.get()
    C.get().conf = config
    cv_ratio_test, cv_fold, save_path = augment['cv_ratio_test'], augment[
        'cv_fold'], augment['save_path']
    print(augment)
    # setup - provided augmentation rules
    C.get()['aug'] = policy_decoder(augment, augment['num_policy'],
                                    augment['num_op'])

    # eval
    model = get_model(C.get()['model'], num_class(C.get()['dataset']))
    ckpt = torch.load(save_path)
    if 'model' in ckpt:
        model.load_state_dict(ckpt['model'])
    else:
        model.load_state_dict(ckpt)
    model.eval()

    loaders = []
    for _ in range(augment['num_policy']):  # TODO
        _, tl, validloader, tl2 = get_dataloaders(C.get()['dataset'],
                                                  C.get()['batch'],
                                                  augment['dataroot'],
                                                  cv_ratio_test,
                                                  split_idx=cv_fold)
        loaders.append(iter(validloader))
        del tl, tl2

    start_t = time.time()
    metrics = Accumulator()
    loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
    try:
        while True:
            losses = []
            corrects = []
            for loader in loaders:
                data, label = next(loader)
                data = data.cuda()
                label = label.cuda()

                pred = model(data)

                loss = loss_fn(pred, label)
                # keep (1, N) rows so np.min below reduces across policies per sample
                losses.append(loss.detach().cpu().numpy().reshape(1, -1))

                _, pred = pred.topk(1, 1, True, True)
                pred = pred.t()
                correct = pred.eq(label.view(
                    1, -1).expand_as(pred)).detach().cpu().numpy()
                corrects.append(correct)
                del loss, correct, pred, data, label

            losses = np.concatenate(losses)
            losses_min = np.min(losses, axis=0).squeeze()

            corrects = np.concatenate(corrects)
            corrects_max = np.max(corrects, axis=0).squeeze()
            metrics.add_dict({
                'minus_loss': -1 * np.sum(losses_min),
                'correct': np.sum(corrects_max),
                'cnt': len(corrects_max)
            })
            del corrects, corrects_max
    except StopIteration:
        pass

    del model
    metrics = metrics / 'cnt'
    gpu_secs = (time.time() - start_t) * torch.cuda.device_count()
    # reporter(minus_loss=metrics['minus_loss'], top1_valid=metrics['correct'], elapsed_time=gpu_secs, done=True)
    tune.track.log(minus_loss=metrics['minus_loss'],
                   top1_valid=metrics['correct'],
                   elapsed_time=gpu_secs,
                   done=True)
    return metrics['correct']
Example 7
                          config=aug_config,
                          num_samples=num_samples,
                          resources_per_trial={'gpu': 1},
                          stop={'training_iteration': args.num_policy})
            dataframe = results.dataframe().sort_values(reward_attr,
                                                        ascending=False)
            total_computation = dataframe['elapsed_time'].sum()
            for i in range(num_result_per_cv):
                config_dict = dataframe.loc[i].filter(like='config').to_dict()
                new_keys = [
                    x.replace('config/', '') for x in config_dict.keys()
                ]
                new_config_dict = {}
                for key in new_keys:
                    new_config_dict[key] = config_dict['config/' + key]
                final_policy = policy_decoder(new_config_dict, args.num_policy,
                                              args.num_op)
                logger.info(
                    'loss=%.12f top1_valid=%.4f %s' %
                    (dataframe.loc[i]['minus_loss'].item(),
                     dataframe.loc[i]['top1_valid'].item(), final_policy))

                final_policy = remove_deplicates(final_policy)
                final_policy_set.extend(final_policy)

    logger.info(json.dumps(final_policy_set))
    logger.info('final_policy=%d' % len(final_policy_set))
    logger.info('processed in %.4f secs, gpu hours=%.4f' %
                (w.pause('search'), total_computation / 3600.))
    logger.info(
        '----- Train with Augmentations model=%s dataset=%s aug=%s ratio(test)=%.1f -----'
        % (C.get()['model']['type'], C.get()['dataset'], C.get()['aug'],
Example 8
def search_aug_policy(copied_conf, cv_ratio, num_fold, num_result_per_fold,
                      num_policy, num_op, smoke_test, num_search,
                      resume) -> list:
    global MODEL_PATHS, DATASET_ROOT
    global logger, watcher
    logger.info(
        '----- [Phase 2.] Search Test-Time Augmentation Policies -----')
    watcher.start(tag='search')

    ops = augment_list(False)
    space = {}
    for i in range(num_policy):
        for j in range(num_op):
            space['policy_%d_%d' % (i, j)] = hp.choice(
                'policy_%d_%d' % (i, j), list(range(0, len(ops))))
            space['prob_%d_%d' % (i, j)] = hp.uniform('prob_%d_ %d' % (i, j),
                                                      0.0, 1.0)
            space['level_%d_%d' % (i, j)] = hp.uniform('level_%d_ %d' % (i, j),
                                                       0.0, 1.0)

    final_policy_set = []
    total_computation = 0
    reward_attr = 'top1_valid'  # top1_valid or minus_loss
    for _ in range(1):  # single pass; increase the range to repeat the whole search
        for cv_fold in range(num_fold):
            name = "search_%s_%s_fold%d_ratio%.1f" % (Config.get(
            )['dataset'], Config.get()['model']['type'], cv_fold, cv_ratio)
            print(name)
            register_trainable(name, lambda augs, rpt: eval_tta(
                copy.deepcopy(copied_conf), augs, rpt)
                               )  # augs: a dict, just like the 'exp_config'
            algo = HyperOptSearch(space,
                                  max_concurrent=4 * 20,
                                  reward_attr=reward_attr)

            exp_config = {
                name: {
                    'run': name,
                    'num_samples': 4 if smoke_test else num_search,
                    'resources_per_trial': {
                        'gpu': 1
                    },
                    'stop': {
                        'training_iteration': num_policy
                    },
                    'config': {
                        'dataroot': DATASET_ROOT,
                        'save_path': MODEL_PATHS[cv_fold],
                        'cv_ratio_test': cv_ratio,
                        'cv_fold': cv_fold,
                        'num_op': num_op,
                        'num_policy': num_policy
                    },
                }
            }
            results = run_experiments(exp_config,
                                      search_alg=algo,
                                      scheduler=None,
                                      verbose=0,
                                      queue_trials=True,
                                      resume=resume,
                                      raise_on_failed_trial=False)
            print()
            results = [x for x in results if x.last_result is not None]
            results = sorted(results,
                             key=lambda x: x.last_result[reward_attr],
                             reverse=True)

            # calculate computation usage
            for result in results:
                total_computation += result.last_result['elapsed_time']

            for result in results[:num_result_per_fold]:
                final_policy = policy_decoder(result.config, num_policy,
                                              num_op)
                logger.info('loss=%.12f top1_valid=%.4f %s' %
                            (result.last_result['minus_loss'],
                             result.last_result['top1_valid'], final_policy))

                final_policy = remove_deplicates(final_policy)
                final_policy_set.extend(final_policy)

    logger.info(json.dumps(final_policy_set))
    logger.info('final_policy=%d' % len(final_policy_set))
    logger.info('processed in %.4f secs, gpu hours=%.4f' %
                (watcher.pause('search'), total_computation / 3600.))

    return final_policy_set
Example 9
    def __init__(self, args=None, paths_ls=None):
        if args is None:
            with open(
                    '/home/noam/ZazuML/augmentations_tuner/fastautoaugment/confs/resnet50.yaml'
            ) as f:
                d = yaml.load(f, Loader=yaml.FullLoader)
            from argparse import Namespace
            args = Namespace(**d)
        args.redis = 'gpu-cloud-vnode30.dakao.io:23655'
        args.per_class = True
        args.resume = True
        args.smoke_test = True

        if args.decay > 0:
            logger.info('decay=%.4f' % args.decay)
            C.get()['optimizer']['decay'] = args.decay

        add_filehandler(
            logger,
            os.path.join(
                'FastAutoAugment/models', '%s_%s_cv%.1f.log' %
                (C.get()['dataset'], C.get()['model']['type'], args.cv_ratio)))

        logger.info('initialize ray...')
        ray.init(num_cpus=1, num_gpus=1)

        num_result_per_cv = 10 if not args.smoke_test else 2
        cv_num = 5 if paths_ls is None else len(paths_ls)
        args.version = 1
        args._timestamp = '2020/08/30 20:40:10'
        args.config = '/home/noam/ZazuML/augmentations_tuner/fastautoaugment/confs/resnet50.yaml'

        copied_c = copy.deepcopy(args)
        self.copied_c = copied_c

        logger.info('search augmentation policies, dataset=%s model=%s' %
                    (C.get()['dataset'], C.get()['model']['type']))
        logger.info(
            '----- Train without Augmentations ratio(test)=%.1f -----' %
            (args.cv_ratio))
        w.start(tag='train_no_aug')
        pretrain_results = []
        if paths_ls is None:
            paths_ls = [
                _get_path(C.get()['dataset'],
                          C.get()['model']['type'],
                          'ratio%.1f_fold%d' % (args.cv_ratio, i))
                for i in range(cv_num)
            ]
            print(paths_ls)
            logger.info('getting results...')
            pretrain_results = [
                train_model(copy.deepcopy(copied_c),
                            args.dataroot,
                            C.get()['aug'],
                            args.cv_ratio,
                            i,
                            save_path=paths_ls[i],
                            skip_exist=args.smoke_test) for i in range(cv_num)
            ]

        for r_model, r_cv, r_dict in pretrain_results:
            logger.info('model=%s cv=%d top1_train=%.4f top1_valid=%.4f' %
                        (r_model, r_cv + 1, r_dict['top1_train'],
                         r_dict['top1_valid']))
        logger.info('processed in %.4f secs' % w.pause('train_no_aug'))

        if args.until == 1:
            sys.exit(0)

        logger.info('----- Search Test-Time Augmentation Policies -----')
        w.start(tag='search')

        ops = augment_list(False)
        space = {}
        for i in range(args.num_policy):
            for j in range(args.num_op):
                space['policy_%d_%d' % (i, j)] = hp.choice(
                    'policy_%d_%d' % (i, j), list(range(0, len(ops))))
                space['prob_%d_%d' % (i, j)] = hp.uniform(
                    'prob_%d_ %d' % (i, j), 0.0, 1.0)
                space['level_%d_%d' % (i, j)] = hp.uniform(
                    'level_%d_ %d' % (i, j), 0.0, 1.0)

        def eval_t(augs):
            print(augs)
            return eval_tta(copy.deepcopy(copied_c), augs)

        final_policy_set = []
        total_computation = 0
        reward_attr = 'top1_valid'  # top1_valid or minus_loss
        for _ in range(1):  # single pass; increase the range to repeat the whole search
            for cv_fold in range(cv_num):
                name = "search_%s_%s_fold%d_ratio%.1f" % (C.get(
                )['dataset'], C.get()['model']['type'], cv_fold, args.cv_ratio)
                print(name)
                algo = HyperOptSearch(space,
                                      max_concurrent=1,
                                      metric=reward_attr)
                aug_config = {
                    'dataroot': args.dataroot,
                    'save_path': paths_ls[cv_fold],
                    'cv_ratio_test': args.cv_ratio,
                    'cv_fold': cv_fold,
                    'num_op': args.num_op,
                    'num_policy': args.num_policy
                }
                num_samples = 4 if args.smoke_test else args.num_search
                print(aug_config)
                eval_t(aug_config)
                results = run(eval_t,
                              search_alg=algo,
                              config=aug_config,
                              num_samples=num_samples,
                              resources_per_trial={'gpu': 1},
                              stop={'training_iteration': args.num_policy})
                dataframe = results.dataframe().sort_values(reward_attr,
                                                            ascending=False)
                total_computation = dataframe['elapsed_time'].sum()
                for i in range(num_result_per_cv):
                    config_dict = dataframe.loc[i].filter(
                        like='config').to_dict()
                    new_keys = [
                        x.replace('config/', '') for x in config_dict.keys()
                    ]
                    new_config_dict = {}
                    for key in new_keys:
                        new_config_dict[key] = config_dict['config/' + key]
                    final_policy = policy_decoder(new_config_dict,
                                                  args.num_policy, args.num_op)
                    logger.info(
                        'loss=%.12f top1_valid=%.4f %s' %
                        (dataframe.loc[i]['minus_loss'].item(),
                         dataframe.loc[i]['top1_valid'].item(), final_policy))

                    final_policy = remove_deplicates(final_policy)
                    final_policy_set.extend(final_policy)

        logger.info(json.dumps(final_policy_set))
        logger.info('final_policy=%d' % len(final_policy_set))
        logger.info('processed in %.4f secs, gpu hours=%.4f' %
                    (w.pause('search'), total_computation / 3600.))
        logger.info(
            '----- Train with Augmentations model=%s dataset=%s aug=%s ratio(test)=%.1f -----'
            % (C.get()['model']['type'], C.get()['dataset'], C.get()['aug'],
               args.cv_ratio))
        w.start(tag='train_aug')
        self.final_policy_set = final_policy_set
        self.args = args
        self.paths_ls = paths_ls
Example 10
def eval_tta(config, augment, reporter):
    C.get()
    C.get().conf = config
    save_path = augment['save_path']
    cv_id, gr_id = augment["cv_id"], augment["gr_id"]
    gr_ids = augment["gr_ids"]

    # setup - provided augmentation rules
    C.get()['aug'] = policy_decoder(augment, augment['num_policy'],
                                    augment['num_op'])

    # eval
    model = get_model(C.get()['model'], num_class(C.get()['dataset']))
    ckpt = torch.load(save_path)
    if 'model' in ckpt:
        model.load_state_dict(ckpt['model'])
    else:
        model.load_state_dict(ckpt)
    model.eval()

    loaders = []
    for _ in range(augment['num_policy']):  # TODO
        loader = get_post_dataloader(C.get()["dataset"],
                                     C.get()['batch'], augment["dataroot"],
                                     augment['cv_ratio_test'], cv_id, gr_id,
                                     gr_ids)
        loaders.append(iter(loader))

    start_t = time.time()
    metrics = Accumulator()
    loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
    try:
        while True:
            losses = []
            corrects = []
            for loader in loaders:
                data, label = next(loader)
                data = data.cuda()
                label = label.cuda()

                pred = model(data)

                loss = loss_fn(pred, label)
                losses.append(loss.detach().cpu().numpy().reshape(1,
                                                                  -1))  # (1,N)

                _, pred = pred.topk(1, 1, True, True)
                pred = pred.t()
                correct = pred.eq(label.view(
                    1, -1).expand_as(pred)).detach().cpu().numpy()  # (1,N)
                corrects.append(correct)
                del loss, correct, pred, data, label

            losses = np.concatenate(losses)
            losses_min = np.min(losses, axis=0).squeeze()  # (N,)

            corrects = np.concatenate(corrects)
            corrects_max = np.max(corrects, axis=0).squeeze()  # (N,)
            metrics.add_dict({
                'loss': np.sum(losses_min),
                'correct': np.sum(corrects_max),
                'cnt': corrects_max.size
            })
            del corrects, corrects_max
    except StopIteration:
        pass

    del model
    metrics = metrics / 'cnt'
    gpu_secs = (time.time() - start_t) * torch.cuda.device_count()
    reporter(loss=metrics['loss'],
             top1_valid=metrics['correct'],
             elapsed_time=gpu_secs,
             done=True)
    return metrics['correct']
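
The pooling above keeps, for every validation sample, the smallest loss and the highest correctness across the num_policy loaders; stacking the (1, N) rows into (num_policy, N) makes the per-sample reduction explicit. A toy illustration with two policies and three samples:

import numpy as np

losses = [np.array([[0.9, 0.2, 1.5]]),   # policy 0, shape (1, N)
          np.array([[0.4, 0.7, 1.1]])]   # policy 1, shape (1, N)
corrects = [np.array([[0, 1, 0]]),
            np.array([[1, 1, 0]])]

losses_min = np.min(np.concatenate(losses), axis=0)      # [0.4, 0.2, 1.1]
corrects_max = np.max(np.concatenate(corrects), axis=0)  # [1, 1, 0]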
Example 11
def search(args, paths=None):
    args.redis = 'gpu-cloud-vnode30.dakao.io:23655'
    args.per_class = True
    args.resume = True
    args.smoke_test = True

    if args.decay > 0:
        logger.info('decay=%.4f' % args.decay)
        C.get()['optimizer']['decay'] = args.decay

    add_filehandler(
        logger,
        os.path.join(
            'FastAutoAugment/models', '%s_%s_cv%.1f.log' %
            (C.get()['dataset'], C.get()['model']['type'], args.cv_ratio)))
    logger.info('configuration...')
    logger.info(json.dumps(C.get().conf, sort_keys=True, indent=4))
    logger.info('initialize ray...')
    ray.init(num_cpus=1, num_gpus=1)

    num_result_per_cv = 10 if not args.smoke_test else 2
    cv_num = 5
    copied_c = copy.deepcopy(C.get().conf)

    logger.info('search augmentation policies, dataset=%s model=%s' %
                (C.get()['dataset'], C.get()['model']['type']))
    logger.info(
        '----- Train without Augmentations cv=%d ratio(test)=%.1f -----' %
        (cv_num, args.cv_ratio))
    w.start(tag='train_no_aug')
    if paths is None:
        paths = [
            _get_path(C.get()['dataset'],
                      C.get()['model']['type'],
                      'ratio%.1f_fold%d' % (args.cv_ratio, i))
            for i in range(cv_num)
        ]
    print(paths)
    tqdm_epoch = tqdm(range(C.get()['epoch']))

    logger.info('getting results...')
    # pretrain_results = [
    #     train_model(copy.deepcopy(copied_c), args.dataroot, C.get()['aug'], args.cv_ratio, i, save_path=paths[i],
    #                 skip_exist=True) for i in range(cv_num)]
    pretrain_results = [
        train_model(copy.deepcopy(copied_c),
                    args.dataroot,
                    C.get()['aug'],
                    args.cv_ratio,
                    i,
                    save_path=paths[i]) for i in range(cv_num)
    ]
    for r_model, r_cv, r_dict in pretrain_results:
        logger.info(
            'model=%s cv=%d top1_train=%.4f top1_valid=%.4f' %
            (r_model, r_cv + 1, r_dict['top1_train'], r_dict['top1_valid']))
    logger.info('processed in %.4f secs' % w.pause('train_no_aug'))

    if args.until == 1:
        sys.exit(0)

    logger.info('----- Search Test-Time Augmentation Policies -----')
    w.start(tag='search')

    ops = augment_list(False)
    space = {}
    for i in range(args.num_policy):
        for j in range(args.num_op):
            space['policy_%d_%d' % (i, j)] = hp.choice(
                'policy_%d_%d' % (i, j), list(range(0, len(ops))))
            space['prob_%d_%d' % (i, j)] = hp.uniform('prob_%d_ %d' % (i, j),
                                                      0.0, 1.0)
            space['level_%d_%d' % (i, j)] = hp.uniform('level_%d_ %d' % (i, j),
                                                       0.0, 1.0)

    def eval_t(augs):
        print(augs)
        return eval_tta(copy.deepcopy(copied_c), augs)

    final_policy_set = []
    total_computation = 0
    reward_attr = 'top1_valid'  # top1_valid or minus_loss
    for _ in range(1):  # single pass; increase the range to repeat the whole search
        for cv_fold in range(cv_num):
            name = "search_%s_%s_fold%d_ratio%.1f" % (C.get()['dataset'],
                                                      C.get()['model']['type'],
                                                      cv_fold, args.cv_ratio)
            print(name)
            algo = HyperOptSearch(space, max_concurrent=1, metric=reward_attr)
            aug_config = {
                'dataroot': args.dataroot,
                'save_path': paths[cv_fold],
                'cv_ratio_test': args.cv_ratio,
                'cv_fold': cv_fold,
                'num_op': args.num_op,
                'num_policy': args.num_policy
            }
            num_samples = 4 if args.smoke_test else args.num_search
            print(aug_config)
            eval_t(aug_config)
            results = run(eval_t,
                          search_alg=algo,
                          config=aug_config,
                          num_samples=num_samples,
                          resources_per_trial={'gpu': 1},
                          stop={'training_iteration': args.num_policy})
            dataframe = results.dataframe().sort_values(reward_attr,
                                                        ascending=False)
            total_computation = dataframe['elapsed_time'].sum()
            for i in range(num_result_per_cv):
                config_dict = dataframe.loc[i].filter(like='config').to_dict()
                new_keys = [
                    x.replace('config/', '') for x in config_dict.keys()
                ]
                new_config_dict = {}
                for key in new_keys:
                    new_config_dict[key] = config_dict['config/' + key]
                final_policy = policy_decoder(new_config_dict, args.num_policy,
                                              args.num_op)
                logger.info(
                    'loss=%.12f top1_valid=%.4f %s' %
                    (dataframe.loc[i]['minus_loss'].item(),
                     dataframe.loc[i]['top1_valid'].item(), final_policy))

                final_policy = remove_deplicates(final_policy)
                final_policy_set.extend(final_policy)

    logger.info(json.dumps(final_policy_set))
    logger.info('final_policy=%d' % len(final_policy_set))
    logger.info('processed in %.4f secs, gpu hours=%.4f' %
                (w.pause('search'), total_computation / 3600.))
    logger.info(
        '----- Train with Augmentations model=%s dataset=%s aug=%s ratio(test)=%.1f -----'
        % (C.get()['model']['type'], C.get()['dataset'], C.get()['aug'],
           args.cv_ratio))
    w.start(tag='train_aug')

    num_experiments = 5
    default_path = [
        _get_path(C.get()['dataset'],
                  C.get()['model']['type'],
                  'ratio%.1f_default%d' % (args.cv_ratio, _))
        for _ in range(num_experiments)
    ]
    augment_path = [
        _get_path(C.get()['dataset'],
                  C.get()['model']['type'],
                  'ratio%.1f_augment%d' % (args.cv_ratio, _))
        for _ in range(num_experiments)
    ]
    tqdm_epoch = tqdm(range(C.get()['epoch']))
    is_done = False
    for epoch in tqdm_epoch:
        while True:
            epochs = OrderedDict()
            for exp_idx in range(num_experiments):
                try:
                    if os.path.exists(default_path[exp_idx]):
                        latest_ckpt = torch.load(default_path[exp_idx])
                        if 'epoch' not in latest_ckpt:
                            epochs['default_exp%d' %
                                   (exp_idx + 1)] = C.get()['epoch']
                        else:
                            epochs['default_exp%d' %
                                   (exp_idx + 1)] = latest_ckpt['epoch']
                except Exception as e:
                    pass
                try:
                    if os.path.exists(augment_path[exp_idx]):
                        latest_ckpt = torch.load(augment_path[exp_idx])
                        if 'epoch' not in latest_ckpt:
                            epochs['augment_exp%d' %
                                   (exp_idx + 1)] = C.get()['epoch']
                        else:
                            epochs['augment_exp%d' %
                                   (exp_idx + 1)] = latest_ckpt['epoch']
                except Exception:
                    pass

            tqdm_epoch.set_postfix(epochs)
            if len(epochs) == num_experiments * 2 and min(
                    epochs.values()) >= C.get()['epoch']:
                is_done = True
            if len(epochs) == num_experiments * 2 and min(
                    epochs.values()) >= epoch:
                break
            time.sleep(10)  # sleep 10 seconds between checkpoint polls
        if is_done:
            break

    logger.info('getting results...')
    final_results = [train_model(copy.deepcopy(copied_c), args.dataroot, C.get()['aug'], 0.0, 0,
                                 save_path=default_path[_], skip_exist=True) for _ in range(num_experiments)] + \
                    [train_model(copy.deepcopy(copied_c), args.dataroot, final_policy_set, 0.0, 0,
                                 save_path=augment_path[_]) for _ in range(num_experiments)]

    for train_mode in ['default', 'augment']:
        avg = 0.
        for _ in range(num_experiments):
            r_model, r_cv, r_dict = final_results.pop(0)
            logger.info(
                '[%s] top1_train=%.4f top1_test=%.4f' %
                (train_mode, r_dict['top1_train'], r_dict['top1_test']))
            avg += r_dict['top1_test']
        avg /= num_experiments
        logger.info('[%s] top1_test average=%.4f (#experiments=%d)' %
                    (train_mode, avg, num_experiments))
    logger.info('processed in %.4f secs' % w.pause('train_aug'))

    logger.info(w)
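
For completeness, a hypothetical way to drive search() above; the Namespace fields mirror the argparse options from Example 4 and every value here is illustrative:

from argparse import Namespace

if __name__ == '__main__':
    args = Namespace(dataroot='/data/torchvision',  # illustrative path
                     until=5, num_op=2, num_policy=5, num_search=200,
                     cv_ratio=0.4, decay=-1,
                     per_class=False, resume=False, smoke_test=True,
                     redis='')
    search(args)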