Example #1
def parse_args(cloud_args=None):
    """parameters"""
    parser = argparse.ArgumentParser('mindspore classification training')
    parser.add_argument('--platform', type=str, default='Ascend', choices=('Ascend', 'GPU'), help='run platform')

    # dataset related
    parser.add_argument('--data_dir', type=str, default='', help='train data dir')
    parser.add_argument('--per_batch_size', default=128, type=int, help='batch size per GPU')
    # network related
    parser.add_argument('--pretrained', default='', type=str, help='model_path, local pretrained model to load')

    # distributed related
    parser.add_argument('--is_distributed', type=int, default=1, help='whether to run on multiple devices')
    # roma obs
    parser.add_argument('--train_url', type=str, default="", help='train url')

    args, _ = parser.parse_known_args()
    args = merge_args(args, cloud_args)
    args.image_size = config.image_size
    args.num_classes = config.num_classes
    args.lr = config.lr
    args.lr_scheduler = config.lr_scheduler
    args.lr_epochs = config.lr_epochs
    args.lr_gamma = config.lr_gamma
    args.eta_min = config.eta_min
    args.T_max = config.T_max
    args.max_epoch = config.max_epoch
    args.warmup_epochs = config.warmup_epochs
    args.weight_decay = config.weight_decay
    args.momentum = config.momentum
    args.is_dynamic_loss_scale = config.is_dynamic_loss_scale
    args.loss_scale = config.loss_scale
    args.label_smooth = config.label_smooth
    args.label_smooth_factor = config.label_smooth_factor
    args.ckpt_interval = config.ckpt_interval
    args.ckpt_save_max = config.ckpt_save_max
    args.ckpt_path = config.ckpt_path
    args.is_save_on_master = config.is_save_on_master
    args.rank = config.rank
    args.group_size = config.group_size
    args.lr_epochs = list(map(int, args.lr_epochs.split(',')))
    args.image_size = list(map(int, args.image_size.split(',')))

    context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True,
                        device_target=args.platform, save_graphs=False)
    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()
    else:
        args.rank = 0
        args.group_size = 1

    if args.is_dynamic_loss_scale == 1:
        args.loss_scale = 1  # with dynamic loss scaling, the Momentum optimizer must not apply a fixed loss scale

    # select whether only the master rank saves ckpt or all ranks save; compatible with model parallel
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if args.rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1

    # logger
    args.outputs_dir = os.path.join(args.ckpt_path,
                                    datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)
    return args
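

# merge_args is called above but not defined in this snippet. A minimal
# sketch, assuming cloud_args is an argparse-style namespace whose non-empty
# values override the locally parsed ones (an assumption, not the original
# implementation):
def merge_args(args, cloud_args):
    """Overwrite parsed args with any non-empty values from cloud_args."""
    if cloud_args:
        for key, val in vars(cloud_args).items():
            if val not in (None, ''):
                setattr(args, key, val)
    return args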
Example #2
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args.device_target)

    # select whether only the master rank saves ckpt or all ranks save; compatible with model parallel
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if args.rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1

    # logger
    args.outputs_dir = os.path.join(
        args.ckpt_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)

    if args.dataset == "cifar10":
        dataset = vgg_create_dataset(args.data_path, args.image_size,
                                     args.per_batch_size, args.rank,
                                     args.group_size)
    else:
        dataset = classification_dataset(args.data_path, args.image_size,
                                         args.per_batch_size, args.rank,
                                         args.group_size)

    batch_num = dataset.get_dataset_size()
    args.steps_per_epoch = batch_num
    args.logger.save_args(args)

    # network
Example #3
        "n_graphs": len(graph_list),
        "classes": len(set(labels)),
        "density": avg_density(graph_list),
        "avg_nodes": avg_nodes(graph_list),
        "avg_edges": avg_edges(graph_list),
        "max_nodes": max_nodes(graph_list),
        "avg_degree": avg_degree(graph_list),
        "avg_max_degree": avg_max_degree(graph_list),
        "max_degree": max_degree(graph_list),
        "class_ratio": class_ratio(labels),
    }
    return stats
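

# The aggregation helpers used above (avg_nodes, avg_degree, ...) are not part
# of this snippet. Minimal sketches of two of them, assuming graph_list holds
# networkx-style graphs (an assumption, not the original code):
def avg_nodes(graph_list):
    """Mean number of nodes per graph."""
    return sum(g.number_of_nodes() for g in graph_list) / len(graph_list)


def avg_degree(graph_list):
    """Mean node degree over all graphs."""
    degrees = [d for g in graph_list for _, d in g.degree()]
    return sum(degrees) / len(degrees)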


if __name__ == "__main__":
    logger = logging.get_logger()

    out_dir = "./reporting"
    utils.make_dirs_checked(out_dir)

    # list of all datasets (expected under args.data_dir)

    datasets = [
        "MUTAG", "PTC", "IMDBBINARY", "IMDBMULTI", "PROTEINS", "NCI1", "COLLAB"
    ]
    logger.info(f"Computing summary statistics for datasets {datasets}")
    # list collecting one stats dictionary per dataset
    stats_list = []

    # iterating over all datasets
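    # The loop body is truncated here. A plausible continuation, assuming a
    # hypothetical loader load_dataset(name) -> (graph_list, labels) and that
    # the function returning the stats dict above is named dataset_stats
    # (both names are illustrative, not from the original source):
    for name in datasets:
        graph_list, labels = load_dataset(name)
        stats_list.append(dataset_stats(graph_list, labels))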
Example #4
def parse_args(cloud_args=None):
    """parse_args"""
    parser = argparse.ArgumentParser('mindspore classification test')
    parser.add_argument('--platform',
                        type=str,
                        default='Ascend',
                        choices=('Ascend', 'GPU'),
                        help='run platform')

    # dataset related
    parser.add_argument('--data_dir',
                        type=str,
                        default='/opt/npu/datasets/classification/val',
                        help='eval data dir')
    parser.add_argument('--per_batch_size',
                        default=32,
                        type=int,
                        help='batch size per NPU')
    # network related
    parser.add_argument('--graph_ckpt',
                        action='store_true',
                        default=True,
                        help='graph ckpt or feed ckpt')
    parser.add_argument('--pretrained',
                        default='',
                        type=str,
                        help='full path of the pretrained model to load. '
                        'If it is a directory, all ckpt files in it are tested')

    # logging related
    parser.add_argument('--log_path',
                        type=str,
                        default='outputs/',
                        help='path to save log')
    parser.add_argument('--is_distributed',
                        action='store_true',
                        default=False,
                        help='whether to run on multiple devices')

    # roma obs
    parser.add_argument('--train_url', type=str, default="", help='train url')

    args, _ = parser.parse_known_args()
    args = merge_args(args, cloud_args)
    args.image_size = config.image_size
    args.num_classes = config.num_classes
    args.rank = config.rank
    args.group_size = config.group_size

    args.image_size = list(map(int, args.image_size.split(',')))

    # init distributed
    if args.is_distributed:
        if args.platform == "Ascend":
            init()
        elif args.platform == "GPU":
            init("nccl")
        args.rank = get_rank()
        args.group_size = get_group_size()
    else:
        args.rank = 0
        args.group_size = 1

    args.outputs_dir = os.path.join(
        args.log_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))

    args.logger = get_logger(args.outputs_dir, args.rank)
    return args
Example #5
def test(cloud_args=None):
    """test"""
    args = parse_args(cloud_args)
    context.set_context(mode=context.GRAPH_MODE,
                        enable_auto_mixed_precision=True,
                        device_target=args.device_target,
                        save_graphs=False)
    if os.getenv('DEVICE_ID',
                 "not_set").isdigit() and args.device_target == "Ascend":
        context.set_context(device_id=int(os.getenv('DEVICE_ID')))

    args.outputs_dir = os.path.join(
        args.log_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))

    args.logger = get_logger(args.outputs_dir, args.rank)
    args.logger.save_args(args)

    if args.dataset == "cifar10":
        net = vgg19(num_classes=args.num_classes, args=args)
        opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                       0.01,
                       args.momentum,
                       weight_decay=args.weight_decay)
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
        model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})

        param_dict = load_checkpoint(args.pre_trained)
        load_param_into_net(net, param_dict)
        net.set_train(False)
        dataset = vgg_create_dataset(args.data_path,
                                     args.image_size,
                                     args.per_batch_size,
                                     training=False)
        res = model.eval(dataset)
        print("result: ", res)
    else:
        # network
        args.logger.important_info('start create network')
        if os.path.isdir(args.pre_trained):
            models = list(glob.glob(os.path.join(args.pre_trained, '*.ckpt')))
            print(models)
            if args.graph_ckpt:
                f = lambda x: -1 * int(os.path.splitext(os.path.split(x)[-1])[0]
                                       .split('-')[-1].split('_')[0])
            else:
                f = lambda x: -1 * int(
                    os.path.splitext(os.path.split(x)[-1])[0].split('_')[-1])
            args.models = sorted(models, key=f)
        else:
            args.models = [
                args.pre_trained,
            ]

        for model in args.models:
            dataset = classification_dataset(args.data_path,
                                             args.image_size,
                                             args.per_batch_size,
                                             mode='eval')
            eval_dataloader = dataset.create_tuple_iterator(output_numpy=True,
                                                            num_epochs=1)
            network = vgg19(args.num_classes, args, phase="test")

            # pre_trained
            load_param_into_net(network, load_checkpoint(model))
            network.add_flags_recursive(fp16=True)

            img_tot = 0
            top1_correct = 0
            top5_correct = 0

            network.set_train(False)
            t_end = time.time()
            it = 0
            for data, gt_classes in eval_dataloader:
                output = network(Tensor(data, mstype.float32))
                output = output.asnumpy()

                top1_output = np.argmax(output, (-1))
                top5_output = np.argsort(output)[:, -5:]

                t1_correct = np.equal(top1_output, gt_classes).sum()
                top1_correct += t1_correct
                top5_correct += get_top5_acc(top5_output, gt_classes)
                img_tot += args.per_batch_size

                if args.rank == 0 and it == 0:
                    t_end = time.time()
                    it = 1
            if args.rank == 0:
                time_used = time.time() - t_end
                fps = (img_tot -
                       args.per_batch_size) * args.group_size / time_used
                args.logger.info(
                    'Inference Performance: {:.2f} img/sec'.format(fps))
            results = [[top1_correct], [top5_correct], [img_tot]]
            args.logger.info('before results={}'.format(results))
            results = np.array(results)

            args.logger.info('after results={}'.format(results))
            top1_correct = results[0, 0]
            top5_correct = results[1, 0]
            img_tot = results[2, 0]
            acc1 = 100.0 * top1_correct / img_tot
            acc5 = 100.0 * top5_correct / img_tot
            args.logger.info('after allreduce eval: top1_correct={}, tot={}, '
                             'acc={:.2f}%(TOP1)'.format(
                                 top1_correct, img_tot, acc1))
            args.logger.info('after allreduce eval: top5_correct={}, tot={}, '
                             'acc={:.2f}%(TOP5)'.format(
                                 top5_correct, img_tot, acc5))
Example #6
from pathlib import Path
from typing import Optional

from src.utils.decorators import timespan
from src.utils.logging import get_logger
from src.utils.params import ParamsBuilder
from tools.evaluate import main as evaluate
from tools.predict import main as predict
from tools.train import main as train

logger = get_logger("Pipeline")


@timespan("Pipeline")
def main(
    config_path: Path,
    dataset_path: Path,
    predict_path: Path,
    input_path: Path,
    evaluate_path: Optional[Path] = None,
) -> None:
    """
    Main function responsible for prediction with the passed model.

    Arguments:
        Path config_path: Path to main config (of :class:`DefaultConfig` class)
        Path dataset_path: Path to dataset
        Path predict_path: Path to file with model predictions
        Path input_path: Path to file with input data
        Path evaluate_path: Path to evaluations
    """
Example #7
def test(cloud_args=None):
    """test"""
    args = parse_args(cloud_args)
    context.set_context(mode=context.GRAPH_MODE,
                        enable_auto_mixed_precision=True,
                        device_target=args.platform,
                        save_graphs=False)
    if os.getenv('DEVICE_ID', "not_set").isdigit():
        context.set_context(device_id=int(os.getenv('DEVICE_ID')))

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                          device_num=args.group_size,
                                          parameter_broadcast=True,
                                          gradients_mean=True)
    else:
        args.rank = 0
        args.group_size = 1

    args.outputs_dir = os.path.join(
        args.log_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))

    args.logger = get_logger(args.outputs_dir, args.rank)
    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    if os.path.isdir(args.pretrained):
        models = list(glob.glob(os.path.join(args.pretrained, '*.ckpt')))
        print(models)
        if args.graph_ckpt:
            f = lambda x: -1 * int(os.path.splitext(os.path.split(x)[-1])[0]
                                   .split('-')[-1].split('_')[0])
        else:
            f = lambda x: -1 * int(
                os.path.splitext(os.path.split(x)[-1])[0].split('_')[-1])
        args.models = sorted(models, key=f)
    else:
        args.models = [
            args.pretrained,
        ]

    for model in args.models:
        de_dataset = classification_dataset(args.data_dir,
                                            image_size=args.image_size,
                                            per_batch_size=args.per_batch_size,
                                            max_epoch=1,
                                            rank=args.rank,
                                            group_size=args.group_size,
                                            mode='eval')
        eval_dataloader = de_dataset.create_tuple_iterator(output_numpy=True)
        network = get_network(args.backbone,
                              args.num_classes,
                              platform=args.platform)
        if network is None:
            raise NotImplementedError('backbone not implemented: {}'.format(args.backbone))

        param_dict = load_checkpoint(model)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values

        load_param_into_net(network, param_dict_new)
        args.logger.info('load model {} success'.format(model))

        img_tot = 0
        top1_correct = 0
        top5_correct = 0
        if args.platform == "Ascend":
            network.to_float(mstype.float16)
        else:
            auto_mixed_precision(network)
        network.set_train(False)
        t_end = time.time()
        it = 0
        for data, gt_classes in eval_dataloader:
            output = network(Tensor(data, mstype.float32))
            output = output.asnumpy()

            top1_output = np.argmax(output, (-1))
            top5_output = np.argsort(output)[:, -5:]

            t1_correct = np.equal(top1_output, gt_classes).sum()
            top1_correct += t1_correct
            top5_correct += get_top5_acc(top5_output, gt_classes)
            img_tot += args.per_batch_size

            if args.rank == 0 and it == 0:
                t_end = time.time()
                it = 1
        if args.rank == 0:
            time_used = time.time() - t_end
            fps = (img_tot - args.per_batch_size) * args.group_size / time_used
            args.logger.info(
                'Inference Performance: {:.2f} img/sec'.format(fps))
        results = [[top1_correct], [top5_correct], [img_tot]]
        args.logger.info('before results={}'.format(results))
        if args.is_distributed:
            model_md5 = model.replace('/', '')
            tmp_dir = '/cache'
            if not os.path.exists(tmp_dir):
                os.mkdir(tmp_dir)
            top1_correct_npy = '/cache/top1_rank_{}_{}.npy'.format(
                args.rank, model_md5)
            top5_correct_npy = '/cache/top5_rank_{}_{}.npy'.format(
                args.rank, model_md5)
            img_tot_npy = '/cache/img_tot_rank_{}_{}.npy'.format(
                args.rank, model_md5)
            np.save(top1_correct_npy, top1_correct)
            np.save(top5_correct_npy, top5_correct)
            np.save(img_tot_npy, img_tot)
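            # Filesystem barrier: this rank has saved its partial counters
            # above; poll until the .npy files from every rank exist before
            # aggregating. Note this loop spins without sleeping.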
            while True:
                rank_ok = True
                for other_rank in range(args.group_size):
                    top1_correct_npy = '/cache/top1_rank_{}_{}.npy'.format(
                        other_rank, model_md5)
                    top5_correct_npy = '/cache/top5_rank_{}_{}.npy'.format(
                        other_rank, model_md5)
                    img_tot_npy = '/cache/img_tot_rank_{}_{}.npy'.format(
                        other_rank, model_md5)
                    if not os.path.exists(top1_correct_npy) or not os.path.exists(top5_correct_npy) or \
                       not os.path.exists(img_tot_npy):
                        rank_ok = False
                if rank_ok:
                    break

            top1_correct_all = 0
            top5_correct_all = 0
            img_tot_all = 0
            for other_rank in range(args.group_size):
                top1_correct_npy = '/cache/top1_rank_{}_{}.npy'.format(
                    other_rank, model_md5)
                top5_correct_npy = '/cache/top5_rank_{}_{}.npy'.format(
                    other_rank, model_md5)
                img_tot_npy = '/cache/img_tot_rank_{}_{}.npy'.format(
                    other_rank, model_md5)
                top1_correct_all += np.load(top1_correct_npy)
                top5_correct_all += np.load(top5_correct_npy)
                img_tot_all += np.load(img_tot_npy)
            results = [[top1_correct_all], [top5_correct_all], [img_tot_all]]
            results = np.array(results)
        else:
            results = np.array(results)

        args.logger.info('after results={}'.format(results))
        top1_correct = results[0, 0]
        top5_correct = results[1, 0]
        img_tot = results[2, 0]
        acc1 = 100.0 * top1_correct / img_tot
        acc5 = 100.0 * top5_correct / img_tot
        args.logger.info('after allreduce eval: top1_correct={}, tot={}, '
                         'acc={:.2f}%(TOP1)'.format(top1_correct, img_tot,
                                                    acc1))
        args.logger.info('after allreduce eval: top5_correct={}, tot={}, '
                         'acc={:.2f}%(TOP5)'.format(top5_correct, img_tot,
                                                    acc5))
    if args.is_distributed:
        release()
Example #8
def test(cloud_args=None):
    """
    Network eval function. Computes top-1 and top-5 classification accuracy.
    The result is saved under ./outputs by default.
    """
    args = parse_args(cloud_args)

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()

    args.outputs_dir = os.path.join(
        args.log_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))

    args.logger = get_logger(args.outputs_dir, args.rank)
    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    if os.path.isdir(args.pretrained):
        models = list(glob.glob(os.path.join(args.pretrained, '*.ckpt')))

        f = lambda x: -1 * int(os.path.splitext(os.path.split(x)[-1])[0]
                               .split('-')[-1].split('_')[0])

        args.models = sorted(models, key=f)
    else:
        args.models = [
            args.pretrained,
        ]

    for model in args.models:
        de_dataset = classification_dataset(args.data_dir,
                                            image_size=args.image_size,
                                            per_batch_size=args.per_batch_size,
                                            max_epoch=1,
                                            rank=args.rank,
                                            group_size=args.group_size,
                                            mode='eval')
        eval_dataloader = de_dataset.create_tuple_iterator()
        network = DenseNet121(args.num_classes)

        param_dict = load_checkpoint(model)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load model {} success'.format(model))

        network.add_flags_recursive(fp16=True)

        img_tot = 0
        top1_correct = 0
        top5_correct = 0
        network.set_train(False)
        for data, gt_classes in eval_dataloader:
            output = network(Tensor(data, mstype.float32))
            output = output.asnumpy()
            gt_classes = gt_classes.asnumpy()

            top1_output = np.argmax(output, (-1))
            top5_output = np.argsort(output)[:, -5:]

            t1_correct = np.equal(top1_output, gt_classes).sum()
            top1_correct += t1_correct
            top5_correct += get_top5_acc(top5_output, gt_classes)
            img_tot += args.per_batch_size

        results = [[top1_correct], [top5_correct], [img_tot]]
        args.logger.info('before results={}'.format(results))
        if args.is_distributed:
            model_md5 = model.replace('/', '')
            tmp_dir = '../cache'
            if not os.path.exists(tmp_dir):
                os.mkdir(tmp_dir)
            top1_correct_npy = '{}/top1_rank_{}_{}.npy'.format(
                tmp_dir, args.rank, model_md5)
            top5_correct_npy = '{}/top5_rank_{}_{}.npy'.format(
                tmp_dir, args.rank, model_md5)
            img_tot_npy = '{}/img_tot_rank_{}_{}.npy'.format(
                tmp_dir, args.rank, model_md5)
            np.save(top1_correct_npy, top1_correct)
            np.save(top5_correct_npy, top5_correct)
            np.save(img_tot_npy, img_tot)
            while True:
                rank_ok = True
                for other_rank in range(args.group_size):
                    top1_correct_npy = '{}/top1_rank_{}_{}.npy'.format(
                        tmp_dir, other_rank, model_md5)
                    top5_correct_npy = '{}/top5_rank_{}_{}.npy'.format(
                        tmp_dir, other_rank, model_md5)
                    img_tot_npy = '{}/img_tot_rank_{}_{}.npy'.format(
                        tmp_dir, other_rank, model_md5)
                    if not os.path.exists(top1_correct_npy) or not os.path.exists(top5_correct_npy) \
                       or not os.path.exists(img_tot_npy):
                        rank_ok = False
                if rank_ok:
                    break

            top1_correct_all = 0
            top5_correct_all = 0
            img_tot_all = 0
            for other_rank in range(args.group_size):
                top1_correct_npy = '{}/top1_rank_{}_{}.npy'.format(
                    tmp_dir, other_rank, model_md5)
                top5_correct_npy = '{}/top5_rank_{}_{}.npy'.format(
                    tmp_dir, other_rank, model_md5)
                img_tot_npy = '{}/img_tot_rank_{}_{}.npy'.format(
                    tmp_dir, other_rank, model_md5)
                top1_correct_all += np.load(top1_correct_npy)
                top5_correct_all += np.load(top5_correct_npy)
                img_tot_all += np.load(img_tot_npy)
            results = [[top1_correct_all], [top5_correct_all], [img_tot_all]]
            results = np.array(results)

        else:
            results = np.array(results)

        args.logger.info('after results={}'.format(results))
        top1_correct = results[0, 0]
        top5_correct = results[1, 0]
        img_tot = results[2, 0]
        acc1 = 100.0 * top1_correct / img_tot
        acc5 = 100.0 * top5_correct / img_tot
        args.logger.info(
            'after allreduce eval: top1_correct={}, tot={}, acc={:.2f}%'.
            format(top1_correct, img_tot, acc1))
        args.logger.info(
            'after allreduce eval: top5_correct={}, tot={}, acc={:.2f}%'.
            format(top5_correct, img_tot, acc5))
    if args.is_distributed:
        release()
Example #9
def test(cloud_args=None):
    """
    Network eval function. Computes top-1 and top-5 accuracy for ImageNet,
    and top-1 accuracy for CIFAR-10.
    The result is saved under ./outputs by default.
    """
    args = parse_args(cloud_args)

    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target,
                        save_graphs=True)
    if args.device_target == 'Ascend':
        devid = int(os.getenv('DEVICE_ID'))
        context.set_context(device_id=devid)

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()

    args.outputs_dir = os.path.join(args.log_path,
                                    datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))

    args.logger = get_logger(args.outputs_dir, args.rank)
    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    if os.path.isdir(args.pretrained):
        models = list(glob.glob(os.path.join(args.pretrained, '*.ckpt')))

        f = lambda x: -1 * int(os.path.splitext(os.path.split(x)[-1])[0].split('-')[-1].split('_')[0])

        args.models = sorted(models, key=f)
    else:
        args.models = [args.pretrained,]

    if args.net == "densenet100":
        from src.network.densenet import DenseNet100 as DenseNet
    else:
        from src.network.densenet import DenseNet121 as DenseNet

    if args.dataset == "cifar10":
        from src.datasets import classification_dataset_cifar10 as classification_dataset
    else:
        from src.datasets import classification_dataset_imagenet as classification_dataset

    for model in args.models:
        de_dataset = classification_dataset(args.data_dir, image_size=args.image_size,
                                            per_batch_size=args.per_batch_size,
                                            max_epoch=1, rank=args.rank, group_size=args.group_size,
                                            mode='eval')
        eval_dataloader = de_dataset.create_tuple_iterator()
        network = DenseNet(args.num_classes)

        param_dict = load_checkpoint(model)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load model {} success'.format(model))

        if args.device_target == 'Ascend':
            network.add_flags_recursive(fp16=True)

        img_tot = 0
        top1_correct = 0
        top5_correct = 0
        network.set_train(False)
        for data, gt_classes in eval_dataloader:
            output = network(Tensor(data, mstype.float32))
            output = output.asnumpy()
            gt_classes = gt_classes.asnumpy()

            top1_output = np.argmax(output, (-1))
            top5_output = np.argsort(output)[:, -5:]

            t1_correct = np.equal(top1_output, gt_classes).sum()
            top1_correct += t1_correct
            top5_correct += get_top5_acc(top5_output, gt_classes)
            img_tot += args.per_batch_size

        results = [[top1_correct], [top5_correct], [img_tot]]
        args.logger.info('before results={}'.format(results))
        if args.is_distributed:
            results = generate_results(model, args.rank, args.group_size, top1_correct,
                                       top5_correct, img_tot)
            results = np.array(results)
        else:
            results = np.array(results)

        args.logger.info('after results={}'.format(results))
        top1_correct = results[0, 0]
        top5_correct = results[1, 0]
        img_tot = results[2, 0]
        acc1 = 100.0 * top1_correct / img_tot
        acc5 = 100.0 * top5_correct / img_tot
        args.logger.info('after allreduce eval: top1_correct={}, tot={}, acc={:.2f}%'.format(top1_correct, img_tot,
                                                                                             acc1))
        if args.dataset == 'imagenet':
            args.logger.info('after allreduce eval: top5_correct={}, tot={}, acc={:.2f}%'.format(top5_correct, img_tot,
                                                                                                 acc5))
    if args.is_distributed:
        release()
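

# generate_results is called above but not shown. A minimal sketch following
# the file-based aggregation pattern of the previous examples (an assumption,
# not necessarily the original implementation):
def generate_results(model, rank, group_size, top1_correct, top5_correct,
                     img_tot):
    """Exchange per-rank counters through .npy files and sum them."""
    model_md5 = model.replace('/', '')
    tmp_dir = '../cache'
    if not os.path.exists(tmp_dir):
        os.mkdir(tmp_dir)

    def fname(name, r):
        return '{}/{}_rank_{}_{}.npy'.format(tmp_dir, name, r, model_md5)

    for name, value in (('top1', top1_correct), ('top5', top5_correct),
                        ('img_tot', img_tot)):
        np.save(fname(name, rank), value)
    # filesystem barrier: wait until every rank has written its counters
    while not all(os.path.exists(fname(name, r))
                  for r in range(group_size)
                  for name in ('top1', 'top5', 'img_tot')):
        pass
    totals = {name: sum(np.load(fname(name, r)) for r in range(group_size))
              for name in ('top1', 'top5', 'img_tot')}
    return [[totals['top1']], [totals['top5']], [totals['img_tot']]]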
Example #10
import numpy as np
import json
import os
import pandas as pd
from tqdm import tqdm
from tabulate import tabulate

import sys

sys.path.append('.')

from src.config import config as cfg
from src.utils import utils
from src.utils import logging

logger = logging.get_logger("./logs")


def score_strings(acc_mean, acc_std):
    acc_mean = np.round(acc_mean * 100, 2)
    acc_std = np.round(acc_std * 100, 2)
    score = f'{acc_mean:.2f} ± {acc_std:.2f}'
    score_latex = f'${acc_mean:.2f} \\pm {acc_std:.2f}$'
    return score, score_latex
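

# Example: score_strings(0.8712, 0.0135) returns
# ('87.12 ± 1.35', '$87.12 \pm 1.35$').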


if __name__ == "__main__":
    config = cfg.Config()
    logger.info("-------------------------------------")
    logger.info("Evaluating experiment (a)")
    logger.info("-------------------------------------")
Example #11
def train(cloud_args=None):
    """training process"""
    args = parse_args(cloud_args)

    context.set_context(mode=context.GRAPH_MODE,
                        enable_auto_mixed_precision=True,
                        device_target=args.device_target,
                        save_graphs=False)

    if args.device_target == 'Ascend':
        devid = int(os.getenv('DEVICE_ID'))
        context.set_context(device_id=devid)

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()

    if args.is_dynamic_loss_scale == 1:
        args.loss_scale = 1  # with dynamic loss scaling, the Momentum optimizer must not apply a fixed loss scale

    # select whether only the master rank saves ckpt or all ranks save; compatible with model parallel
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if args.rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1

    # logger
    args.outputs_dir = os.path.join(
        args.ckpt_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)

    # dataloader
    de_dataset = classification_dataset(args.data_dir, args.image_size,
                                        args.per_batch_size, args.max_epoch,
                                        args.rank, args.group_size)
    de_dataset.map_model = 4
    args.steps_per_epoch = de_dataset.get_dataset_size()

    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    # get network and init
    network = DenseNet121(args.num_classes)
    # loss
    if not args.label_smooth:
        args.label_smooth_factor = 0.0
    criterion = CrossEntropy(smooth_factor=args.label_smooth_factor,
                             num_classes=args.num_classes)

    # load pretrain model
    if os.path.isfile(args.pretrained):
        param_dict = load_checkpoint(args.pretrained)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load model {} success'.format(args.pretrained))

    # lr scheduler
    if args.lr_scheduler == 'exponential':
        lr_scheduler = MultiStepLR(args.lr,
                                   args.lr_epochs,
                                   args.lr_gamma,
                                   args.steps_per_epoch,
                                   args.max_epoch,
                                   warmup_epochs=args.warmup_epochs)
    elif args.lr_scheduler == 'cosine_annealing':
        lr_scheduler = CosineAnnealingLR(args.lr,
                                         args.T_max,
                                         args.steps_per_epoch,
                                         args.max_epoch,
                                         warmup_epochs=args.warmup_epochs,
                                         eta_min=args.eta_min)
    else:
        raise NotImplementedError(args.lr_scheduler)
    lr_schedule = lr_scheduler.get_lr()

    # optimizer
    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr_schedule),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)

    # mixed precision training
    criterion.add_flags_recursive(fp32=True)

    # package training process, adjust lr + forward + backward + optimizer
    train_net = BuildTrainNetwork(network, criterion)
    if args.is_distributed:
        parallel_mode = ParallelMode.DATA_PARALLEL
    else:
        parallel_mode = ParallelMode.STAND_ALONE
    if args.is_dynamic_loss_scale == 1:
        loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536,
                                                     scale_factor=2,
                                                     scale_window=2000)
    else:
        loss_scale_manager = FixedLossScaleManager(args.loss_scale,
                                                   drop_overflow_update=False)

    context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                      device_num=args.group_size,
                                      gradients_mean=True)

    if args.device_target == 'Ascend':
        model = Model(train_net,
                      optimizer=opt,
                      metrics=None,
                      loss_scale_manager=loss_scale_manager,
                      amp_level="O3")
    elif args.device_target == 'GPU':
        model = Model(train_net,
                      optimizer=opt,
                      metrics=None,
                      loss_scale_manager=loss_scale_manager,
                      amp_level="O0")
    else:
        raise ValueError("Unsupported device target.")

    # checkpoint save
    progress_cb = ProgressMonitor(args)
    callbacks = [
        progress_cb,
    ]
    if args.rank_save_ckpt_flag:
        ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
        ckpt_config = CheckpointConfig(
            save_checkpoint_steps=args.ckpt_interval,
            keep_checkpoint_max=ckpt_max_num)
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=args.outputs_dir,
                                  prefix='{}'.format(args.rank))
        callbacks.append(ckpt_cb)

    model.train(args.max_epoch, de_dataset, callbacks=callbacks)
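

# The CrossEntropy criterion used above is not shown. A minimal sketch of a
# label-smoothed softmax cross-entropy cell in the common MindSpore model-zoo
# style (a sketch under that assumption, not necessarily the original code):
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.common import dtype as mstype
from mindspore.ops import functional as F
from mindspore.ops import operations as P


class CrossEntropy(nn.Cell):
    """Softmax cross-entropy with label smoothing."""

    def __init__(self, smooth_factor=0.0, num_classes=1000):
        super(CrossEntropy, self).__init__()
        self.onehot = P.OneHot()
        self.on_value = Tensor(1.0 - smooth_factor, mstype.float32)
        self.off_value = Tensor(smooth_factor / (num_classes - 1), mstype.float32)
        self.ce = nn.SoftmaxCrossEntropyWithLogits()
        self.mean = P.ReduceMean(False)

    def construct(self, logit, label):
        one_hot_label = self.onehot(label, F.shape(logit)[1],
                                    self.on_value, self.off_value)
        loss = self.ce(logit, one_hot_label)
        return self.mean(loss, 0)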
Example #12
def train(cloud_args=None):
    """training process"""
    args = parse_args(cloud_args)
    context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True,
                        device_target=args.platform, save_graphs=False)
    if os.getenv('DEVICE_ID', "not_set").isdigit():
        context.set_context(device_id=int(os.getenv('DEVICE_ID')))

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=args.group_size,
                                          parameter_broadcast=True, mirror_mean=True)
    else:
        args.rank = 0
        args.group_size = 1

    if args.is_dynamic_loss_scale == 1:
        args.loss_scale = 1  # with dynamic loss scaling, the Momentum optimizer must not apply a fixed loss scale

    # select whether only the master rank saves ckpt or all ranks save; compatible with model parallel
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if args.rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1

    # logger
    args.outputs_dir = os.path.join(args.ckpt_path,
                                    datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)

    # dataloader
    de_dataset = classification_dataset(args.data_dir, args.image_size,
                                        args.per_batch_size, 1,
                                        args.rank, args.group_size, num_parallel_workers=8)
    de_dataset.map_model = 4  # !!!important
    args.steps_per_epoch = de_dataset.get_dataset_size()

    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    # get network and init
    network = get_network(args.backbone, args.num_classes, platform=args.platform)
    if network is None:
        raise NotImplementedError('backbone not implemented: {}'.format(args.backbone))

    # load pretrain model
    if os.path.isfile(args.pretrained):
        param_dict = load_checkpoint(args.pretrained)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load model {} success'.format(args.pretrained))

    # lr scheduler
    if args.lr_scheduler == 'exponential':
        lr = warmup_step_lr(args.lr,
                            args.lr_epochs,
                            args.steps_per_epoch,
                            args.warmup_epochs,
                            args.max_epoch,
                            gamma=args.lr_gamma,
                            )
    elif args.lr_scheduler == 'cosine_annealing':
        lr = warmup_cosine_annealing_lr(args.lr,
                                        args.steps_per_epoch,
                                        args.warmup_epochs,
                                        args.max_epoch,
                                        args.T_max,
                                        args.eta_min)
    else:
        raise NotImplementedError(args.lr_scheduler)

    # optimizer
    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)


    # loss
    if not args.label_smooth:
        args.label_smooth_factor = 0.0
    loss = CrossEntropy(smooth_factor=args.label_smooth_factor, num_classes=args.num_classes)

    if args.is_dynamic_loss_scale == 1:
        loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536, scale_factor=2, scale_window=2000)
    else:
        loss_scale_manager = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)

    if args.platform == "Ascend":
        model = Model(network, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale_manager,
                      metrics={'acc'}, amp_level="O3")
    else:
        model = Model(network, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale_manager,
                      metrics={'acc'}, amp_level="O2")

    # checkpoint save
    progress_cb = ProgressMonitor(args)
    callbacks = [progress_cb,]
    if args.rank_save_ckpt_flag:
        ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval * args.steps_per_epoch,
                                       keep_checkpoint_max=args.ckpt_save_max)
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=args.outputs_dir,
                                  prefix='{}'.format(args.rank))
        callbacks.append(ckpt_cb)

    model.train(args.max_epoch, de_dataset, callbacks=callbacks, dataset_sink_mode=True)
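

# get_param_groups is used by both training snippets but not shown. A minimal
# sketch in the common model-zoo style (an assumption, not necessarily the
# original code): exclude biases and BatchNorm gamma/beta from weight decay.
def get_param_groups(network):
    decay_params = []
    no_decay_params = []
    for x in network.trainable_params():
        name = x.name
        if name.endswith('.bias') or name.endswith('.gamma') or name.endswith('.beta'):
            no_decay_params.append(x)
        else:
            decay_params.append(x)
    return [{'params': no_decay_params, 'weight_decay': 0.0},
            {'params': decay_params}]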
Example #13
from pathlib import Path
from typing import Any, Iterable, Optional, Union

import numpy as np
import torch

from torch.utils.data.dataloader import DataLoader as PLDataloader
from torch.utils.data.dataset import Dataset
from tqdm import tqdm

from configs import DefaultConfig
from src.base.transformation import BaseTransformation
from src.utils.checkers import image_folder
from src.utils.logging import get_logger

logger = get_logger("PrepareDataloader")


class DataLoader(PLDataloader):
    def __init__(
        self, dataset: Iterable, config: DefaultConfig, ds_type: str, **kwargs
    ):
        self.set_params(config=config, ds_type=ds_type)
        transformed_dataset = self.apply_tfms(dataset, ds_type)

        super().__init__(
            dataset=transformed_dataset,
            batch_size=self.batch_size,
            shuffle=self.shuffle,
            **kwargs,
        )
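
    # set_params and apply_tfms are referenced above but not shown. Minimal
    # sketches (assumptions, not the original implementation); the config
    # attribute names used here are hypothetical:
    def set_params(self, config: DefaultConfig, ds_type: str) -> None:
        self.batch_size = config.batch_size  # hypothetical config attribute
        self.shuffle = ds_type == "train"    # shuffle only the training split

    def apply_tfms(self, dataset: Iterable, ds_type: str) -> Iterable:
        # Stub: pass the dataset through unchanged; a full version would wrap
        # it with the BaseTransformation pipeline configured for ds_type.
        return dataset
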
"""Load YAML config files"""
import sys
import os

import yaml
import src.utils.logging as logging

logger = logging.get_logger(__name__)


class ConfigLoader:
    @staticmethod
    def _load_yaml_content(fname):
        """Load and check content of a given YAML file name
        Args:
            fname: path to the config file
        Return:
            content: content of the YAML file
        """
        assert os.path.isfile(fname), 'Config file not found: {}'.format(fname)

        with open(fname, 'r') as stream:
            try:
                content = yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                logger.error(exc)
                sys.exit(-1)
        return content

    @staticmethod
    def load_model_cfg(fname):
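        # Body truncated in the source. A minimal completion, assuming it
        # simply delegates to _load_yaml_content (an assumption):
        return ConfigLoader._load_yaml_content(fname)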