def parse_args(cloud_args=None):
    """parameters"""
    parser = argparse.ArgumentParser('mindspore classification training')
    parser.add_argument('--platform', type=str, default='Ascend',
                        choices=('Ascend', 'GPU'), help='run platform')
    # dataset related
    parser.add_argument('--data_dir', type=str, default='', help='train data dir')
    parser.add_argument('--per_batch_size', default=128, type=int,
                        help='batch size per device')
    # network related
    parser.add_argument('--pretrained', default='', type=str,
                        help='model_path, local pretrained model to load')
    # distributed related
    parser.add_argument('--is_distributed', type=int, default=1,
                        help='if multi device')
    # roma obs
    parser.add_argument('--train_url', type=str, default="", help='train url')

    args, _ = parser.parse_known_args()
    args = merge_args(args, cloud_args)

    args.image_size = config.image_size
    args.num_classes = config.num_classes
    args.lr = config.lr
    args.lr_scheduler = config.lr_scheduler
    args.lr_epochs = config.lr_epochs
    args.lr_gamma = config.lr_gamma
    args.eta_min = config.eta_min
    args.T_max = config.T_max
    args.max_epoch = config.max_epoch
    args.warmup_epochs = config.warmup_epochs
    args.weight_decay = config.weight_decay
    args.momentum = config.momentum
    args.is_dynamic_loss_scale = config.is_dynamic_loss_scale
    args.loss_scale = config.loss_scale
    args.label_smooth = config.label_smooth
    args.label_smooth_factor = config.label_smooth_factor
    args.ckpt_interval = config.ckpt_interval
    args.ckpt_save_max = config.ckpt_save_max
    args.ckpt_path = config.ckpt_path
    args.is_save_on_master = config.is_save_on_master
    args.rank = config.rank
    args.group_size = config.group_size

    args.lr_epochs = list(map(int, args.lr_epochs.split(',')))
    args.image_size = list(map(int, args.image_size.split(',')))

    context.set_context(mode=context.GRAPH_MODE,
                        enable_auto_mixed_precision=True,
                        device_target=args.platform, save_graphs=False)

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()
    else:
        args.rank = 0
        args.group_size = 1

    if args.is_dynamic_loss_scale == 1:
        args.loss_scale = 1  # with dynamic loss scale, the scale must not be set in the Momentum optimizer

    # select whether only the master rank saves ckpt or all ranks do; compatible with model parallel
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if args.rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1

    # logger
    args.outputs_dir = os.path.join(
        args.ckpt_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)
    return args
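
# `merge_args` is called above but not defined in this snippet. A minimal sketch
# of what it presumably does -- let non-empty cloud-side arguments override the
# parsed CLI arguments -- assuming `cloud_args` is a namespace-like object
# (hypothetical helper, not the repo's actual implementation):
def merge_args(args, cloud_args):
    """Overwrite CLI args with non-empty cloud args (sketch)."""
    if cloud_args is not None:
        args_dict = vars(args)
        for key, value in vars(cloud_args).items():
            if value != "":
                args_dict[key] = value
    return args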
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)

# select whether only the master rank saves ckpt or all ranks do; compatible with model parallel
args.rank_save_ckpt_flag = 0
if args.is_save_on_master:
    if args.rank == 0:
        args.rank_save_ckpt_flag = 1
else:
    args.rank_save_ckpt_flag = 1

# logger
args.outputs_dir = os.path.join(
    args.ckpt_path,
    datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
args.logger = get_logger(args.outputs_dir, args.rank)

if args.dataset == "cifar10":
    dataset = vgg_create_dataset(args.data_path, args.image_size,
                                 args.per_batch_size, args.rank,
                                 args.group_size)
else:
    dataset = classification_dataset(args.data_path, args.image_size,
                                     args.per_batch_size, args.rank,
                                     args.group_size)

batch_num = dataset.get_dataset_size()
args.steps_per_epoch = batch_num
args.logger.save_args(args)

# network
"n_graphs": len(graph_list), "classes": len(set(labels)), "density": avg_density(graph_list), "avg_nodes": avg_nodes(graph_list), "avg_edges": avg_edges(graph_list), "max_nodes": max_nodes(graph_list), "avg_degree": avg_degree(graph_list), "avg_max_degree": avg_max_degree(graph_list), "max_degree": max_degree(graph_list), "class_ratio": class_ratio(labels), } return stats if __name__ == "__main__": logger = logging.get_logger() out_dir = "./reporting" utils.make_dirs_checked(out_dir) # create a list of all datasets in {args.data_dir} datasets = [ "MUTAG", "PTC", "IMDBBINARY", "IMDBMULTI", "PROTEINS", "NCI1", "COLLAB" ] logger.info(f"Computing summary statistics for datasets {datasets}") # empty list to iteratively append the dictionaries containing dataset # stats to stats_list = [] # iterating over all datasets
def parse_args(cloud_args=None):
    """parse_args"""
    parser = argparse.ArgumentParser('mindspore classification test')
    parser.add_argument('--platform', type=str, default='Ascend',
                        choices=('Ascend', 'GPU'), help='run platform')
    # dataset related
    parser.add_argument('--data_dir', type=str,
                        default='/opt/npu/datasets/classification/val',
                        help='eval data dir')
    parser.add_argument('--per_batch_size', default=32, type=int,
                        help='batch size per npu')
    # network related
    parser.add_argument('--graph_ckpt', action='store_true', default=True,
                        help='graph ckpt or feed ckpt')
    parser.add_argument('--pretrained', default='', type=str,
                        help='full path of the pretrained model to load. '
                             'If it is a directory, all ckpt files in it are tested')
    # logging related
    parser.add_argument('--log_path', type=str, default='outputs/',
                        help='path to save log')
    parser.add_argument('--is_distributed', action='store_true', default=False,
                        help='if multi device')
    # roma obs
    parser.add_argument('--train_url', type=str, default="", help='train url')

    args, _ = parser.parse_known_args()
    args = merge_args(args, cloud_args)

    args.image_size = config.image_size
    args.num_classes = config.num_classes
    args.rank = config.rank
    args.group_size = config.group_size
    args.image_size = list(map(int, args.image_size.split(',')))

    # init distributed
    if args.is_distributed:
        if args.platform == "Ascend":
            init()
        elif args.platform == "GPU":
            init("nccl")
        args.rank = get_rank()
        args.group_size = get_group_size()
    else:
        args.rank = 0
        args.group_size = 1

    args.outputs_dir = os.path.join(
        args.log_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)
    return args
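
# Hypothetical invocation of an eval entry point built on the parser above
# (the script name and paths are placeholders, not taken from the repo):
#
#   python eval.py --platform GPU \
#       --data_dir /path/to/val \
#       --pretrained checkpoints/ \
#       --per_batch_size 32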
def test(cloud_args=None):
    """test"""
    args = parse_args(cloud_args)
    context.set_context(mode=context.GRAPH_MODE,
                        enable_auto_mixed_precision=True,
                        device_target=args.device_target, save_graphs=False)
    if os.getenv('DEVICE_ID', "not_set").isdigit() and args.device_target == "Ascend":
        context.set_context(device_id=int(os.getenv('DEVICE_ID')))

    args.outputs_dir = os.path.join(
        args.log_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)
    args.logger.save_args(args)

    if args.dataset == "cifar10":
        net = vgg19(num_classes=args.num_classes, args=args)
        opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                       0.01, args.momentum, weight_decay=args.weight_decay)
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
        model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})

        param_dict = load_checkpoint(args.pre_trained)
        load_param_into_net(net, param_dict)
        net.set_train(False)
        dataset = vgg_create_dataset(args.data_path, args.image_size,
                                     args.per_batch_size, training=False)
        res = model.eval(dataset)
        print("result: ", res)
    else:
        # network
        args.logger.important_info('start create network')
        if os.path.isdir(args.pre_trained):
            models = list(glob.glob(os.path.join(args.pre_trained, '*.ckpt')))
            print(models)
            if args.graph_ckpt:
                f = lambda x: -1 * int(os.path.splitext(
                    os.path.split(x)[-1])[0].split('-')[-1].split('_')[0])
            else:
                f = lambda x: -1 * int(
                    os.path.splitext(os.path.split(x)[-1])[0].split('_')[-1])
            args.models = sorted(models, key=f)
        else:
            args.models = [args.pre_trained,]

        for model in args.models:
            dataset = classification_dataset(args.data_path, args.image_size,
                                             args.per_batch_size, mode='eval')
            eval_dataloader = dataset.create_tuple_iterator(output_numpy=True,
                                                            num_epochs=1)
            network = vgg19(args.num_classes, args, phase="test")

            # load pretrained weights
            load_param_into_net(network, load_checkpoint(model))
            network.add_flags_recursive(fp16=True)

            img_tot = 0
            top1_correct = 0
            top5_correct = 0

            network.set_train(False)
            t_end = time.time()
            it = 0
            for data, gt_classes in eval_dataloader:
                output = network(Tensor(data, mstype.float32))
                output = output.asnumpy()

                top1_output = np.argmax(output, (-1))
                top5_output = np.argsort(output)[:, -5:]

                t1_correct = np.equal(top1_output, gt_classes).sum()
                top1_correct += t1_correct
                top5_correct += get_top5_acc(top5_output, gt_classes)
                img_tot += args.per_batch_size

                if args.rank == 0 and it == 0:
                    t_end = time.time()
                    it = 1
            if args.rank == 0:
                time_used = time.time() - t_end
                fps = (img_tot - args.per_batch_size) * args.group_size / time_used
                args.logger.info('Inference Performance: {:.2f} img/sec'.format(fps))
            results = [[top1_correct], [top5_correct], [img_tot]]
            args.logger.info('before results={}'.format(results))
            results = np.array(results)
            args.logger.info('after results={}'.format(results))

            top1_correct = results[0, 0]
            top5_correct = results[1, 0]
            img_tot = results[2, 0]
            acc1 = 100.0 * top1_correct / img_tot
            acc5 = 100.0 * top5_correct / img_tot
            args.logger.info('after allreduce eval: top1_correct={}, tot={}, '
                             'acc={:.2f}%(TOP1)'.format(top1_correct, img_tot, acc1))
            args.logger.info('after allreduce eval: top5_correct={}, tot={}, '
                             'acc={:.2f}%(TOP5)'.format(top5_correct, img_tot, acc5))
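
# `get_top5_acc` is used by the eval loops in this section but defined elsewhere.
# From its call sites, it counts samples whose ground-truth label appears among
# the top-5 predicted classes. A minimal sketch under that assumption:
def get_top5_acc(top5_arg, gt_class):
    """Number of samples whose label is in the top-5 predictions (sketch)."""
    sub_count = 0
    for top5, gt in zip(top5_arg, gt_class):
        if gt in top5:
            sub_count += 1
    return sub_count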
from pathlib import Path
from typing import Optional

from src.utils.decorators import timespan
from src.utils.logging import get_logger
from src.utils.params import ParamsBuilder
from tools.evaluate import main as evaluate
from tools.predict import main as predict
from tools.train import main as train

logger = get_logger("Pipeline")


@timespan("Pipeline")
def main(
    config_path: Path,
    dataset_path: Path,
    predict_path: Path,
    input_path: Path,
    evaluate_path: Optional[Path] = None,
) -> None:
    """
    Main function responsible for prediction with the passed model.

    Arguments:
        Path config_path: Path to main config (of :class:`DefaultConfig` class)
        Path dataset_path: Path to dataset
        Path predict_path: Path to file with model predictions
        Path input_path: Path to file with input data
        Path evaluate_path: Path to evaluations
    """
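
# `timespan` is imported from src.utils.decorators but not shown in this excerpt.
# A sketch of what such a decorator plausibly does -- log how long the wrapped
# callable took (hypothetical; assumes it only measures and logs):
import functools
import time

def timespan(name):
    def wrapper(func):
        @functools.wraps(func)
        def inner(*args, **kwargs):
            start = time.perf_counter()
            result = func(*args, **kwargs)
            logger.info(f"{name} finished in {time.perf_counter() - start:.2f}s")
            return result
        return inner
    return wrapper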
def test(cloud_args=None):
    """test"""
    args = parse_args(cloud_args)
    context.set_context(mode=context.GRAPH_MODE,
                        enable_auto_mixed_precision=True,
                        device_target=args.platform, save_graphs=False)
    if os.getenv('DEVICE_ID', "not_set").isdigit():
        context.set_context(device_id=int(os.getenv('DEVICE_ID')))

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                          device_num=args.group_size,
                                          parameter_broadcast=True,
                                          gradients_mean=True)
    else:
        args.rank = 0
        args.group_size = 1

    args.outputs_dir = os.path.join(
        args.log_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)
    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    if os.path.isdir(args.pretrained):
        models = list(glob.glob(os.path.join(args.pretrained, '*.ckpt')))
        print(models)
        if args.graph_ckpt:
            f = lambda x: -1 * int(os.path.splitext(
                os.path.split(x)[-1])[0].split('-')[-1].split('_')[0])
        else:
            f = lambda x: -1 * int(
                os.path.splitext(os.path.split(x)[-1])[0].split('_')[-1])
        args.models = sorted(models, key=f)
    else:
        args.models = [args.pretrained,]

    for model in args.models:
        de_dataset = classification_dataset(args.data_dir,
                                            image_size=args.image_size,
                                            per_batch_size=args.per_batch_size,
                                            max_epoch=1,
                                            rank=args.rank,
                                            group_size=args.group_size,
                                            mode='eval')
        eval_dataloader = de_dataset.create_tuple_iterator(output_numpy=True)
        network = get_network(args.backbone, args.num_classes,
                              platform=args.platform)
        if network is None:
            raise NotImplementedError('not implement {}'.format(args.backbone))

        param_dict = load_checkpoint(model)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load model {} success'.format(model))

        img_tot = 0
        top1_correct = 0
        top5_correct = 0
        if args.platform == "Ascend":
            network.to_float(mstype.float16)
        else:
            auto_mixed_precision(network)
        network.set_train(False)
        t_end = time.time()
        it = 0
        for data, gt_classes in eval_dataloader:
            output = network(Tensor(data, mstype.float32))
            output = output.asnumpy()

            top1_output = np.argmax(output, (-1))
            top5_output = np.argsort(output)[:, -5:]

            t1_correct = np.equal(top1_output, gt_classes).sum()
            top1_correct += t1_correct
            top5_correct += get_top5_acc(top5_output, gt_classes)
            img_tot += args.per_batch_size

            if args.rank == 0 and it == 0:
                t_end = time.time()
                it = 1
        if args.rank == 0:
            time_used = time.time() - t_end
            fps = (img_tot - args.per_batch_size) * args.group_size / time_used
            args.logger.info('Inference Performance: {:.2f} img/sec'.format(fps))
        results = [[top1_correct], [top5_correct], [img_tot]]
        args.logger.info('before results={}'.format(results))
        if args.is_distributed:
            model_md5 = model.replace('/', '')
            tmp_dir = '/cache'
            if not os.path.exists(tmp_dir):
                os.mkdir(tmp_dir)
            top1_correct_npy = '/cache/top1_rank_{}_{}.npy'.format(args.rank, model_md5)
            top5_correct_npy = '/cache/top5_rank_{}_{}.npy'.format(args.rank, model_md5)
            img_tot_npy = '/cache/img_tot_rank_{}_{}.npy'.format(args.rank, model_md5)
            np.save(top1_correct_npy, top1_correct)
            np.save(top5_correct_npy, top5_correct)
            np.save(img_tot_npy, img_tot)
            while True:
                rank_ok = True
                for other_rank in range(args.group_size):
                    top1_correct_npy = '/cache/top1_rank_{}_{}.npy'.format(other_rank, model_md5)
                    top5_correct_npy = '/cache/top5_rank_{}_{}.npy'.format(other_rank, model_md5)
                    img_tot_npy = '/cache/img_tot_rank_{}_{}.npy'.format(other_rank, model_md5)
                    if not os.path.exists(top1_correct_npy) or not os.path.exists(top5_correct_npy) or \
                            not os.path.exists(img_tot_npy):
                        rank_ok = False
                if rank_ok:
                    break

            top1_correct_all = 0
            top5_correct_all = 0
            img_tot_all = 0
            for other_rank in range(args.group_size):
                top1_correct_npy = '/cache/top1_rank_{}_{}.npy'.format(other_rank, model_md5)
                top5_correct_npy = '/cache/top5_rank_{}_{}.npy'.format(other_rank, model_md5)
                img_tot_npy = '/cache/img_tot_rank_{}_{}.npy'.format(other_rank, model_md5)
                top1_correct_all += np.load(top1_correct_npy)
                top5_correct_all += np.load(top5_correct_npy)
                img_tot_all += np.load(img_tot_npy)
            results = [[top1_correct_all], [top5_correct_all], [img_tot_all]]
            results = np.array(results)
        else:
            results = np.array(results)

        args.logger.info('after results={}'.format(results))
        top1_correct = results[0, 0]
        top5_correct = results[1, 0]
        img_tot = results[2, 0]
        acc1 = 100.0 * top1_correct / img_tot
        acc5 = 100.0 * top5_correct / img_tot
        args.logger.info('after allreduce eval: top1_correct={}, tot={}, '
                         'acc={:.2f}%(TOP1)'.format(top1_correct, img_tot, acc1))
        args.logger.info('after allreduce eval: top5_correct={}, tot={}, '
                         'acc={:.2f}%(TOP5)'.format(top5_correct, img_tot, acc5))

    if args.is_distributed:
        release()
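
# The while-loop above is a file-based barrier-plus-gather: each rank dumps its
# partial counters to a shared /cache directory, spins until every rank's files
# exist, then sums them. The same idea factored into a reusable helper (sketch;
# `tag` and the directory layout are assumptions, not the repo's own API):
import os
import time
import numpy as np

def file_allreduce_sum(value, rank, group_size, tag, tmp_dir='/cache'):
    """Sum `value` across ranks via .npy files on shared storage (sketch)."""
    os.makedirs(tmp_dir, exist_ok=True)
    np.save(os.path.join(tmp_dir, '{}_rank_{}.npy'.format(tag, rank)), value)
    paths = [os.path.join(tmp_dir, '{}_rank_{}.npy'.format(tag, r))
             for r in range(group_size)]
    while not all(os.path.exists(p) for p in paths):  # wait for all ranks
        time.sleep(0.1)
    return sum(np.load(p) for p in paths)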
def test(cloud_args=None):
    """
    network eval function. Get top1 and top5 ACC from classification.
    The result will be saved at [./outputs] by default.
    """
    args = parse_args(cloud_args)

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()

    args.outputs_dir = os.path.join(
        args.log_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)
    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    if os.path.isdir(args.pretrained):
        models = list(glob.glob(os.path.join(args.pretrained, '*.ckpt')))
        f = lambda x: -1 * int(os.path.splitext(
            os.path.split(x)[-1])[0].split('-')[-1].split('_')[0])
        args.models = sorted(models, key=f)
    else:
        args.models = [args.pretrained,]

    for model in args.models:
        de_dataset = classification_dataset(args.data_dir,
                                            image_size=args.image_size,
                                            per_batch_size=args.per_batch_size,
                                            max_epoch=1,
                                            rank=args.rank,
                                            group_size=args.group_size,
                                            mode='eval')
        eval_dataloader = de_dataset.create_tuple_iterator()
        network = DenseNet121(args.num_classes)

        param_dict = load_checkpoint(model)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load model {} success'.format(model))

        network.add_flags_recursive(fp16=True)

        img_tot = 0
        top1_correct = 0
        top5_correct = 0
        network.set_train(False)
        for data, gt_classes in eval_dataloader:
            output = network(Tensor(data, mstype.float32))
            output = output.asnumpy()
            gt_classes = gt_classes.asnumpy()

            top1_output = np.argmax(output, (-1))
            top5_output = np.argsort(output)[:, -5:]

            t1_correct = np.equal(top1_output, gt_classes).sum()
            top1_correct += t1_correct
            top5_correct += get_top5_acc(top5_output, gt_classes)
            img_tot += args.per_batch_size

        results = [[top1_correct], [top5_correct], [img_tot]]
        args.logger.info('before results={}'.format(results))
        if args.is_distributed:
            model_md5 = model.replace('/', '')
            tmp_dir = '../cache'
            if not os.path.exists(tmp_dir):
                os.mkdir(tmp_dir)
            top1_correct_npy = '{}/top1_rank_{}_{}.npy'.format(tmp_dir, args.rank, model_md5)
            top5_correct_npy = '{}/top5_rank_{}_{}.npy'.format(tmp_dir, args.rank, model_md5)
            img_tot_npy = '{}/img_tot_rank_{}_{}.npy'.format(tmp_dir, args.rank, model_md5)
            np.save(top1_correct_npy, top1_correct)
            np.save(top5_correct_npy, top5_correct)
            np.save(img_tot_npy, img_tot)
            while True:
                rank_ok = True
                for other_rank in range(args.group_size):
                    top1_correct_npy = '{}/top1_rank_{}_{}.npy'.format(tmp_dir, other_rank, model_md5)
                    top5_correct_npy = '{}/top5_rank_{}_{}.npy'.format(tmp_dir, other_rank, model_md5)
                    img_tot_npy = '{}/img_tot_rank_{}_{}.npy'.format(tmp_dir, other_rank, model_md5)
                    if not os.path.exists(top1_correct_npy) or not os.path.exists(top5_correct_npy) \
                            or not os.path.exists(img_tot_npy):
                        rank_ok = False
                if rank_ok:
                    break

            top1_correct_all = 0
            top5_correct_all = 0
            img_tot_all = 0
            for other_rank in range(args.group_size):
                top1_correct_npy = '{}/top1_rank_{}_{}.npy'.format(tmp_dir, other_rank, model_md5)
                top5_correct_npy = '{}/top5_rank_{}_{}.npy'.format(tmp_dir, other_rank, model_md5)
                img_tot_npy = '{}/img_tot_rank_{}_{}.npy'.format(tmp_dir, other_rank, model_md5)
                top1_correct_all += np.load(top1_correct_npy)
                top5_correct_all += np.load(top5_correct_npy)
                img_tot_all += np.load(img_tot_npy)
            results = [[top1_correct_all], [top5_correct_all], [img_tot_all]]
            results = np.array(results)
        else:
            results = np.array(results)

        args.logger.info('after results={}'.format(results))
        top1_correct = results[0, 0]
        top5_correct = results[1, 0]
        img_tot = results[2, 0]
        acc1 = 100.0 * top1_correct / img_tot
        acc5 = 100.0 * top5_correct / img_tot
        args.logger.info('after allreduce eval: top1_correct={}, tot={}, acc={:.2f}%'.format(
            top1_correct, img_tot, acc1))
        args.logger.info('after allreduce eval: top5_correct={}, tot={}, acc={:.2f}%'.format(
            top5_correct, img_tot, acc5))

    if args.is_distributed:
        release()
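
# The checkpoint-loading loop above strips optimizer state ('moments.' keys) and
# the 'network.' prefix that training wrappers prepend to parameter names. The
# same logic as a standalone helper (sketch; the helper name is hypothetical):
def remap_ckpt_keys(param_dict):
    """Drop optimizer moments and strip the 'network.' prefix (sketch)."""
    param_dict_new = {}
    for key, values in param_dict.items():
        if key.startswith('moments.'):
            continue  # optimizer state, not needed for inference
        if key.startswith('network.'):
            key = key[len('network.'):]
        param_dict_new[key] = values
    return param_dict_new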
def test(cloud_args=None):
    """
    network eval function. Get top1 and top5 ACC from classification for imagenet,
    and top1 ACC for cifar10. The result will be saved at [./outputs] by default.
    """
    args = parse_args(cloud_args)
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args.device_target, save_graphs=True)
    if args.device_target == 'Ascend':
        devid = int(os.getenv('DEVICE_ID'))
        context.set_context(device_id=devid)

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()

    args.outputs_dir = os.path.join(
        args.log_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)
    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    if os.path.isdir(args.pretrained):
        models = list(glob.glob(os.path.join(args.pretrained, '*.ckpt')))
        f = lambda x: -1 * int(os.path.splitext(
            os.path.split(x)[-1])[0].split('-')[-1].split('_')[0])
        args.models = sorted(models, key=f)
    else:
        args.models = [args.pretrained,]

    if args.net == "densenet100":
        from src.network.densenet import DenseNet100 as DenseNet
    else:
        from src.network.densenet import DenseNet121 as DenseNet

    if args.dataset == "cifar10":
        from src.datasets import classification_dataset_cifar10 as classification_dataset
    else:
        from src.datasets import classification_dataset_imagenet as classification_dataset

    for model in args.models:
        de_dataset = classification_dataset(args.data_dir,
                                            image_size=args.image_size,
                                            per_batch_size=args.per_batch_size,
                                            max_epoch=1,
                                            rank=args.rank,
                                            group_size=args.group_size,
                                            mode='eval')
        eval_dataloader = de_dataset.create_tuple_iterator()
        network = DenseNet(args.num_classes)

        param_dict = load_checkpoint(model)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load model {} success'.format(model))

        if args.device_target == 'Ascend':
            network.add_flags_recursive(fp16=True)

        img_tot = 0
        top1_correct = 0
        top5_correct = 0
        network.set_train(False)
        for data, gt_classes in eval_dataloader:
            output = network(Tensor(data, mstype.float32))
            output = output.asnumpy()
            gt_classes = gt_classes.asnumpy()

            top1_output = np.argmax(output, (-1))
            top5_output = np.argsort(output)[:, -5:]

            t1_correct = np.equal(top1_output, gt_classes).sum()
            top1_correct += t1_correct
            top5_correct += get_top5_acc(top5_output, gt_classes)
            img_tot += args.per_batch_size

        results = [[top1_correct], [top5_correct], [img_tot]]
        args.logger.info('before results={}'.format(results))
        if args.is_distributed:
            results = generate_results(model, args.rank, args.group_size,
                                       top1_correct, top5_correct, img_tot)
            results = np.array(results)
        else:
            results = np.array(results)

        args.logger.info('after results={}'.format(results))
        top1_correct = results[0, 0]
        top5_correct = results[1, 0]
        img_tot = results[2, 0]
        acc1 = 100.0 * top1_correct / img_tot
        acc5 = 100.0 * top5_correct / img_tot
        args.logger.info('after allreduce eval: top1_correct={}, tot={}, acc={:.2f}%'.format(
            top1_correct, img_tot, acc1))
        if args.dataset == 'imagenet':
            args.logger.info('after allreduce eval: top5_correct={}, tot={}, acc={:.2f}%'.format(
                top5_correct, img_tot, acc5))

    if args.is_distributed:
        release()
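
# `generate_results` is called above but defined elsewhere in the repo. Judging
# from the earlier eval functions in this section, it performs the same
# file-based gather of per-rank counters; a self-contained sketch under that
# assumption (directory and file naming are guesses, not the actual code):
import os
import numpy as np

def generate_results(model, rank, group_size, top1_correct, top5_correct,
                     img_tot, tmp_dir='../cache'):
    """Share per-rank counters via .npy files and return summed results (sketch)."""
    tag = model.replace('/', '')
    os.makedirs(tmp_dir, exist_ok=True)
    np.save(os.path.join(tmp_dir, 'res_rank_{}_{}.npy'.format(rank, tag)),
            np.array([top1_correct, top5_correct, img_tot]))
    paths = [os.path.join(tmp_dir, 'res_rank_{}_{}.npy'.format(r, tag))
             for r in range(group_size)]
    while not all(os.path.exists(p) for p in paths):  # wait for every rank
        pass
    totals = sum(np.load(p) for p in paths)
    return [[totals[0]], [totals[1]], [totals[2]]]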
import json
import os
import sys

import numpy as np
import pandas as pd
from tabulate import tabulate
from tqdm import tqdm

sys.path.append('.')
from src.config import config as cfg
from src.utils import logging, utils

logger = logging.get_logger("./logs")


def score_strings(acc_mean, acc_std):
    acc_mean = np.round(acc_mean * 100, 2)
    acc_std = np.round(acc_std * 100, 2)
    score = f'{acc_mean:.2f} ± {acc_std:.2f}'
    score_latex = f'${acc_mean:.2f} \\pm {acc_std:.2f}$'
    return score, score_latex


if __name__ == "__main__":
    config = cfg.Config()

    logger.info("-------------------------------------")
    logger.info("Evaluating experiment (a)")
    logger.info("-------------------------------------")
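
# Example of what `score_strings` produces for a mean/std accuracy pair
# (values are illustrative only):
#
#   >>> score_strings(0.8534, 0.0123)
#   ('85.34 ± 1.23', '$85.34 \\pm 1.23$')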
def train(cloud_args=None):
    """training process"""
    args = parse_args(cloud_args)
    context.set_context(mode=context.GRAPH_MODE,
                        enable_auto_mixed_precision=True,
                        device_target=args.device_target, save_graphs=False)
    if args.device_target == 'Ascend':
        devid = int(os.getenv('DEVICE_ID'))
        context.set_context(device_id=devid)

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()

    if args.is_dynamic_loss_scale == 1:
        args.loss_scale = 1  # with dynamic loss scale, the scale must not be set in the Momentum optimizer

    # select whether only the master rank saves ckpt or all ranks do; compatible with model parallel
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if args.rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1

    # logger
    args.outputs_dir = os.path.join(
        args.ckpt_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)

    # dataloader
    de_dataset = classification_dataset(args.data_dir, args.image_size,
                                        args.per_batch_size, args.max_epoch,
                                        args.rank, args.group_size)
    de_dataset.map_model = 4
    args.steps_per_epoch = de_dataset.get_dataset_size()

    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    # get network and init
    network = DenseNet121(args.num_classes)

    # loss
    if not args.label_smooth:
        args.label_smooth_factor = 0.0
    criterion = CrossEntropy(smooth_factor=args.label_smooth_factor,
                             num_classes=args.num_classes)

    # load pretrained model
    if os.path.isfile(args.pretrained):
        param_dict = load_checkpoint(args.pretrained)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load model {} success'.format(args.pretrained))

    # lr scheduler
    if args.lr_scheduler == 'exponential':
        lr_scheduler = MultiStepLR(args.lr, args.lr_epochs, args.lr_gamma,
                                   args.steps_per_epoch, args.max_epoch,
                                   warmup_epochs=args.warmup_epochs)
    elif args.lr_scheduler == 'cosine_annealing':
        lr_scheduler = CosineAnnealingLR(args.lr, args.T_max,
                                         args.steps_per_epoch, args.max_epoch,
                                         warmup_epochs=args.warmup_epochs,
                                         eta_min=args.eta_min)
    else:
        raise NotImplementedError(args.lr_scheduler)
    lr_schedule = lr_scheduler.get_lr()

    # optimizer
    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr_schedule),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)

    # mixed precision training
    criterion.add_flags_recursive(fp32=True)

    # package training process: adjust lr + forward + backward + optimizer
    train_net = BuildTrainNetwork(network, criterion)
    if args.is_distributed:
        parallel_mode = ParallelMode.DATA_PARALLEL
    else:
        parallel_mode = ParallelMode.STAND_ALONE
    if args.is_dynamic_loss_scale == 1:
        loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536,
                                                     scale_factor=2,
                                                     scale_window=2000)
    else:
        loss_scale_manager = FixedLossScaleManager(args.loss_scale,
                                                   drop_overflow_update=False)

    context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                      device_num=args.group_size,
                                      gradients_mean=True)
    if args.device_target == 'Ascend':
        model = Model(train_net, optimizer=opt, metrics=None,
                      loss_scale_manager=loss_scale_manager, amp_level="O3")
    elif args.device_target == 'GPU':
        model = Model(train_net, optimizer=opt, metrics=None,
                      loss_scale_manager=loss_scale_manager, amp_level="O0")
    else:
        raise ValueError("Unsupported device target.")

    # checkpoint save
    progress_cb = ProgressMonitor(args)
    callbacks = [progress_cb,]
    if args.rank_save_ckpt_flag:
        ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
        ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval,
                                       keep_checkpoint_max=ckpt_max_num)
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=args.outputs_dir,
                                  prefix='{}'.format(args.rank))
        callbacks.append(ckpt_cb)

    model.train(args.max_epoch, de_dataset, callbacks=callbacks)
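
# `BuildTrainNetwork` is imported from elsewhere in the repo. In MindSpore
# model-zoo scripts it is typically a thin Cell that fuses the backbone with the
# criterion so Model() can drive forward + backward on a single loss output;
# a sketch under that assumption (not necessarily the repo's exact class):
import mindspore.nn as nn

class BuildTrainNetwork(nn.Cell):
    """Wrap a backbone and a criterion into one loss-producing Cell (sketch)."""
    def __init__(self, network, criterion):
        super(BuildTrainNetwork, self).__init__()
        self.network = network
        self.criterion = criterion

    def construct(self, input_data, label):
        output = self.network(input_data)
        loss = self.criterion(output, label)
        return loss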
def train(cloud_args=None):
    """training process"""
    args = parse_args(cloud_args)
    context.set_context(mode=context.GRAPH_MODE,
                        enable_auto_mixed_precision=True,
                        device_target=args.platform, save_graphs=False)
    if os.getenv('DEVICE_ID', "not_set").isdigit():
        context.set_context(device_id=int(os.getenv('DEVICE_ID')))

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                          device_num=args.group_size,
                                          parameter_broadcast=True,
                                          mirror_mean=True)
    else:
        args.rank = 0
        args.group_size = 1

    if args.is_dynamic_loss_scale == 1:
        args.loss_scale = 1  # with dynamic loss scale, the scale must not be set in the Momentum optimizer

    # select whether only the master rank saves ckpt or all ranks do; compatible with model parallel
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if args.rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1

    # logger
    args.outputs_dir = os.path.join(
        args.ckpt_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)

    # dataloader
    de_dataset = classification_dataset(args.data_dir, args.image_size,
                                        args.per_batch_size, 1,
                                        args.rank, args.group_size,
                                        num_parallel_workers=8)
    de_dataset.map_model = 4  # !!!important
    args.steps_per_epoch = de_dataset.get_dataset_size()

    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    # get network and init
    network = get_network(args.backbone, args.num_classes, platform=args.platform)
    if network is None:
        raise NotImplementedError('not implement {}'.format(args.backbone))

    # load pretrained model
    if os.path.isfile(args.pretrained):
        param_dict = load_checkpoint(args.pretrained)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load model {} success'.format(args.pretrained))

    # lr scheduler
    if args.lr_scheduler == 'exponential':
        lr = warmup_step_lr(args.lr,
                            args.lr_epochs,
                            args.steps_per_epoch,
                            args.warmup_epochs,
                            args.max_epoch,
                            gamma=args.lr_gamma)
    elif args.lr_scheduler == 'cosine_annealing':
        lr = warmup_cosine_annealing_lr(args.lr,
                                        args.steps_per_epoch,
                                        args.warmup_epochs,
                                        args.max_epoch,
                                        args.T_max,
                                        args.eta_min)
    else:
        raise NotImplementedError(args.lr_scheduler)

    # optimizer
    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)

    # loss
    if not args.label_smooth:
        args.label_smooth_factor = 0.0
    loss = CrossEntropy(smooth_factor=args.label_smooth_factor,
                        num_classes=args.num_classes)

    if args.is_dynamic_loss_scale == 1:
        loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536,
                                                     scale_factor=2,
                                                     scale_window=2000)
    else:
        loss_scale_manager = FixedLossScaleManager(args.loss_scale,
                                                   drop_overflow_update=False)

    if args.platform == "Ascend":
        model = Model(network, loss_fn=loss, optimizer=opt,
                      loss_scale_manager=loss_scale_manager,
                      metrics={'acc'}, amp_level="O3")
    else:
        model = Model(network, loss_fn=loss, optimizer=opt,
                      loss_scale_manager=loss_scale_manager,
                      metrics={'acc'}, amp_level="O2")

    # checkpoint save
    progress_cb = ProgressMonitor(args)
    callbacks = [progress_cb,]
    if args.rank_save_ckpt_flag:
        ckpt_config = CheckpointConfig(
            save_checkpoint_steps=args.ckpt_interval * args.steps_per_epoch,
            keep_checkpoint_max=args.ckpt_save_max)
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=args.outputs_dir,
                                  prefix='{}'.format(args.rank))
        callbacks.append(ckpt_cb)

    model.train(args.max_epoch, de_dataset, callbacks=callbacks,
                dataset_sink_mode=True)
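
# `warmup_cosine_annealing_lr` is imported from the repo's lr-schedule utilities
# and not shown here. A sketch of the usual shape of such a schedule -- linear
# warmup followed by cosine annealing down to eta_min -- under assumed argument
# semantics (per-epoch cosine over t_max epochs; a simplification, not the
# repo's exact implementation):
import math
import numpy as np

def warmup_cosine_annealing_lr(lr, steps_per_epoch, warmup_epochs, max_epoch,
                               t_max, eta_min=0.0):
    """Per-step lr array: linear warmup, then cosine annealing (sketch)."""
    total_steps = steps_per_epoch * max_epoch
    warmup_steps = steps_per_epoch * warmup_epochs
    lr_each_step = []
    for step in range(total_steps):
        if warmup_steps and step < warmup_steps:
            # ramp linearly from ~0 up to the base lr
            lr_each_step.append(lr * (step + 1) / warmup_steps)
        else:
            cur_epoch = step // steps_per_epoch
            lr_each_step.append(
                eta_min + (lr - eta_min) *
                (1.0 + math.cos(math.pi * cur_epoch / t_max)) / 2.0)
    return np.array(lr_each_step, dtype=np.float32)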
from pathlib import Path
from typing import Any, Iterable, Optional, Union

import numpy as np
import torch
from torch.utils.data.dataloader import DataLoader as PLDataloader
from torch.utils.data.dataset import Dataset
from tqdm import tqdm

from configs import DefaultConfig
from src.base.transformation import BaseTransformation
from src.utils.checkers import image_folder
from src.utils.logging import get_logger

logger = get_logger("PrepareDataloader")


class DataLoader(PLDataloader):
    def __init__(
        self,
        dataset: Iterable,
        config: DefaultConfig,
        ds_type: str,
        **kwargs,
    ):
        self.set_params(config=config, ds_type=ds_type)
        transformed_dataset = self.apply_tfms(dataset, ds_type)

        super().__init__(
            dataset=transformed_dataset,
            batch_size=self.batch_size,
            shuffle=self.shuffle,
            **kwargs,
        )
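
# `set_params` and `apply_tfms` belong to the class but are not shown in this
# excerpt. From their call sites, `set_params` presumably pulls batch_size and
# shuffle from the config for the given split, and `apply_tfms` wraps the raw
# dataset with that split's transformations. A hypothetical usage sketch,
# assuming the config exposes per-split loader settings:
#
#   config = DefaultConfig()
#   train_loader = DataLoader(dataset=raw_train_ds, config=config,
#                             ds_type="train", num_workers=4)
#   for batch in train_loader:
#       ...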
"""Load YAML config files""" import sys import os import yaml import src.utils.logging as logging logger = logging.get_logger(__name__) class ConfigLoader: @staticmethod def _load_yaml_content(fname): """Load and check content of a given YAML file name Args: fname: path to the config file Return: content: content of the YAML file """ assert os.path.isfile(fname), 'Config file not found: {}'.format(fname) with open(fname, 'r') as stream: try: content = yaml.safe_load(stream) except yaml.YAMLError as exc: logger.error(exc) sys.exit(-1) return content @staticmethod def load_model_cfg(fname):