def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device_id', type=int, default=1,
                        help='device on which the model is evaluated')
    args, model_settings = eval_config(parser)
    context.set_context(mode=context.GRAPH_MODE, device_target="Davinci",
                        device_id=args.device_id)

    # Logger
    args.outputs_dir = os.path.join(
        args.log_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir)

    # Show args
    args.logger.save_args(args)

    # Find model path(s): a directory yields all checkpoints sorted by
    # epoch (newest first); a single file path is evaluated on its own.
    if os.path.isdir(args.model_dir):
        models = list(glob.glob(os.path.join(args.model_dir, '*.ckpt')))
        print(models)
        # Sort key: negate the epoch number parsed from the file name so the
        # checkpoint with the highest epoch sorts first.
        f = lambda x: -1 * int(
            os.path.splitext(os.path.split(x)[-1])[0].split('-')[0].split('epoch')[-1])
        args.models = sorted(models, key=f)
    else:
        args.models = [args.model_dir]

    args.best_acc = 0
    args.index = 0
    args.best_index = 0
    for model_path in args.models:
        test_de = audio_dataset(args.feat_dir, 'testing',
                                model_settings['spectrogram_length'],
                                model_settings['dct_coefficient_count'],
                                args.per_batch_size)
        network = DSCNN(model_settings, args.model_size_info)
        load_ckpt(network, model_path, False)
        network.set_train(False)
        model = Model(network)
        args.logger.info('load model {} success'.format(model_path))
        val(args, model, test_de)
        args.index += 1

    args.logger.info('Best model: {} acc: {:.2f}%'.format(
        args.models[args.best_index], args.best_acc))
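# Illustration (not part of the repo): how the sort key above orders
# checkpoints. The file names below are hypothetical examples of the assumed
# "epoch<N>-..." naming scheme; the real names depend on the training script.
import os

names = ['epoch3-1.ckpt', 'epoch12-1.ckpt', 'epoch7-1.ckpt']
key = lambda x: -1 * int(
    os.path.splitext(os.path.split(x)[-1])[0].split('-')[0].split('epoch')[-1])
print(sorted(names, key=key))  # ['epoch12-1.ckpt', 'epoch7-1.ckpt', 'epoch3-1.ckpt']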
args_opt.dataset_path = os.path.abspath(args_opt.dataset_path)
config = set_config(args_opt)
start = time.time()
print(f"train args: {args_opt}\ncfg: {config}")

# Set context and init device
context_device_init(config)

# Define network
backbone_net, head_net, net = define_net(config, args_opt.is_training)

dataset = create_dataset(dataset_path=args_opt.dataset_path,
                         do_train=True,
                         config=config)
step_size = dataset.get_dataset_size()
if args_opt.pretrain_ckpt:
    if args_opt.freeze_layer == "backbone":
        load_ckpt(backbone_net, args_opt.pretrain_ckpt, trainable=False)
        step_size = extract_features(backbone_net, args_opt.dataset_path, config)
    else:
        load_ckpt(net, args_opt.pretrain_ckpt)
if step_size == 0:
    raise ValueError("The step_size of dataset is zero. Check whether the train "
                     "dataset contains more images than batch_size in config.py.")

# Currently, only Ascend supports switching precision.
switch_precision(net, mstype.float16, config)

# Define loss
if config.label_smooth > 0:
    loss = CrossEntropyWithLabelSmooth(smooth_factor=config.label_smooth,
                                       num_classes=config.num_classes)
else:
from mindspore.common import dtype as mstype

from src.dataset import create_dataset
from src.config import set_config
from src.args import eval_parse_args
from src.models import define_net, load_ckpt
from src.utils import switch_precision, set_context

if __name__ == '__main__':
    args_opt = eval_parse_args()
    config = set_config(args_opt)
    backbone_net, head_net, net = define_net(config, args_opt.is_training)

    # Load the trained checkpoint file(s) into the net for evaluation.
    if args_opt.head_ckpt:
        load_ckpt(backbone_net, args_opt.pretrain_ckpt)
        load_ckpt(head_net, args_opt.head_ckpt)
    else:
        load_ckpt(net, args_opt.pretrain_ckpt)

    set_context(config)
    switch_precision(net, mstype.float16, config)

    dataset = create_dataset(dataset_path=args_opt.dataset_path,
                             do_train=False,
                             config=config)
    step_size = dataset.get_dataset_size()
    if step_size == 0:
        raise ValueError("The step_size of dataset is zero. Check whether the eval "
                         "dataset contains more images than batch_size in config.py.")
parser.add_argument('--file_format',
                    type=str,
                    choices=["AIR", "ONNX", "MINDIR"],
                    default="AIR",
                    help="file format")
parser.add_argument('--platform',
                    type=str,
                    default="Ascend",
                    choices=("Ascend", "GPU", "CPU"),
                    help='run platform, only supports Ascend, GPU and CPU')
args = parser.parse_args()
args.is_training = False
args.run_distribute = False

context.set_context(mode=context.GRAPH_MODE, device_target=args.platform)
if args.platform == "Ascend":
    context.set_context(device_id=args.device_id)

if __name__ == '__main__':
    cfg = set_config(args)
    set_context(cfg)
    _, _, net = define_net(cfg, args.is_training)

    load_ckpt(net, args.ckpt_file)
    input_shp = [args.batch_size, 3, cfg.image_height, cfg.image_width]
    input_array = Tensor(
        np.random.uniform(-1.0, 1.0, size=input_shp).astype(np.float32))
    export(net, input_array, file_name=args.file_name,
           file_format=args.file_format)
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""mobilenetv2 export mindir."""
import numpy as np

from mindspore import Tensor, export

from src.config import set_config
from src.args import export_parse_args
from src.models import define_net, load_ckpt
from src.utils import set_context

if __name__ == '__main__':
    args_opt = export_parse_args()
    cfg = set_config(args_opt)
    set_context(cfg)
    _, _, net = define_net(cfg, args_opt.is_training)

    load_ckpt(net, args_opt.pretrain_ckpt)
    input_shp = [1, 3, cfg.image_height, cfg.image_width]
    input_array = Tensor(
        np.random.uniform(-1.0, 1.0, size=input_shp).astype(np.float32))
    export(net, input_array, file_name=cfg.export_file,
           file_format=cfg.export_format)
def test_on_model(args):
    device = args.device
    if device == 'cpu':
        raise NotImplementedError("CPU testing is not implemented.")
    device = torch.device(args.device)
    torch.cuda.set_device(device)

    # Build model
    model = build_model(args)
    model.to(device)

    # Output dir
    p_out = Path(args.p_out).joinpath(f"{model.name}-{args.tensorboard_exp_name}")
    if not p_out.exists():
        p_out.mkdir(exist_ok=True, parents=True)

    # Dataset & loader
    test_dataset = MTTDataset(path=args.p_data, split='test')
    test_loader = DataLoader(test_dataset,
                             batch_size=args.batch_size,
                             shuffle=True,
                             num_workers=args.n_workers,
                             pin_memory=True,
                             drop_last=False)  # not dropping last in testing
    test_steps = test_dataset.calc_steps(args.batch_size,
                                         drop_last=False)  # not dropping last in testing
    LOG.info(f"Total testing steps: {test_steps}")
    LOG.info(f"Testing data size: {len(test_dataset)}")

    # Create loss
    loss_fn = get_loss(args.loss)

    # Create metric
    metric = AUCMetric()

    # Load checkpoint OR init state_dict
    if args.checkpoint is not None:
        state_dict = load_ckpt(args.checkpoint,
                               reset_epoch=args.ckpt_epoch,
                               no_scheduler=args.ckpt_no_scheduler,
                               no_optimizer=args.ckpt_no_optimizer,
                               no_loss_fn=args.ckpt_no_loss_fn,
                               map_values=args.ckpt_map_values)
        model_dict = {'model': model} if 'model' in state_dict else None
        apply_state_dict(state_dict, model=model_dict)
        best_val_loss = state_dict['val_loss']
        epoch = state_dict['epoch']
        global_i = state_dict['global_i']
        LOG.info(f"Checkpoint loaded. Epoch trained {epoch}, "
                 f"global_i {global_i}, best val {best_val_loss:.6f}")
    else:
        raise AssertionError("A pre-trained checkpoint must be provided.")

    # Summary writer
    writer = SummaryWriter(log_dir=p_out.as_posix(), filename_suffix='-test')

    # Start testing
    model.eval()
    sigmoid = Sigmoid().to(device)
    status_col = TextColumn("")
    running_loss = 0
    if args.data_normalization:
        fetcher = DataPrefetcher(test_loader, mean=MTT_MEAN, std=MTT_STD, device=device)
    else:
        fetcher = DataPrefetcher(test_loader, mean=None, std=None, device=device)
    samples, targets = fetcher.next()

    with Progress("[progress.description]{task.description}",
                  "[{task.completed}/{task.total}]",
                  BarColumn(),
                  "[progress.percentage]{task.percentage:>3.0f}%",
                  TimeRemainingColumn(),
                  TextColumn("/"),
                  TimeElapsedColumn(),
                  status_col,
                  expand=False,
                  console=CONSOLE,
                  refresh_per_second=5) as progress:
        task = progress.add_task(description='[Test]', total=test_steps)
        i = 0  # counter
        t_start = time.time()
        with torch.no_grad():
            while samples is not None:
                # Forward model
                logits = model(samples)
                out = sigmoid(logits)
                test_loss = loss_fn(logits, targets)

                # Collect running loss
                running_loss += test_loss.item()
                i += 1
                writer.add_scalar('Test/Loss', running_loss / i, i)

                # AUC metric
                metric.step(targets.cpu().numpy(), out.cpu().numpy())

                # Pre-fetch next samples
                samples, targets = fetcher.next()

                if not progress.finished:
                    status_col.text_format = f"Test loss: {running_loss / i:.06f}"
                    progress.update(task, advance=1)

    auc_tag, auc_sample, ap_tag, ap_sample = metric.auc_ap_score
    LOG.info(f"Testing speed: {(time.time() - t_start) / i:.4f}s/it, "
             f"auc_tag: {auc_tag:.04f}, "
             f"auc_sample: {auc_sample:.04f}, "
             f"ap_tag: {ap_tag:.04f}, "
             f"ap_sample: {ap_sample:.04f}")
    writer.close()
    return
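# Hypothetical sketch (not the repo's class) of the DataPrefetcher contract the
# loop above relies on: wrap a DataLoader, move each batch to the device,
# optionally normalize it with the given mean/std, and return (None, None) once
# the loader is exhausted so `while samples is not None` terminates. The real
# implementation may additionally overlap host-to-device copies on a CUDA stream.
import torch
from torch.utils.data import DataLoader

class DataPrefetcherSketch:
    def __init__(self, loader: DataLoader, mean=None, std=None, device=None):
        self.it = iter(loader)
        self.mean, self.std, self.device = mean, std, device

    def next(self):
        try:
            samples, targets = next(self.it)
        except StopIteration:
            return None, None  # signals the end of the epoch
        samples = samples.to(self.device, non_blocking=True)
        targets = targets.to(self.device, non_blocking=True)
        if self.mean is not None:
            samples = (samples - self.mean) / self.std
        return samples, targets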
def eval_on_model(args):
    device = args.device
    if device == 'cpu':
        raise NotImplementedError("CPU evaluation is not implemented.")
    device = torch.device(args.device)
    torch.cuda.set_device(device)

    # Build model
    model = build_model(args)
    model.to(device)

    # Output dir
    p_out = Path(args.p_out).joinpath(f"{model.name}-{args.tensorboard_exp_name}")
    if not p_out.exists():
        p_out.mkdir(exist_ok=True, parents=True)

    # Dataset & loader
    annotation = pd.read_csv(args.annotation_file)
    query = annotation[annotation.mp3_path.str.match(
        '/'.join(args.audio_file.split('/')[-2:]))]
    assert query.shape[0] != 0, f"Cannot find the audio file: {args.audio_file}"

    # Split off audio info and segment the audio
    threshold = args.eval_threshold
    song_info = query[query.columns.values[50:]]
    tags = query.columns.values[:50]
    labels = query[tags].values[0]
    label_names = tags[labels.astype(bool)]
    segments = _segment_audio(_load_audio(args.audio_file, sample_rate=22050),
                              n_samples=59049)
    LOG.info(f"Song info: {song_info}")
    LOG.info(f"Number of segments: {len(segments)}")
    LOG.info(f"Ground truth tags: {label_names}")
    LOG.info(f"Positive tag threshold: {threshold}")

    # Create loss
    loss_fn = get_loss(args.loss)

    # Load checkpoint OR init state_dict
    if args.checkpoint is not None:
        state_dict = load_ckpt(args.checkpoint,
                               reset_epoch=args.ckpt_epoch,
                               no_scheduler=args.ckpt_no_scheduler,
                               no_optimizer=args.ckpt_no_optimizer,
                               no_loss_fn=args.ckpt_no_loss_fn,
                               map_values=args.ckpt_map_values)
        model_dict = {'model': model} if 'model' in state_dict else None
        apply_state_dict(state_dict, model=model_dict)
        best_val_loss = state_dict['val_loss']
        epoch = state_dict['epoch']
        global_i = state_dict['global_i']
        LOG.info(f"Checkpoint loaded. Epoch trained {epoch}, "
                 f"global_i {global_i}, best val {best_val_loss:.6f}")
    else:
        raise AssertionError("A pre-trained checkpoint must be provided.")

    # Start evaluation
    model.eval()
    sigmoid = Sigmoid().to(device)
    t_start = time.time()

    # Concatenate segments; an MTT clip yields 10 full 59049-sample segments,
    # so the ground-truth labels are repeated 10 times to match.
    segments = torch.from_numpy(
        np.concatenate([seg.reshape(1, 1, -1) for seg in segments])
    ).to(torch.float32).cuda(device=device)
    targets = torch.from_numpy(
        np.concatenate([labels.reshape(1, -1)] * 10)
    ).to(torch.float32).cuda(device=device)

    # Forward pass
    with torch.no_grad():
        logits = model(segments)
        out = sigmoid(logits)
        loss = loss_fn(logits, targets)

    # Count, per tag, in how many segments the prediction exceeds the threshold.
    out = out.cpu().numpy()
    out[out > threshold] = 1
    out[out <= threshold] = 0
    out = np.sum(out, axis=0)
    res = pd.DataFrame(data={'tags': tags, 'freq': out})
    res = res[res.freq != 0].sort_values(by='freq', ascending=False)
    CONSOLE.print(res)
    LOG.info(f"Evaluation speed: {time.time() - t_start:.4f}s, "
             f"loss: {loss.item()}")
    return
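# Hypothetical sketch (not the repo's implementation) of the contract assumed
# for the _load_audio/_segment_audio helpers used above: split a mono waveform
# into consecutive, non-overlapping windows of n_samples, dropping the
# incomplete tail. A ~29.1 s MTT clip at 22,050 Hz (~641k samples) yields the
# 10 full 59,049-sample segments that eval_on_model relies on.
import numpy as np

def _segment_audio_sketch(waveform: np.ndarray, n_samples: int = 59049):
    n_segments = len(waveform) // n_samples
    return [waveform[i * n_samples:(i + 1) * n_samples]
            for i in range(n_segments)]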
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ===========================================================================
"""DSCNN export."""
import argparse

import numpy as np
from mindspore import Tensor
from mindspore.train.serialization import export

from src.config import eval_config
from src.ds_cnn import DSCNN
from src.models import load_ckpt

parser = argparse.ArgumentParser()
args, model_settings = eval_config(parser)
network = DSCNN(model_settings, args.model_size_info)
load_ckpt(network, args.model_dir, False)
x = np.random.uniform(0.0, 1.0, size=[
    1, 1, model_settings['spectrogram_length'],
    model_settings['dct_coefficient_count']
]).astype(np.float32)
export(network, Tensor(x),
       file_name=args.model_dir.replace('.ckpt', '.air'),
       file_format='AIR')
def train_on_model(args):
    if args.device == 'cpu':
        raise NotImplementedError("CPU training is not implemented.")
    device = torch.device(args.device)
    torch.cuda.set_device(device)

    # Build model
    model = build_model(args)
    model.to(device)

    # Output dir
    p_out = Path(args.p_out).joinpath(f"{model.name}-{args.tensorboard_exp_name}")
    if not p_out.exists():
        p_out.mkdir(exist_ok=True, parents=True)

    # Dataset & loader
    train_dataset = MTTDataset(path=args.p_data, split='train')
    val_dataset = MTTDataset(path=args.p_data, split='val')
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.n_workers,
                              pin_memory=True,
                              drop_last=True)
    val_loader = DataLoader(val_dataset,
                            batch_size=args.batch_size,
                            shuffle=True,
                            num_workers=args.n_workers,
                            pin_memory=True,
                            drop_last=True)
    train_steps = train_dataset.calc_steps(args.batch_size)
    val_steps = val_dataset.calc_steps(args.batch_size)
    if args.data_normalization:
        normalize = (MTT_MEAN, MTT_STD)
        LOG.info("Data normalization [bold cyan]on[/]")
    else:
        normalize = None
    LOG.info(f"Total training steps: {train_steps}")
    LOG.info(f"Total validation steps: {val_steps}")
    LOG.info(f"Training data size: {len(train_dataset)}")
    LOG.info(f"Validation data size: {len(val_dataset)}")

    # Create optimizer
    optim = get_optimizer(model.parameters(), args=args)

    # Create loss
    loss_fn = get_loss(args.loss)

    # Create schedulers
    scheduler_plateau = ReduceLROnPlateau(optim,
                                          factor=args.lr_decay_plateau,
                                          patience=args.plateau_patience,
                                          min_lr=args.min_lr,
                                          verbose=True,
                                          prefix="[Scheduler Plateau] ",
                                          logger=LOG)
    scheduler_es = EarlyStopping(patience=args.early_stop_patience,
                                 min_delta=args.early_stop_delta,
                                 verbose=True,
                                 prefix="[Scheduler Early Stop] ",
                                 logger=LOG)

    # Load checkpoint OR init state_dict
    if args.checkpoint is not None:
        state_dict = load_ckpt(args.checkpoint,
                               reset_epoch=args.ckpt_epoch,
                               no_scheduler=args.ckpt_no_scheduler,
                               no_optimizer=args.ckpt_no_optimizer,
                               no_loss_fn=args.ckpt_no_loss_fn,
                               map_values=args.ckpt_map_values)
        model_dict = {'model': model} if 'model' in state_dict else None
        optim_dict = {'optim': optim} if 'optim' in state_dict else None
        loss_fn_dict = {'loss_fn': loss_fn} if 'loss_fn' in state_dict else None
        scheduler_dict = {'scheduler_plateau': scheduler_plateau} \
            if 'scheduler_plateau' in state_dict else None
        apply_state_dict(state_dict,
                         model=model_dict,
                         optimizer=optim_dict,
                         loss_fn=loss_fn_dict,
                         scheduler=scheduler_dict)
        best_val_loss = state_dict['val_loss']
        epoch = state_dict['epoch']
        global_i = state_dict['global_i']
        LOG.info(f"Checkpoint loaded. Epoch trained {epoch}, "
                 f"global_i {global_i}, best val {best_val_loss:.6f}")
    else:
        # Fresh training
        best_val_loss = 9999
        epoch = 0
        global_i = 0

    # Tensorboard
    purge_step = None if global_i == 0 else global_i
    writer = SummaryWriter(log_dir=VAR.log
                           .joinpath('tensorboard')
                           .joinpath(f"{model.name}-{args.tensorboard_exp_name}")
                           .as_posix(),
                           purge_step=purge_step,
                           filename_suffix='-train')

    # Train model for epochs
    assert epoch < args.max_epoch, \
        "Initial epoch value must be smaller than max_epoch in order to train the model"
    for _ in range(epoch, args.max_epoch):
        # Train
        init_lr = optim.param_groups[0]['lr']
        train_loss, global_i = train_one_epoch(model, optim, loss_fn, train_loader,
                                               epoch + 1, train_steps, device, writer,
                                               global_i,
                                               writer_interval=args.tensorboard_interval,
                                               normalize=normalize)

        # Validate
        val_loss = evaluate(model, loss_fn, val_loader, epoch + 1, val_steps,
                            device, normalize=normalize)
        writer.add_scalar('Loss/Val', val_loss, global_i)
        epoch += 1

        # Update schedulers
        scheduler_plateau.step(val_loss)
        scheduler_es.step(val_loss)

        # The same state is saved in all three branches below.
        def _state():
            return {
                'model': model.state_dict(),
                'optim': optim.state_dict(),
                'loss_fn': loss_fn.state_dict(),
                'scheduler_plateau': scheduler_plateau.state_dict(),
                'scheduler_es': scheduler_es.state_dict(),
                'epoch': epoch,
                'loss': train_loss,
                'val_loss': val_loss,
                'p_out': p_out,
                'global_i': global_i
            }

        # Save a checkpoint when the plateau scheduler has lowered the lr
        if optim.param_groups[0]['lr'] != init_lr:
            LOG.info(f"Saving [red bold]checkpoint[/] at epoch {epoch}, "
                     f"model saved to {p_out.as_posix()}")
            torch.save(_state(),
                       p_out.joinpath(f'ckpt@epoch-{epoch:03d}-loss-{val_loss:.6f}.tar').as_posix())

        # Save the best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            LOG.info(f"New [red bold]best[/] validation loss {val_loss:.6f}, "
                     f"model saved to {p_out.as_posix()}")
            torch.save(_state(),
                       p_out.joinpath(f'best@epoch-{epoch:03d}-loss-{val_loss:.6f}.tar').as_posix())
        # Save the latest model
        else:
            torch.save(_state(), p_out.joinpath('latest.tar').as_posix())

        # Early stop, if enabled
        if scheduler_es.early_stop:
            break

        # Load the optimal model when the lr changed, if enabled
        if optim.param_groups[0]['lr'] != init_lr and args.load_optimal_on_plateau:
            # Save lr before restoring
            cur_lr = [param_group['lr'] for param_group in optim.param_groups]
            # Restore last best model
            state_dict = find_optimal_model(p_out)
            apply_state_dict(state_dict,
                             model={'model': model},
                             optimizer={'optim': optim},
                             loss_fn=None,
                             scheduler=None)
            apply_lr(optim, cur_lr)
            # Reset global_i and epoch to the restored checkpoint
            global_i = state_dict['global_i']
            epoch = state_dict['epoch']
            LOG.info(f"Best model (val loss {state_dict['val_loss']}) applied. "
                     f"Roll back to epoch {epoch}")
            # Reset tensorboard writer
            writer.close()
            writer = SummaryWriter(log_dir=VAR.log
                                   .joinpath('tensorboard')
                                   .joinpath(f"{model.name}-{args.tensorboard_exp_name}")
                                   .as_posix(),
                                   purge_step=global_i,
                                   filename_suffix='-train')

    # Close tensorboard
    writer.close()
    return
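# Illustration (assumed usage, not repo code): the checkpoints written above are
# plain torch.save dicts, so they can be inspected or resumed directly. The
# path is hypothetical; real files live under the run's p_out directory.
import torch

state = torch.load('latest.tar', map_location='cpu')
print(state['epoch'], state['val_loss'], state['global_i'])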