def test_create_model():
    """Smoke test: create_model builds both model kinds without
    raising and tags each model with the requested name."""
    cases = (
        ("some_binary_model", ModelType.BINARY),
        ("some_masked_model", ModelType.MASKED),
    )
    for expected_name, kind in cases:
        model = create_model(expected_name, kind)
        assert model.__asf_model_name == expected_name
def __init__(self, project_parameters) -> None:
    """Build the inference model (eval mode) and the predict-stage
    transform from the project configuration."""
    self.project_parameters = project_parameters
    model = create_model(project_parameters=project_parameters).eval()
    # move the model to GPU only when the configuration asks for it
    self.model = model.cuda() if project_parameters.use_cuda else model
    transforms = get_transform_from_file(
        filepath=project_parameters.transform_config_path)
    self.transform = transforms['predict']
def train_model(dpath, ppath, epoch, version):
    """Train a model on a CSV dataset and save it under model/<version>.

    dpath: path to a .csv data file; ppath: path to a joblib-serialized
    preprocessing pipeline; epoch: number of training epochs.
    """
    if not dpath.endswith(".csv"):
        raise ValueError("data format is not supported")
    data = pd.read_csv(dpath)
    encoder = Encoder(joblib.load(ppath))
    # NOTE(review): assumes column 0 is an id and the last column is
    # the label — confirm against the data schema
    features = encoder.encode(data.iloc[:, 1:-1])
    labels = data.iloc[:, -1]
    model = create_model([features.shape[1]])
    model.fit(features, labels, batch_size=1000, epochs=epoch)
    model.save(f"model/{version}")
def train(project_parameters):
    """Train a model and evaluate it on every dataset split.

    Returns a dict containing the fitted trainer, the model, and the
    per-stage test results keyed by stage name.
    """
    seed_everything(seed=project_parameters.random_seed)
    if project_parameters.use_balance:
        # derive class weights from the dataset to counter imbalance
        # NOTE(review): exact weighting scheme lives in
        # calculate_data_weight — not visible here
        project_parameters.data_weight = calculate_data_weight(
            classes=project_parameters.classes,
            data_path=project_parameters.data_path)
    data_module = DataModule(project_parameters=project_parameters)
    model = create_model(project_parameters=project_parameters)
    trainer = _get_trainer(project_parameters=project_parameters)
    trainer.fit(model=model, datamodule=data_module)
    result = {'trainer': trainer, 'model': model}
    # silence the progress bar while running the per-split evaluations,
    # then restore it afterwards (uses Lightning trainer internals)
    trainer.callback_connector.configure_progress_bar().disable()
    for stage, data_loader in data_module.get_data_loaders().items():
        print('\ntest the {} dataset'.format(stage))
        print('the {} dataset confusion matrix:'.format(stage))
        result[stage] = trainer.test(test_dataloaders=data_loader)
    trainer.callback_connector.configure_progress_bar().enable()
    return result
def train_wrapper(args: Namespace) -> None:
    """Resolve the model (fresh or resumed) and run training on it."""
    data_type = dataset_type(args.dataset)
    name = args.model
    if args.cont:
        # resuming: reload the saved model together with its history
        model = load_model(name)
        history = model.__asf_model_history
    else:
        path = path_from_model_name(name)
        if not args.overwrite and os.path.isfile(path):
            print(f"File {name} already exists!")
            return
        model = create_model(name, data_type)
        history = {
            "loss": [],
            "acc": [],
            "val_loss": [],
            "val_acc": [],
        }
    # refuse to train a model on an incompatible dataset type
    if model_type(model) != data_type:
        print("ERROR: This dataset is not compatible with your model")
        return
    train_model(model, history, args.dataset, args.epochs)
def main(cfg):
    """ Performs evaluation.

    Loads the checkpointed token-classification model, runs it over
    the evaluation set, writes predictions in CoNLL format and scores
    them with the external `conlleval` script.
    """
    cfg.cuda = torch.cuda.is_available()
    assert cfg.ckpt_path is not None, \
        'ckpt_path must be given'
    # the label map is stored next to the checkpoint by default
    model_dir = abspath(dirname(cfg.ckpt_path))
    output_dir = os.getcwd()
    device = torch.device('cuda' if cfg.cuda else 'cpu')
    output_path = join(output_dir, 'results.ner')
    labels_path = join(model_dir, 'labels.json') \
        if cfg.labels_path is None else \
        cfg.labels_path
    with open(labels_path, 'r') as fh:
        label2id = json.load(fh)
    id2label = {v: k for k, v in label2id.items()}
    xlmr = create_pretrained(cfg.model_type, cfg.force_download)
    encode_fn = functools.partial(
        encode_example, xlmr=xlmr, label2id=label2id)
    decode_fn = functools.partial(
        decode_example, xlmr=xlmr, id2label=id2label)
    model = create_model(xlmr, len(label2id), cfg)
    model.to(device)
    state_dict = torch.load(cfg.ckpt_path, map_location=device)
    model.load_state_dict(state_dict['model'])
    model.eval()

    def to_list(tensor):
        """ Converts the provided tensor to a python list. """
        return tensor.cpu().numpy().tolist()

    def to_torch(tensor):
        """ Converts the provided tf array to torch tensor. """
        return torch.from_numpy(tensor.numpy()).to(device)

    pad_id = xlmr.task.dictionary.pad()
    dataset = create_jsonl_loader(
        cfg.batch_size, cfg.eval_data_path, encode_fn, pad_id)
    print()
    print('***** Running evaluation *****')
    print()
    results = []
    with torch.no_grad():
        for batch in tqdm(dataset, leave=False):
            input_ids, label_ids = batch
            input_ids = to_torch(input_ids).long()
            label_ids = to_torch(label_ids).long()
            logits = model(input_ids)
            pred_ids = logits.argmax(dim=-1)
            lists = zip(
                to_list(pred_ids),
                to_list(label_ids),
                to_list(input_ids))
            for pred_list, label_list, token_list in lists:
                # mask predictions wherever the gold label is the
                # -1 ignore marker so they are dropped on decode
                pred_list = [
                    (pred if label != -1 else -1)
                    for pred, label in zip(pred_list, label_list)]
                tokens, labels = decode_fn(token_list, label_list)
                _, preds = decode_fn(token_list, pred_list)
                results.append((tokens, labels, preds))
    outputs = []
    for result in results:
        # one "token gold pred" line per token; sentences are
        # blank-line separated below, as conlleval expects
        outputs.append('\n'.join(
            '{} {} {}'.format(*values)
            for values in zip(*result)))
    with open(output_path, 'w') as fh:
        fh.write('\n\n'.join(outputs))
    # score with the external conlleval script
    # NOTE(review): shell=True with a path-built command — acceptable
    # for a trusted local script, unsafe for untrusted paths
    command = '{} < {}'.format(
        join(PROJECT_DIR, 'scripts', 'conlleval'), output_path)
    result = subprocess.check_output(
        command, shell=True, stderr=subprocess.STDOUT)
    result = result.decode('utf-8')
    print(result)
    stats_path = join(output_dir, 'results.txt')
    with open(stats_path, 'w') as fh:
        # persist the training config (when found) with the scores
        config_str = try_load_config(model_dir)
        if config_str is not None:
            print(yaml.dump(config_str), file=fh)
        print(result, file=fh)
def main():
    """ Interactive evaluation: loads the checkpointed dialogue
    model and answers user input in a read-eval-print loop.
    """
    args = setup_eval_args()
    args.distributed = False
    args.cuda = not args.no_cuda and \
        torch.cuda.is_available()
    # setting random seed for reproducibility
    if args.seed is not None:
        set_random_seed(args)
    device = torch.device('cuda' if args.cuda else 'cpu')
    assert args.name is not None, \
        '`--name` must be given'
    model_dir = join(args.model_dir, args.model, args.name)
    model_path = args.model_file if \
        args.model_file else \
        join(model_dir, args.ckpt_name + '.pt')
    state_dict = torch.load(model_path, map_location=device)
    # the optimizer state is not needed for inference
    del state_dict['optimizer']
    tokenizer = create_tokenizer(args)
    vocab_size = len(tokenizer)
    model = create_model(args, model_dir, vocab_size)
    model = model.to(device)
    try:
        model.load_state_dict(state_dict.pop('model'))
        model.eval()
    except RuntimeError:
        print('The provided checkpoint has mismatching '
              'weights in the parameter dict.')
        print('WARNING: If the model was trained with '
              '`--grad_ckpt` you also have to provide '
              'this argument for this script.')
        sys.exit()
    print()
    print(tabulate(state_dict.items(), tablefmt='presto'))
    print()
    history = []
    select_fn = METHODS[args.decoding]
    special_ids = tokenizer.convert_tokens_to_ids([
        SP1, SP2, tokenizer.bos_token,
        tokenizer.eos_token, HST, RSP,
    ])

    @torch.no_grad()
    def respond(text):
        """ Responds to the given text. """
        history.append(tokenizer.encode(text))
        inputs = transform_dialog(
            history[-args.max_hist:],
            special_ids=special_ids,
            max_len=args.max_len)
        input_ids, type_ids = inputs
        inputs = [[input_ids], [type_ids]]
        preds = decode(args=args, model=model,
                       inputs=inputs, tokenizer=tokenizer,
                       select_fn=select_fn,
                       device=device)[0]
        history.append(preds)
        # last token is the end token
        return tokenizer.decode(preds)

    print('Type a sentence for response. ' +
          'CTRL + C to escape.')
    while True:
        try:
            print()
            # FIX: this sequence was garbled in the source
            # (`input('User: '******'Bot: ...`); restored the obvious
            # prompt -> respond -> print loop
            text = input('User: ')
            output = respond(text)
            print('Bot: {}'.format(output))
        except KeyboardInterrupt:
            break
def main():
    """ Performs training, validation and testing.

    Builds the dataset, model and optimizer, runs the epoch loop with
    periodic validation, early stopping and checkpointing, and finally
    evaluates on the test split.
    """
    args = setup_train_args()
    args.cuda = torch.cuda.is_available() \
        and not args.no_cuda
    model_dir = join(args.model_dir, args.model, args.name)
    os.makedirs(model_dir, exist_ok=True)
    logger = create_logger(model_dir=model_dir)
    if args.mixed and not APEX_INSTALLED:
        logger.warn('--mixed passed but apex is not installed.')
    # mixed precision requires both apex and cuda
    args.mixed = args.mixed and APEX_INSTALLED \
        and args.cuda
    master_process = args.local_rank in [0, -1]
    # NOTE(review): the sibling trainer uses `local_rank != -1` here;
    # with `> 0` the rank-0 worker never joins the process group —
    # confirm this is intended
    args.distributed = args.local_rank > 0
    if args.distributed:
        # use distributed training if local rank is given
        # and GPU training is requested
        torch.cuda.set_device(args.local_rank)
        device = torch.device('cuda', args.local_rank)
        torch.distributed.init_process_group(
            backend='nccl',
            init_method='env://',
            rank=args.local_rank)
    else:
        device = torch.device(
            'cuda' if args.cuda else 'cpu')
    # creating dataset and storing dataset splits
    # as individual variables for convenience
    datasets, tokenizer = create_dataset(
        args=args, master_process=master_process)
    pad_idx = tokenizer.convert_tokens_to_ids(
        tokenizer.pad_token)
    vocab_size = len(tokenizer)
    # TODO fix xlnet nan with mixed precision
    if 'xlnet' in args.model:
        args.mixed = False
    model = create_model(args=args, model_dir=model_dir,
                         vocab_size=vocab_size)
    model = model.to(device)
    optimizer = create_optimizer(
        args=args, parameters=model.parameters())
    if master_process:
        writer = SummaryWriter(
            logdir=model_dir, flush_secs=100)
    # loading previous state of the training
    best_val_loss, init_epoch, step = load_state(
        model_dir=model_dir, model=model,
        optimizer=optimizer, logger=logger,
        device=device)
    if args.mixed:
        model, optimizer = amp.initialize(
            model, optimizer, opt_level='O2')
    if args.distributed:
        model = DistributedDataParallel(
            model, device_ids=[args.local_rank],
            output_device=args.local_rank)
    world_size = int(os.environ.get('WORLD_SIZE', 1))
    # computing the sizes of the dataset splits
    train, valid, test = [
        (split, ceil(size / args.batch_size / world_size))
        for split, size in datasets
    ]
    train_dataset, num_train_steps = train
    valid_dataset, num_valid_steps = valid
    test_dataset, num_test_steps = test
    patience, skip, loss, acc = 0, 0, 0, 0

    def reduce_tensor(tensor):
        """ Averages a tensor across gpus. """
        reduced = tensor.clone()
        all_reduce(reduced, op=ReduceOp.SUM)
        reduced /= world_size
        return reduced

    def forward_step(batch):
        """ Applies forward pass with the given batch. """
        inputs, targets = batch
        outputs = model(inputs=inputs, half=args.mixed)
        # converting targets from ndarray
        targets = torch.as_tensor(targets)
        targets = targets.long().to(device)
        loss, accuracy = compute_loss(
            outputs=outputs, targets=targets,
            ignore_idx=pad_idx)
        if args.distributed:
            # reducing accuracy accross devices
            # for more accurate logging
            accuracy = reduce_tensor(accuracy)
        return loss, accuracy.item()

    def train_step(batch):
        """ Performs a single step of training. """
        nonlocal step, skip
        loss, accuracy = forward_step(batch)
        if torch.isnan(loss).item():
            logger.debug('skipping step (nan)')
            # returning None values when a NaN loss
            # is encountered and skipping backprop
            # so model grads will not be corrupted
            skip += 1
            return None, None
        loss /= args.grad_accum_steps
        backward(loss)
        clip_grad_norm(1.0)
        step += 1
        if step % args.grad_accum_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        if args.distributed:
            # reducing loss accross devices for
            # more accurate logging
            loss = reduce_tensor(loss)
        return loss.item(), accuracy

    def backward(loss):
        """ Backpropagates the loss in either
        mixed or normal precision mode.
        """
        # cuda is required for mixed precision training.
        if args.mixed:
            with amp.scale_loss(loss, optimizer) as scaled:
                scaled.backward()
        else:
            loss.backward()

    def clip_grad_norm(max_norm):
        """ Applies gradient clipping. """
        if args.mixed:
            clip_grad_norm_(
                amp.master_params(optimizer), max_norm)
        else:
            clip_grad_norm_(model.parameters(), max_norm)

    def evaluate(dataset, num_steps):
        """ Constructs a validation loader and
        evaluates the model, yielding per-batch losses.
        """
        loop = tqdm(dataset(), total=num_steps,
                    disable=not master_process,
                    desc='Eval')
        model.eval()
        for batch in loop:
            loss, acc = forward_step(batch)
            loop.set_postfix(
                ordered_dict=OrderedDict(
                    loss=loss.item(), acc=acc))
            yield loss.item()

    def save_state():
        """ Saves the model and optimizer state. """
        model_path = join(model_dir, 'model.pt')
        state = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'val_loss': best_val_loss,
            'epoch': epoch + 1,
            'step': step
        }
        logger.info('Saving model to {}'.format(model_path))
        # making sure the model saving is not left in a
        # corrupted state after a keyboard interrupt
        while True:
            try:
                torch.save(state, model_path)
                break
            except KeyboardInterrupt:
                pass

    scheduler = LambdaLR(optimizer, compute_lr)
    if master_process:
        logger.info(str(vars(args)))
    for epoch in range(init_epoch, args.max_epochs):
        # running training loop
        loop = tqdm(train_dataset(),
                    total=num_train_steps,
                    disable=not master_process,
                    desc='Train {}'.format(epoch))
        train_loss = []
        model.train()
        for batch in loop:
            try:
                loss, acc = train_step(batch)
                if master_process and loss is not None:
                    train_loss.append(loss)
                    # logging to tensorboard
                    writer.add_scalar('train/loss', loss, step)
                    writer.add_scalar('train/acc', acc, step)
                # periodic validation with early stopping
                if not step % args.eval_every_step:
                    with torch.no_grad():
                        val_loss = mean(
                            evaluate(
                                dataset=valid_dataset,
                                num_steps=num_valid_steps))
                        # switching back to training
                        model.train()
                    if master_process:
                        logger.info('val loss: {:.4}'.format(val_loss))
                        # logging to tensorboard
                        writer.add_scalar('val/loss', val_loss, step)
                    if val_loss < best_val_loss:
                        patience = 0
                        best_val_loss = val_loss
                        if master_process:
                            save_state()
                    else:
                        patience += 1
                        if patience == args.patience:
                            # terminate when max patience
                            # level is hit
                            break
            except RuntimeError as e:
                # out-of-memory batches are skipped, not fatal
                if 'out of memory' in str(e):
                    logger.debug('skipping step (oom)')
                    skip += 1
            loop.set_postfix(
                ordered_dict=OrderedDict(
                    loss=loss, acc=acc, skip=skip))
        if len(train_loss) > 0:
            train_loss = mean(train_loss)
        else:
            train_loss = 0.0
        if master_process:
            logger.info('train loss: {:.4}'.format(train_loss))
        scheduler.step()
    if master_process:
        writer.close()
    with torch.no_grad():
        test_loss = mean(
            evaluate(dataset=test_dataset,
                     num_steps=num_test_steps))
    if master_process:
        logger.info('test loss: {:.4}'.format(test_loss))
def main():
    """ Performs training, validation and testing.

    Parses arguments (optionally overridden by a JSON config), builds
    the dataset/model/optimizer (with optional apex fp16 and
    distributed training), runs the epoch loop with validation-based
    checkpointing and early stopping, then evaluates on the test set.
    """
    args = setup_train_args()
    # notebook environments need the notebook-aware progress bar
    if args.notebook:
        from tqdm import tqdm_notebook as tqdm
    else:
        from tqdm import tqdm
    # if config is provided, then load it
    if args.config is not None:
        with open(args.config, 'r') as fh:
            config = json.load(fh)
        for arg in config:
            setattr(args, arg, config[arg])
    args.cuda = torch.cuda.is_available() \
        and not args.no_cuda
    # setting random seed for reproducibility
    if args.seed:
        set_random_seed(args)
    model_dir = join(args.model_dir, args.model, args.name)
    os.makedirs(model_dir, exist_ok=True)
    logger = create_logger(model_dir=model_dir)
    if args.fp16 and not APEX_INSTALLED:
        logger.warn('--fp16 passed but apex is not installed.')
    # mixed precision requires both apex and cuda
    args.fp16 = args.fp16 and APEX_INSTALLED \
        and args.cuda
    master_process = args.local_rank in [0, -1]
    args.distributed = args.local_rank != -1
    if args.distributed:
        # use distributed training if local rank is given
        # and GPU training is requested
        torch.cuda.set_device(args.local_rank)
        device = torch.device('cuda', args.local_rank)
        torch.distributed.init_process_group(
            backend='nccl',
            init_method='env://',
            rank=args.local_rank)
    else:
        device = torch.device(
            'cuda' if args.cuda else 'cpu')
    # creating dataset and storing dataset splits
    # as individual variables for convenience
    if args.distributed:
        # creating the dataset and model only on
        # a single process ( downloading )
        if master_process:
            _, tokenizer, _ = create_dataset(
                args, master_process)
            vocab_size = len(tokenizer)
            create_model(args, model_dir, vocab_size)
        # other threads are waiting for the data init
        barrier()
    datasets, tokenizer, max_len = create_dataset(
        args=args, master_process=master_process)
    pad_idx = tokenizer.convert_tokens_to_ids(
        tokenizer.pad_token)
    vocab_size = len(tokenizer)
    model = create_model(args, model_dir, vocab_size)
    model = model.to(device)
    # TODO fix xlnet nan with mixed precision
    if 'xlnet' in args.model:
        args.fp16 = False
    optimizer = create_optimizer(
        args=args, parameters=model.parameters())
    if master_process:
        writer = SummaryWriter(
            logdir=model_dir, flush_secs=100)
    # loading previous state of the training
    best_valid_loss, init_epoch, step = load_state(
        model_dir=model_dir, model=model,
        optimizer=optimizer, logger=logger,
        device=device)
    if args.fp16:
        model, optimizer = amp.initialize(
            model, optimizer, opt_level='O2')
    # the embedding size is exposed under different attribute
    # names by the xlnet and gpt2 model configs
    d_model = model.config.d_model if 'xlnet' in \
        args.model else model.config.n_embd
    if args.distributed:
        model = DistributedDataParallel(
            model, device_ids=[args.local_rank],
            output_device=args.local_rank)
    world_size = int(os.environ.get('WORLD_SIZE', 1))
    # computing the sizes of the dataset splits
    train, valid, test = [
        (split, ceil(size / args.batch_size / world_size))
        for split, size in datasets
    ]
    train_dataset, num_train_steps = train
    valid_dataset, num_valid_steps = valid
    test_dataset, num_test_steps = test
    patience, skip, loss, accuracy = 0, 1, 0, 0
    set_lr_fn = partial(
        set_lr, optimizer=optimizer,
        schedule=args.schedule, lr=args.lr,
        warmup_steps=args.warmup_steps,
        d_model=d_model)
    if master_process:
        # loading history for training logs
        history_path = join(model_dir, 'history.json')
        history = defaultdict(list)
        # NOTE the hardcoded values to keep track of
        # in the history
        metrics = ['loss', 'acc', 'ppl']
        headers = ['epoch'] + \
            ['train_' + m for m in metrics] + \
            ['valid_' + m for m in metrics]
        if exists(history_path):
            with open(history_path, 'r') as fh:
                history = json.load(fh)

    def print_results(results):
        """ Prints the history to the standard output. """
        data = list(zip(*[history[h] for h in headers]))
        table = tabulate(
            tabular_data=data, headers=headers,
            floatfmt='.3f')
        # computing the tabular table string and
        # printing only the last element
        print(table.split('\n')[-1])
        msg = ', '.join(
            '{}: {}'.format(n, r)
            for n, r in results.items())
        logger.info(msg)

    def record_history(results):
        """ Records the results into the on-disk history file. """
        # saving history and handling unexpected
        # keyboard interrupt
        for header in headers:
            history[header].append(results[header])
        while True:
            try:
                with open(history_path, 'w') as fh:
                    json.dump(history, fh)
                break
            except KeyboardInterrupt:
                pass

    @contextmanager
    def skip_error():
        """ Convenience function for skipping errors. """
        nonlocal skip
        try:
            # checking out of memory error and
            # proceeding if only a single GPU
            # is used for the training
            yield
        except RuntimeError as e:
            if 'out of memory' in str(e):
                if args.distributed:
                    raise e
                skip += 1

    def reduce_tensor(tensor):
        """ Averages a tensor across gpus. """
        reduced = tensor.clone()
        all_reduce(reduced, op=ReduceOp.SUM)
        reduced /= world_size
        return reduced

    def forward_step(batch):
        """ Applies forward pass with the given batch. """
        inputs, targets = batch
        outputs = model(inputs, half=args.fp16)
        # converting targets from ndarray
        targets = torch.as_tensor(targets)
        targets = targets.long().to(device)
        loss, acc, ppl = compute_loss(
            outputs=outputs, targets=targets,
            ignore_idx=pad_idx)
        if args.distributed:
            # reducing accuracy accross devices
            # for more accurate logging
            acc = reduce_tensor(acc)
        return loss, acc.item(), ppl

    def train_step(batch):
        """ Performs a single step of training. """
        nonlocal step, skip
        loss, acc, ppl = forward_step(batch)
        if torch.isnan(loss).item():
            # during distributed training NaN
            # values are not handled
            if args.distributed:
                raise ValueError('NaN values encountered.')
            logger.debug('skipping step (nan)')
            # returning None values when a NaN loss
            # is encountered and skipping backprop
            # so model grads will not be corrupted
            skip += 1
            # NOTE(review): callers index this result as a dict
            # (results['loss']); a (None, None) tuple would raise a
            # TypeError if this branch is ever taken — confirm
            return None, None
        loss /= args.grad_accum_steps
        backward(loss)
        if args.clip_grad is not None:
            clip_grad_norm(args.clip_grad)
        if step % args.grad_accum_steps == 0:
            set_lr_fn(step)
            optimizer.step()
            optimizer.zero_grad()
        if args.distributed:
            # reducing loss accross devices for
            # more accurate logging
            loss = reduce_tensor(loss)
        step += 1
        return {'loss': loss.item(), 'acc': acc, 'ppl': ppl}

    def backward(loss):
        """ Backpropagates the loss in either
        mixed or normal precision mode.
        """
        # cuda is required for mixed precision training.
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled:
                scaled.backward()
        else:
            loss.backward()

    def clip_grad_norm(max_norm):
        """ Applies gradient clipping. """
        if args.fp16:
            clip_grad_norm_(
                amp.master_params(optimizer), max_norm)
        else:
            clip_grad_norm_(model.parameters(), max_norm)

    def evaluate(dataset, num_steps):
        """ Constructs a validation loader and
        evaluates the model, yielding (loss, acc, ppl)
        per batch.
        """
        loop = tqdm(dataset(), 'eval', num_steps, False,
                    disable=not master_process)
        model.eval()
        for batch in loop:
            with skip_error():
                loss, accuracy, ppl = forward_step(batch)
                loop.set_postfix(
                    OrderedDict(
                        loss=loss.item(), ppl=ppl,
                        acc=accuracy))
                yield loss.item(), accuracy, ppl

    def save_state(name):
        """ Saves the model and optimizer state. """
        model_path = join(model_dir, name + '.pt')
        state = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'best_valid_loss': best_valid_loss,
            'valid_loss': valid_loss,
            'epoch': epoch + 1,
            'step': step
        }
        logger.info('Saving model to {}'.format(model_path))
        # making sure the model saving is not left in a
        # corrupted state after a keyboard interrupt
        while True:
            try:
                torch.save(state, model_path)
                break
            except KeyboardInterrupt:
                pass

    if master_process:
        train_args = vars(args)
        logger.info(str(train_args))
        print()
        print(tabulate(train_args.items(), tablefmt='presto'))
        print()
    try:
        # initializing cuda buffer to avoid OOM errors
        dummy_batch = create_dummy_batch(
            args, ignore_idx=pad_idx)
        train_step(dummy_batch)
    except (RuntimeError, ValueError) as e:
        if 'out of memory' in str(e):
            msg = 'Not enough memory, there might ' + \
                'be several out of memory error during ' + \
                'training. To avoid this lower ' + \
                'the `--batch_size` or `--max_len`'
            if not args.grad_ckpt:
                msg += ', use the `--checkpointed` flag'
            if not APEX_INSTALLED:
                msg += ' or install apex for fp16 precision'
            logger.info(msg + '.')
            if args.distributed:
                return
    # creating table of history with correctly
    # arranged values for each header
    if master_process:
        table = list(zip(*[history[h] for h in headers]))
        print(tabulate(table, headers, floatfmt='.3f'))
    for epoch in range(init_epoch, args.max_epochs):
        # running training loop
        loop = tqdm(train_dataset(),
                    'train {}'.format(epoch),
                    num_train_steps, False,
                    disable=not master_process)
        train_metrics = defaultdict(list)
        model.train()
        for batch in loop:
            with skip_error():
                results = train_step(batch)
                loss = results['loss']
                if master_process and loss is not None:
                    # adding the results to history
                    # and logging them to tensorboard
                    for metric, value in results.items():
                        train_metrics[metric].append(value)
                        if value == float('inf'):
                            value = 1e30
                        writer.add_scalar(
                            'train/' + metric, value, step)
                loop.set_postfix(
                    OrderedDict(**results, skip=skip))
        train_metrics = {
            'train_' + metric: mean(values)
            if len(values) > 0 else 0.0
            for metric, values in train_metrics.items()
        }
        with torch.no_grad():
            valid_metrics = zip(
                *evaluate(
                    dataset=valid_dataset,
                    num_steps=num_valid_steps))
            valid_loss, valid_acc, valid_ppl = [
                mean(values) if len(values) > 0 else 0.0
                for values in valid_metrics
            ]
            # switching back to training
            model.train()
        if master_process:
            results = {'epoch': epoch}
            results.update(train_metrics)
            results.update({
                'valid_loss': valid_loss,
                'valid_acc': valid_acc,
                'valid_ppl': valid_ppl
            })
            record_history(results)
            print_results(results)
            # converting ppl to a large number so tensorboard
            # will not throw any warnings during training
            if valid_ppl == float('inf'):
                valid_ppl = 1e30
            # logging to tensorboard
            writer.add_scalar('val/loss', valid_loss, step)
            writer.add_scalar('val/acc', valid_acc, step)
            writer.add_scalar('val/ppl', valid_ppl, step)
        if master_process:
            save_state(name='last')
        if valid_loss < best_valid_loss:
            patience = 0
            best_valid_loss = valid_loss
            if master_process:
                save_state(name='best')
        else:
            patience += 1
            if patience == args.patience:
                # terminate when max patience
                # level is hit
                break
        if step == args.total_steps:
            break
    if master_process:
        writer.close()
    with torch.no_grad():
        test_metrics = zip(
            *evaluate(
                dataset=test_dataset,
                num_steps=num_test_steps))
        test_loss, test_acc, test_ppl = [
            mean(values) if len(values) > 0 else 0.0
            for values in test_metrics
        ]
    if master_process:
        logger.info('test loss: {:.4}'.format(test_loss))
# -*- coding: utf-8 -*-
"""Trains a classifier on an image-folder dataset (CIFAR-100 layout)."""
import pathlib

import numpy as np
import tensorflow as tf

from src.model import create_model
from src.tf_data import process_path, prepare_for_training

DATASET_ROOT = "C://Users//penny//git//dataset//cifar100//train"
# FIX: IMG_WIDTH / IMG_HEIGHT were referenced but never defined
# (NameError at runtime); CIFAR-100 images are 32x32 pixels.
# NOTE(review): confirm these match what process_path expects.
IMG_WIDTH = 32
IMG_HEIGHT = 32

if __name__ == '__main__':
    # 1. Define class names (one sub-directory per class)
    CLASS_NAMES = np.array(
        [item.name for item in pathlib.Path(DATASET_ROOT).glob('*')])
    # 2. Build train_dataset from the file listing
    files_ds = tf.data.Dataset.list_files(DATASET_ROOT + '/*/*')
    xy_ds = files_ds.map(
        lambda x: process_path(x, CLASS_NAMES, IMG_WIDTH, IMG_HEIGHT),
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    train_ds = prepare_for_training(xy_ds)
    model = create_model(n_classes=100, base_model_trainable=False)
    # FIX: `lr` is a deprecated alias of `learning_rate`
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
        loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
        metrics=['accuracy'])
    model.fit(train_ds, steps_per_epoch=100)
train_ids = all_ids[0:validation_split_point] validation_ids = all_ids[validation_split_point:test_split_point] test_ids = all_ids[test_split_point:dataset_size] title('Initialisation') log(f'TRAIN set size: {len(train_ids)}') log(f'VALIDATION set size: {len(validation_ids)}') log(f'TEST set size: {len(test_ids)}\n') # Data Generators training_generator = DataGenerator(available_ids=train_ids, **params) validation_generator = DataGenerator(available_ids=validation_ids, **params) test_generator = DataGenerator(available_ids=test_ids, **params) # Create keras model model = create_model(len(class_names)) epochs_count = len(training_generator) training_logger = TrainingLogger(epochs_count) # Print model architecture print(model.summary()) sys.stdout.flush() # Train model on dataset title('Training Model') model.fit_generator(generator=training_generator, validation_data=validation_generator, verbose=2, use_multiprocessing=(WORKERS > 0), workers=WORKERS, callbacks=[
def main(cfg):
    """ Performs training, validation and testing.

    Builds the datasets, model, optimizer and scheduler, then runs an
    Engine-based train/validation loop with checkpointing ranked by
    validation loss and a persistent history table.
    """
    assert isdir(cfg.data_dir), \
        '`data_dir` must be a valid path.'
    cfg.cuda = torch.cuda.is_available() \
        and not cfg.no_cuda
    # NOTE(review): assumes the launcher provides a per-run working
    # directory — confirm
    cfg.model_dir = os.getcwd()
    # setting random seed for reproducibility
    if cfg.seed:
        set_random_seed(cfg)
    device = torch.device('cuda' if cfg.cuda else 'cpu')
    os.makedirs(cfg.model_dir, exist_ok=True)
    label2id = create_label2id(cfg)
    cfg.num_labels = len(label2id)
    xlmr = create_pretrained(cfg.model_type, cfg.force_download)
    # creating dataset split loaders
    datasets = create_dataset(cfg, xlmr, label2id)
    train_dataset, valid_dataset = datasets

    def compute_loss(batch):
        """ Computes the forward pass and returns the
        cross entropy loss.
        """
        inputs, labels = [
            torch.from_numpy(tensor).to(device).long()
            for tensor in batch
        ]
        logits = model(inputs)
        logits = logits.view(-1, logits.size(-1))
        labels = labels.view(-1)
        # -1 labels mark ignored positions
        loss = torch.nn.functional.cross_entropy(
            logits, labels, ignore_index=-1)
        return loss

    def train_step(engine, batch):
        """ Propagates the inputs forward and updates
        the parameters.
        """
        step = engine.state.iteration
        model.train()
        loss = compute_loss(batch)
        backward(loss)
        if cfg.clip_grad_norm is not None:
            clip_grad_norm(cfg.clip_grad_norm)
        # optimizer/scheduler only advance on accumulation boundaries
        if step % cfg.grad_accum_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()
        # restoring the averaged loss across steps
        loss *= cfg.grad_accum_steps
        return loss.item()

    def eval_step(engine, batch):
        """ Propagates the inputs forward without
        storing any gradients.
        """
        model.eval()
        with torch.no_grad():
            loss = compute_loss(batch)
        return loss.item()

    def backward(loss):
        """ Backpropagates the loss in either mixed or
        normal precision mode.
        """
        if cfg.fp16:
            with amp.scale_loss(loss, optimizer) as sc:
                sc.backward()
        else:
            loss.backward()

    def clip_grad_norm(max_norm):
        """ Applies gradient clipping. """
        if cfg.fp16:
            params = amp.master_params(optimizer)
        else:
            params = model.parameters()
        torch.nn.utils.clip_grad_norm_(params, max_norm)

    trainer = Engine(train_step)
    validator = Engine(eval_step)
    # keep the 5 best checkpoints ranked by (negated) validation loss
    checkpoint = ModelCheckpoint(
        cfg.model_dir,
        cfg.model_type,
        n_saved=5,
        save_as_state_dict=True,
        score_function=lambda e: -e.state.metrics['loss'])
    last_ckpt_path = cfg.ckpt_path
    if last_ckpt_path is not None:
        msg = 'Loading state from {}'
        print(msg.format(basename(last_ckpt_path)))
        last_state = torch.load(
            last_ckpt_path, map_location=device)
    model = create_model(xlmr, len(label2id), cfg)
    model = model.to(device)
    # NOTE(review): drops the pretrained wrapper's model reference —
    # presumably to free memory once create_model has what it needs;
    # confirm create_model copies/retains the weights
    del xlmr.model
    optimizer = create_optimizer(cfg, model)
    scheduler = create_scheduler(
        cfg, optimizer, len(train_dataset))
    # using apex if required and loading its state
    if cfg.fp16:
        model, optimizer = amp.initialize(
            model, optimizer, opt_level='O2')
        if last_ckpt_path is not None and \
                'amp' in last_state:
            amp.load_state_dict(last_state['amp'])
    if last_ckpt_path is not None:
        model.load_state_dict(last_state['model'])
        optimizer.load_state_dict(last_state['optimizer'])
        scheduler.load_state_dict(last_state['scheduler'])
    checkpoint_dict = {
        'model': model,
        'optimizer': optimizer,
        'scheduler': scheduler
    }
    if cfg.fp16:
        checkpoint_dict['amp'] = amp
    # checkpoint after every completed validation run
    validator.add_event_handler(
        Events.COMPLETED, checkpoint, checkpoint_dict)
    metric = RunningAverage(output_transform=lambda x: x)
    metric.attach(trainer, 'loss')
    metric.attach(validator, 'loss')
    pbar = ProgressBar()
    pbar.attach(trainer, metric_names=['loss'])
    history_path = join(cfg.model_dir, 'history.json')
    history = collections.defaultdict(list)
    headers = ['epoch', 'train_loss', 'valid_loss']
    if exists(history_path):
        with open(history_path, 'r') as fh:
            history = json.load(fh)

    def record_history(results):
        """ Records the results to the history. """
        for header in headers:
            history[header].append(results[header])
        with open(history_path, 'w') as fh:
            json.dump(history, fh)

    @trainer.on(Events.EPOCH_COMPLETED)
    def print_results(engine):
        """ Logs the training results. """
        validator.run(valid_dataset)
        record_history({
            'epoch': engine.state.epoch,
            'train_loss': engine.state.metrics['loss'],
            'valid_loss': validator.state.metrics['loss']
        })
        data = list(zip(*[history[h] for h in headers]))
        table = tabulate(data, headers, floatfmt='.3f')
        # only print the newly added (last) row of the table
        print(table.split('\n')[-1])

    data = list(zip(*[history[h] for h in headers]))
    print()
    print(cfg.pretty())
    print()
    print('***** Running training *****')
    print()
    print(tabulate(data, headers, floatfmt='.3f'))
    trainer.run(train_dataset, cfg.max_epochs)
def main(cfg):
    """ Converts the model to onnx format.

    Loads the checkpointed model, exports it with dynamic
    batch/sequence axes and validates the result with the onnx
    checker.
    """
    cfg.cuda = not cfg.no_cuda and \
        torch.cuda.is_available()
    # the label map is stored next to the checkpoint by default
    model_dir = abspath(dirname(cfg.ckpt_path))
    output_dir = os.getcwd()
    device = torch.device('cuda' if cfg.cuda else 'cpu')
    os.makedirs(output_dir, exist_ok=True)
    labels_path = join(model_dir, 'labels.json') \
        if cfg.labels_path is None else \
        cfg.labels_path
    with open(labels_path, 'r') as fh:
        label2id = json.load(fh)
    xlmr = create_pretrained(cfg.model_type, cfg.force_download)
    # NOTE(review): encode_fn appears unused in this function
    encode_fn = functools.partial(
        encode_example, xlmr=xlmr, label2id=label2id)
    model = create_model(xlmr, len(label2id), cfg)
    model.to(device)
    state_dict = torch.load(cfg.ckpt_path, map_location=device)
    model.load_state_dict(state_dict['model'])
    model.eval()
    # a single concrete example traces the graph; the dynamic_axes
    # below keep batch size and sequence length flexible
    sample_input = xlmr.encode('Ez egy teszt')
    sample_input = sample_input[None, :].to(device)
    output_path = join(output_dir, cfg.model_type + '.onnx')
    torch.onnx.export(model,
                      sample_input,
                      output_path,
                      export_params=True,
                      do_constant_folding=True,
                      input_names=['input'],
                      output_names=['output'],
                      dynamic_axes={
                          'input': {
                              0: 'batch_size',
                              1: 'sequence'
                          },
                          'output': {
                              0: 'batch_size',
                              1: 'sequence'
                          }
                      },
                      verbose=True)
    print()
    print('***** Export *****')
    print()
    print('Model exported to {}.'.format(output_dir))
    print()
    onnx_model = onnx.load(output_path)
    # only works with onnx 1.5 for some reason
    # 1.6 produces segmentation fault error
    onnx.checker.check_model(onnx_model)
shear_range=0.15, horizontal_flip=True, vertical_flip=True, fill_mode="nearest") train_data = train_gen.flow_from_directory(TRAIN_PATH, target_size=(150, 150), batch_size=64) val_gen = ImageDataGenerator(rescale=1 / 255.0) val_data = val_gen.flow_from_directory(VAL_PATH, target_size=(150, 150), batch_size=64) model = create_model(inputShape=(150, 150, 3)) history = model.fit(train_data, validation_data=val_data, epochs=20) plt.plot(history.history['loss'], label='train loss') plt.plot(history.history['val_loss'], label='val loss') plt.legend() plt.show() plt.savefig('LossVal_loss') # plot the accuracy plt.plot(history.history['accuracy'], label='train acc') plt.plot(history.history['val_accuracy'], label='val acc') plt.legend() plt.show()
# animate import numpy as np import matplotlib.pyplot as plt import matplotlib.animation as animation from src.features import cmvn,read_mp3,plot_signal,remove_silence, logmelspectrograms, plot_spectrogram import librosa import scipy from src.model import create_model, cachedir filename = 'fr_example_china.mp3' model = create_model( num_freq_bins=40, num_labels=len(lang2target)) _ = model.load_weights(os.path.join(cachedir, "model", model.name)) def sig2logspec(signal): logmelspec = logmelspectrograms(signal.reshape(1,-1), 16000) logmelspec_smn = cmvn(logmelspec) return logmelspec_smn.numpy()[0] def generate_demo_vid(filename, model=model, window = 16000*4, jump = 512): sound, rate = librosa.load(filename,sr=None) print(rate) print(len(sound)) print((len(sound)-window)//jump) fig, axs = plt.subplots(2)
from keras.callbacks import TensorBoard, EarlyStopping from datetime import datetime from src.model import create_model from keras.optimizers import Adam from src.import_data import * import matplotlib.pyplot as plt BATCH_SIZE = 64 EPOCHS = 30 train_images, train_age_labels, train_gender_labels, test_images, test_age_labels, test_gender_labels = get_data( ) model = create_model(HEIGHT, WIDTH, 8) model.compile(optimizer=Adam(learning_rate=0.001), loss={ "age": "sparse_categorical_crossentropy", "gender": "binary_crossentropy" }, metrics={ "age": "accuracy", "gender": "accuracy" }) callbacks = [ EarlyStopping(monitor='val_loss', mode="min", verbose=1, patience=5), TensorBoard(log_dir=os.path.join( "..\\logs\\", str(datetime.now().strftime("%b_%d_%Y_%H_%M_%S"))), histogram_freq=1, profile_batch=0)
def main():
    """Interactive loop: read a line, decode a reply, print it."""
    args = setup_interact_args()
    args.distributed = False
    device = torch.device('cuda' if args.cuda else 'cpu')
    model_dir = join(args.model_dir, args.model_name)
    state_dict = torch.load(
        join(model_dir, 'model.pt'), map_location=device)
    _, tokenizer = create_dataset(args=args)
    vocab_size = len(tokenizer)
    model = create_model(args, vocab_size).to(device)
    model.load_state_dict(state_dict['model'])
    model.eval()
    history = []
    chooser = METHODS[args.method]
    special_ids = tokenizer.convert_tokens_to_ids(
        [SP1, SP2, tokenizer.bos_token, tokenizer.eos_token, HST, RSP])

    @torch.no_grad()
    def respond(text):
        """Append the text to the dialog history and decode a reply."""
        history.append(tokenizer.encode(text))
        inputs = transform_dialog(
            history[:args.max_hist], special_ids=special_ids)
        preds = decode(
            args=args, model=model, inputs=inputs,
            tokenizer=tokenizer, select_fn=chooser, device=device)
        history.append(preds)
        # drop the trailing end token before decoding to text
        return tokenizer.decode(preds[:-1])

    print('Type a sentence to translate. ' + 'CTRL + C to escape.')
    while True:
        try:
            print()
            output = respond(input())
            print(output)
            print()
        except KeyboardInterrupt:
            break