Code Example #1
File: test_model.py Project: gtmeier/AI_Water
def test_create_model():
    # Verifying that create_model doesn't throw any errors
    model_binary = create_model("some_binary_model", ModelType.BINARY)
    model_masked = create_model("some_masked_model", ModelType.MASKED)

    assert model_binary.__asf_model_name == "some_binary_model"
    assert model_masked.__asf_model_name == "some_masked_model"
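This test (and Example #5 below) reads a custom __asf_model_name attribute off the returned model. A minimal sketch of what such a factory could look like, assuming Keras and a two-member ModelType enum; the real AI_Water implementation may differ:

from enum import Enum, auto
from tensorflow import keras

class ModelType(Enum):
    BINARY = auto()
    MASKED = auto()

def create_model(model_name: str, model_type: ModelType) -> keras.Model:
    # Hypothetical factory: model_type would select between the binary and
    # masked architectures; a tiny conv net stands in for both here.
    model = keras.Sequential([
        keras.layers.Conv2D(16, 3, activation='relu', padding='same',
                            input_shape=(512, 512, 1)),
        keras.layers.Conv2D(1, 1, activation='sigmoid'),
    ])
    # Assigned outside any class body, so no name mangling applies and the
    # test above can read model.__asf_model_name back directly.
    model.__asf_model_name = model_name
    return model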
Code Example #2
def __init__(self, project_parameters) -> None:
    self.project_parameters = project_parameters
    self.model = create_model(project_parameters=project_parameters).eval()
    if project_parameters.use_cuda:
        self.model = self.model.cuda()
    self.transform = get_transform_from_file(
        filepath=project_parameters.transform_config_path)['predict']
Code Example #3
File: cli.py Project: mzchifr/ml-production
def train_model(dpath, ppath, epoch, version):
    if dpath.endswith(".csv"):
        d = pd.read_csv(dpath)
    else:
        raise ValueError("data format is not supported")

    pipe = joblib.load(ppath)
    encoder = Encoder(pipe)
    x = encoder.encode(d.iloc[:, 1:-1])

    m = create_model([x.shape[1]])
    m.fit(x, d.iloc[:, -1], batch_size=1000, epochs=epoch)
    m.save(f"model/{version}")
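A plausible inference counterpart, not part of the project: it assumes create_model built a Keras model (so model/{version} is a directory loadable with keras.models.load_model), reuses the same joblib pipeline, and borrows Encoder from the snippet above; the column slicing is illustrative.

import joblib
import pandas as pd
from tensorflow import keras

def predict_model(dpath, ppath, version):
    # Hypothetical sketch mirroring the training-time preprocessing.
    d = pd.read_csv(dpath)
    encoder = Encoder(joblib.load(ppath))  # same Encoder as in train_model
    x = encoder.encode(d.iloc[:, 1:])      # no trailing label column at inference
    m = keras.models.load_model(f"model/{version}")
    return m.predict(x)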
Code Example #4
def train(project_parameters):
    seed_everything(seed=project_parameters.random_seed)
    if project_parameters.use_balance:
        project_parameters.data_weight = calculate_data_weight(
            classes=project_parameters.classes,
            data_path=project_parameters.data_path)
    data_module = DataModule(project_parameters=project_parameters)
    model = create_model(project_parameters=project_parameters)
    trainer = _get_trainer(project_parameters=project_parameters)
    trainer.fit(model=model, datamodule=data_module)
    result = {'trainer': trainer, 'model': model}
    trainer.callback_connector.configure_progress_bar().disable()
    for stage, data_loader in data_module.get_data_loaders().items():
        print('\ntest the {} dataset'.format(stage))
        print('the {} dataset confusion matrix:'.format(stage))
        result[stage] = trainer.test(test_dataloaders=data_loader)
    trainer.callback_connector.configure_progress_bar().enable()
    return result
Code Example #5
def train_wrapper(args: Namespace) -> None:
    """Function for training a network"""
    data_type = dataset_type(args.dataset)
    model_name = args.model
    if args.cont:
        model = load_model(model_name)
        history = model.__asf_model_history
    else:
        model_path = path_from_model_name(model_name)
        if not args.overwrite and os.path.isfile(model_path):
            print(f"File {model_name} already exists!")
            return

        model = create_model(model_name, data_type)
        history = {"loss": [], "acc": [], "val_loss": [], "val_acc": []}

    if model_type(model) != data_type:
        print("ERROR: This dataset is not compatible with your model")
        return

    train_model(model, history, args.dataset, args.epochs)
Code Example #6
File: eval.py Project: philip-dila/xlmr-finetuning
def main(cfg):
    """
    Performs evaluation.
    """
    cfg.cuda = torch.cuda.is_available()

    assert cfg.ckpt_path is not None, \
        'ckpt_path must be given'

    model_dir = abspath(dirname(cfg.ckpt_path))
    output_dir = os.getcwd()

    device = torch.device('cuda' if cfg.cuda else 'cpu')

    output_path = join(output_dir, 'results.ner')

    labels_path = join(model_dir, 'labels.json') \
        if cfg.labels_path is None else \
        cfg.labels_path

    with open(labels_path, 'r') as fh:
        label2id = json.load(fh)

    id2label = {v: k for k, v in label2id.items()}

    xlmr = create_pretrained(cfg.model_type, cfg.force_download)

    encode_fn = functools.partial(encode_example, xlmr=xlmr, label2id=label2id)

    decode_fn = functools.partial(decode_example, xlmr=xlmr, id2label=id2label)

    model = create_model(xlmr, len(label2id), cfg)
    model.to(device)

    state_dict = torch.load(cfg.ckpt_path, map_location=device)

    model.load_state_dict(state_dict['model'])
    model.eval()

    def to_list(tensor):
        """
        Converts the provided tensor to a python list.
        """
        return tensor.cpu().numpy().tolist()

    def to_torch(tensor):
        """
        Converts the provided tf array to torch
        tensor.
        """
        return torch.from_numpy(tensor.numpy()).to(device)

    pad_id = xlmr.task.dictionary.pad()
    dataset = create_jsonl_loader(cfg.batch_size, cfg.eval_data_path,
                                  encode_fn, pad_id)

    print()
    print('***** Running evaluation *****')
    print()

    results = []
    with torch.no_grad():
        for batch in tqdm(dataset, leave=False):
            input_ids, label_ids = batch

            input_ids = to_torch(input_ids).long()
            label_ids = to_torch(label_ids).long()

            logits = model(input_ids)

            pred_ids = logits.argmax(dim=-1)

            lists = zip(to_list(pred_ids), to_list(label_ids),
                        to_list(input_ids))

            for pred_list, label_list, token_list in lists:
                pred_list = [(pred if label != -1 else -1)
                             for pred, label in zip(pred_list, label_list)]

                tokens, labels = decode_fn(token_list, label_list)

                _, preds = decode_fn(token_list, pred_list)

                results.append((tokens, labels, preds))

    outputs = []
    for result in results:
        outputs.append('\n'.join('{} {} {}'.format(*values)
                                 for values in zip(*result)))

    with open(output_path, 'w') as fh:
        fh.write('\n\n'.join(outputs))

    command = '{} < {}'.format(join(PROJECT_DIR, 'scripts', 'conlleval'),
                               output_path)

    result = subprocess.check_output(command,
                                     shell=True,
                                     stderr=subprocess.STDOUT)

    result = result.decode('utf-8')

    print(result)

    stats_path = join(output_dir, 'results.txt')
    with open(stats_path, 'w') as fh:
        config_str = try_load_config(model_dir)
        if config_str is not None:
            print(yaml.dump(config_str), file=fh)
        print(result, file=fh)
Code Example #7
def main():
    args = setup_eval_args()

    args.distributed = False

    args.cuda = not args.no_cuda and \
        torch.cuda.is_available()

    if args.seed is not None:
        set_random_seed(args)

    device = torch.device('cuda' if args.cuda else 'cpu')

    assert args.name is not None, \
        '`--name` must be given'

    model_dir = join(args.model_dir, args.model, args.name)

    model_path = args.model_file if \
        args.model_file else \
        join(model_dir, args.ckpt_name + '.pt')

    state_dict = torch.load(model_path, map_location=device)

    del state_dict['optimizer']

    tokenizer = create_tokenizer(args)

    vocab_size = len(tokenizer)

    model = create_model(args, model_dir, vocab_size)
    model = model.to(device)

    try:
        model.load_state_dict(state_dict.pop('model'))
        model.eval()
    except RuntimeError as e:
        print('The provided checkpoint has mismatching '
              'weights in the parameter dict.')

        print('WARNING: If the model was trained with '
              '`--grad_ckpt` you also have to provide '
              'this argument for this script.')

        sys.exit()

    print()
    print(tabulate(state_dict.items(), tablefmt='presto'))
    print()

    history = []

    select_fn = METHODS[args.decoding]

    special_ids = tokenizer.convert_tokens_to_ids([
        SP1,
        SP2,
        tokenizer.bos_token,
        tokenizer.eos_token,
        HST,
        RSP,
    ])

    @torch.no_grad()
    def respond(text):
        """
        Responds to the given text.
        """
        history.append(tokenizer.encode(text))

        inputs = transform_dialog(history[-args.max_hist:],
                                  special_ids=special_ids,
                                  max_len=args.max_len)

        input_ids, type_ids = inputs
        inputs = [[input_ids], [type_ids]]

        preds = decode(args=args,
                       model=model,
                       inputs=inputs,
                       tokenizer=tokenizer,
                       select_fn=select_fn,
                       device=device)[0]

        history.append(preds)

        # last token is the end token
        return tokenizer.decode(preds)

    print('Type a sentence for response. CTRL + C to escape.')

    while True:
        try:
            print()
            text = input('User: ')
            output = respond(text)
            print('Bot: {}'.format(output))

        except KeyboardInterrupt:
            break
Code Example #8
def main():
    """
    Performs training, validation and testing.
    """
    args = setup_train_args()

    args.cuda = torch.cuda.is_available() \
        and not args.no_cuda

    model_dir = join(args.model_dir, args.model, args.name)

    os.makedirs(model_dir, exist_ok=True)

    logger = create_logger(model_dir=model_dir)

    if args.mixed and not APEX_INSTALLED:
        logger.warn('--mixed passed but apex is not installed.')

    args.mixed = args.mixed and APEX_INSTALLED \
        and args.cuda

    master_process = args.local_rank in [0, -1]
    args.distributed = args.local_rank > 0

    if args.distributed:
        # use distributed training if local rank is given
        # and GPU training is requested
        torch.cuda.set_device(args.local_rank)
        device = torch.device('cuda', args.local_rank)

        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://',
                                             rank=args.local_rank)

    else:
        device = torch.device('cuda' if args.cuda else 'cpu')

    # creating dataset and storing dataset splits
    # as individual variables for convenience
    datasets, tokenizer = create_dataset(args=args,
                                         master_process=master_process)

    pad_idx = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
    vocab_size = len(tokenizer)

    # TODO fix xlnet nan with mixed precision
    if 'xlnet' in args.model:
        args.mixed = False

    model = create_model(args=args, model_dir=model_dir, vocab_size=vocab_size)

    model = model.to(device)

    optimizer = create_optimizer(args=args, parameters=model.parameters())

    if master_process:
        writer = SummaryWriter(logdir=model_dir, flush_secs=100)

    # loading previous state of the training
    best_val_loss, init_epoch, step = load_state(model_dir=model_dir,
                                                 model=model,
                                                 optimizer=optimizer,
                                                 logger=logger,
                                                 device=device)

    if args.mixed:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    if args.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank)

    world_size = int(os.environ.get('WORLD_SIZE', 1))

    train, valid, test = [(split, ceil(size / args.batch_size / world_size))
                          for split, size in datasets]

    # computing the sizes of the dataset splits
    train_dataset, num_train_steps = train
    valid_dataset, num_valid_steps = valid
    test_dataset, num_test_steps = test

    patience, skip, loss, acc = 0, 0, 0, 0

    def reduce_tensor(tensor):
        """
        Averages a tensor across gpus.
        """
        reduced = tensor.clone()
        all_reduce(reduced, op=ReduceOp.SUM)
        reduced /= world_size

        return reduced

    def forward_step(batch):
        """
        Applies forward pass with the given batch.
        """
        inputs, targets = batch

        outputs = model(inputs=inputs, half=args.mixed)

        # converting targets from ndarray
        targets = torch.as_tensor(targets)
        targets = targets.long().to(device)

        loss, accuracy = compute_loss(outputs=outputs,
                                      targets=targets,
                                      ignore_idx=pad_idx)

        if args.distributed:
            # reducing accuracy accross devices
            # for more accurate logging
            accuracy = reduce_tensor(accuracy)

        return loss, accuracy.item()

    def train_step(batch):
        """
        Performs a single step of training.
        """
        nonlocal step, skip

        loss, accuracy = forward_step(batch)

        if torch.isnan(loss).item():
            logger.debug('skipping step (nan)')
            # returning None values when a NaN loss
            # is encountered and skipping backprop
            # so model grads will not be corrupted
            skip += 1
            return None, None

        loss /= args.grad_accum_steps

        backward(loss)
        clip_grad_norm(1.0)

        step += 1

        if step % args.grad_accum_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        if args.distributed:
            # reducing loss accross devices for
            # more accurate logging
            loss = reduce_tensor(loss)

        return loss.item(), accuracy

    def backward(loss):
        """
        Backpropagates the loss in either mixed or
        normal precision mode.
        """
        # cuda is required for mixed precision training.
        if args.mixed:
            with amp.scale_loss(loss, optimizer) as scaled:
                scaled.backward()
        else:
            loss.backward()

    def clip_grad_norm(max_norm):
        """
        Applies gradient clipping.
        """
        if args.mixed:
            clip_grad_norm_(amp.master_params(optimizer), max_norm)
        else:
            clip_grad_norm_(model.parameters(), max_norm)

    def evaluate(dataset, num_steps):
        """
        Constructs a validation loader and evaluates
        the model.
        """
        loop = tqdm(dataset(),
                    total=num_steps,
                    disable=not master_process,
                    desc='Eval')

        model.eval()

        for batch in loop:
            loss, acc = forward_step(batch)

            loop.set_postfix(
                ordered_dict=OrderedDict(loss=loss.item(), acc=acc))

            yield loss.item()

    def save_state():
        """
        Saves the model and optimizer state.
        """
        model_path = join(model_dir, 'model.pt')

        state = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'val_loss': best_val_loss,
            'epoch': epoch + 1,
            'step': step
        }

        logger.info('Saving model to {}'.format(model_path))
        # making sure the model saving is not left in a
        # corrupted state after a keyboard interrupt
        while True:
            try:
                torch.save(state, model_path)
                break
            except KeyboardInterrupt:
                pass

    scheduler = LambdaLR(optimizer, compute_lr)

    if master_process:
        logger.info(str(vars(args)))

    for epoch in range(init_epoch, args.max_epochs):
        # running training loop
        loop = tqdm(train_dataset(),
                    total=num_train_steps,
                    disable=not master_process,
                    desc='Train {}'.format(epoch))

        train_loss = []

        model.train()

        for batch in loop:
            try:
                loss, acc = train_step(batch)

                if master_process and loss is not None:
                    train_loss.append(loss)

                    # logging to tensorboard
                    writer.add_scalar('train/loss', loss, step)
                    writer.add_scalar('train/acc', acc, step)

                if not step % args.eval_every_step:
                    with torch.no_grad():
                        val_loss = mean(
                            evaluate(dataset=valid_dataset,
                                     num_steps=num_valid_steps))

                    # switching back to training
                    model.train()

                    if master_process:
                        logger.info('val loss: {:.4}'.format(val_loss))

                        # logging to tensorboard
                        writer.add_scalar('val/loss', val_loss, step)

                    if val_loss < best_val_loss:
                        patience = 0
                        best_val_loss = val_loss

                        if master_process:
                            save_state()

                    else:
                        patience += 1
                        if patience == args.patience:
                            # terminate when max patience
                            # level is hit
                            break

            except RuntimeError as e:
                if 'out of memory' in str(e):
                    logger.debug('skipping step (oom)')
                    skip += 1

            loop.set_postfix(
                ordered_dict=OrderedDict(loss=loss, acc=acc, skip=skip))

        if len(train_loss) > 0:
            train_loss = mean(train_loss)
        else:
            train_loss = 0.0

        if master_process:
            logger.info('train loss: {:.4}'.format(train_loss))

        scheduler.step()

    if master_process:
        writer.close()

    with torch.no_grad():
        test_loss = mean(
            evaluate(dataset=test_dataset, num_steps=num_test_steps))

    if master_process:
        logger.info('test loss: {:.4}'.format(test_loss))
Code Example #9
def main():
    """
    Performs training, validation and testing.
    """
    args = setup_train_args()

    if args.notebook:
        from tqdm import tqdm_notebook as tqdm
    else:
        from tqdm import tqdm

    # if config is provided, then load it
    if args.config is not None:
        with open(args.config, 'r') as fh:
            config = json.load(fh)

        for arg in config:
            setattr(args, arg, config[arg])

    args.cuda = torch.cuda.is_available() \
        and not args.no_cuda

    # setting random seed for reproducibility
    if args.seed:
        set_random_seed(args)

    model_dir = join(args.model_dir, args.model, args.name)

    os.makedirs(model_dir, exist_ok=True)
    logger = create_logger(model_dir=model_dir)

    if args.fp16 and not APEX_INSTALLED:
        logger.warn('--fp16 passed but apex is not installed.')

    args.fp16 = args.fp16 and APEX_INSTALLED \
        and args.cuda

    master_process = args.local_rank in [0, -1]
    args.distributed = args.local_rank != -1

    if args.distributed:
        # use distributed training if local rank is given
        # and GPU training is requested
        torch.cuda.set_device(args.local_rank)
        device = torch.device('cuda', args.local_rank)

        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://',
                                             rank=args.local_rank)

    else:
        device = torch.device('cuda' if args.cuda else 'cpu')

    # creating dataset and storing dataset splits
    # as individual variables for convenience

    if args.distributed:
        # creating the dataset and model only on
        # a single process ( downloading )
        if master_process:
            _, tokenizer, _ = create_dataset(args, master_process)

            vocab_size = len(tokenizer)

            create_model(args, model_dir, vocab_size)

        # other threads are waiting for the data init
        barrier()

    datasets, tokenizer, max_len = create_dataset(
        args=args, master_process=master_process)

    pad_idx = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
    vocab_size = len(tokenizer)

    model = create_model(args, model_dir, vocab_size)
    model = model.to(device)

    # TODO fix xlnet nan with mixed precision
    if 'xlnet' in args.model:
        args.fp16 = False

    optimizer = create_optimizer(args=args, parameters=model.parameters())

    if master_process:
        writer = SummaryWriter(logdir=model_dir, flush_secs=100)

    # loading previous state of the training
    best_valid_loss, init_epoch, step = load_state(model_dir=model_dir,
                                                   model=model,
                                                   optimizer=optimizer,
                                                   logger=logger,
                                                   device=device)

    if args.fp16:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    d_model = model.config.d_model if 'xlnet' in \
        args.model else model.config.n_embd

    if args.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank)

    world_size = int(os.environ.get('WORLD_SIZE', 1))

    train, valid, test = [(split, ceil(size / args.batch_size / world_size))
                          for split, size in datasets]

    # computing the sizes of the dataset splits
    train_dataset, num_train_steps = train
    valid_dataset, num_valid_steps = valid
    test_dataset, num_test_steps = test

    patience, skip, loss, accuracy = 0, 1, 0, 0

    set_lr_fn = partial(set_lr,
                        optimizer=optimizer,
                        schedule=args.schedule,
                        lr=args.lr,
                        warmup_steps=args.warmup_steps,
                        d_model=d_model)

    if master_process:
        # loading history for training logs
        history_path = join(model_dir, 'history.json')

        history = defaultdict(list)

        # NOTE the hardcoded values to keep track of
        # in the history
        metrics = ['loss', 'acc', 'ppl']
        headers = ['epoch'] + \
            ['train_' + m for m in metrics] + \
            ['valid_' + m for m in metrics]

        if exists(history_path):
            with open(history_path, 'r') as fh:
                history = json.load(fh)

    def print_results(results):
        """
        Prints the history to the standard output.
        """
        data = list(zip(*[history[h] for h in headers]))

        table = tabulate(tabular_data=data, headers=headers, floatfmt='.3f')

        # computing the tabular table string and
        # printing only the last element
        print(table.split('\n')[-1])

        msg = ', '.join('{}: {}'.format(n, r) for n, r in results.items())

        logger.info(msg)

    def record_history(results):
        """
        Records the results and prints them.
        """
        # saving history and handling unexpected
        # keyboard interrupt
        for header in headers:
            history[header].append(results[header])

        while True:
            try:
                with open(history_path, 'w') as fh:
                    json.dump(history, fh)
                break
            except KeyboardInterrupt:
                pass

    @contextmanager
    def skip_error():
        """
        Convenience function for skipping errors.
        """
        nonlocal skip

        try:
            # checking out of memory error and
            # proceeding if only a single GPU
            # is used for the training
            yield

        except RuntimeError as e:
            if 'out of memory' in str(e):
                if args.distributed:
                    raise e
                skip += 1

    def reduce_tensor(tensor):
        """
        Averages a tensor across gpus.
        """
        reduced = tensor.clone()
        all_reduce(reduced, op=ReduceOp.SUM)
        reduced /= world_size

        return reduced

    def forward_step(batch):
        """
        Applies forward pass with the given batch.
        """
        inputs, targets = batch

        outputs = model(inputs, half=args.fp16)

        # converting targets from ndarray
        targets = torch.as_tensor(targets)
        targets = targets.long().to(device)

        loss, acc, ppl = compute_loss(outputs=outputs,
                                      targets=targets,
                                      ignore_idx=pad_idx)

        if args.distributed:
            # reducing accuracy accross devices
            # for more accurate logging
            acc = reduce_tensor(acc)

        return loss, acc.item(), ppl

    def train_step(batch):
        """
        Performs a single step of training.
        """
        nonlocal step, skip

        loss, acc, ppl = forward_step(batch)

        if torch.isnan(loss).item():
            # during distributed training NaN
            # values are not handled
            if args.distributed:
                raise ValueError('NaN values encountered.')

            logger.debug('skipping step (nan)')
            # returning None values when a NaN loss
            # is encountered and skipping backprop
            # so model grads will not be corrupted

            skip += 1
            # a dict of Nones keeps the caller's results['loss'] lookup valid
            return {'loss': None, 'acc': None, 'ppl': None}

        loss /= args.grad_accum_steps

        backward(loss)

        if args.clip_grad is not None:
            clip_grad_norm(args.clip_grad)

        if step % args.grad_accum_steps == 0:
            set_lr_fn(step)
            optimizer.step()
            optimizer.zero_grad()

        if args.distributed:
            # reducing loss accross devices for
            # more accurate logging
            loss = reduce_tensor(loss)

        step += 1

        return {'loss': loss.item(), 'acc': acc, 'ppl': ppl}

    def backward(loss):
        """
        Backpropagates the loss in either mixed or
        normal precision mode.
        """
        # cuda is required for mixed precision training.
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled:
                scaled.backward()
        else:
            loss.backward()

    def clip_grad_norm(max_norm):
        """
        Applies gradient clipping.
        """
        if args.fp16:
            clip_grad_norm_(amp.master_params(optimizer), max_norm)
        else:
            clip_grad_norm_(model.parameters(), max_norm)

    def evaluate(dataset, num_steps):
        """
        Constructs a validation loader and evaluates
        the model.
        """
        loop = tqdm(dataset(),
                    'eval',
                    num_steps,
                    False,
                    disable=not master_process)

        model.eval()

        for batch in loop:
            with skip_error():
                loss, accuracy, ppl = forward_step(batch)

                loop.set_postfix(
                    OrderedDict(loss=loss.item(), ppl=ppl, acc=accuracy))

                yield loss.item(), accuracy, ppl

    def save_state(name):
        """
        Saves the model and optimizer state.
        """
        model_path = join(model_dir, name + '.pt')

        state = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'best_valid_loss': best_valid_loss,
            'valid_loss': valid_loss,
            'epoch': epoch + 1,
            'step': step
        }

        logger.info('Saving model to {}'.format(model_path))
        # making sure the model saving is not left in a
        # corrupted state after a keyboard interrupt
        while True:
            try:
                torch.save(state, model_path)
                break
            except KeyboardInterrupt:
                pass

    if master_process:
        train_args = vars(args)
        logger.info(str(train_args))

        print()
        print(tabulate(train_args.items(), tablefmt='presto'))
        print()

    try:
        # initializing cuda buffer to avoid OOM errors
        dummy_batch = create_dummy_batch(args, ignore_idx=pad_idx)

        train_step(dummy_batch)

    except (RuntimeError, ValueError) as e:
        if 'out of memory' in str(e):
            msg = 'Not enough memory, there might ' + \
                'be several out of memory error during ' + \
                'training. To avoid this lower ' + \
                'the `--batch_size` or `--max_len`'

            if not args.grad_ckpt:
                msg += ', use the `--checkpointed` flag'

            if not APEX_INSTALLED:
                msg += ' or install apex for fp16 precision'

            logger.info(msg + '.')

        if args.distributed:
            return

    # creating table of history with correctly
    # arranged values for each header
    if master_process:
        table = list(zip(*[history[h] for h in headers]))
        print(tabulate(table, headers, floatfmt='.3f'))

    for epoch in range(init_epoch, args.max_epochs):
        # running training loop
        loop = tqdm(train_dataset(),
                    'train {}'.format(epoch),
                    num_train_steps,
                    False,
                    disable=not master_process)

        train_metrics = defaultdict(list)

        model.train()

        for batch in loop:
            with skip_error():
                results = train_step(batch)

                loss = results['loss']
                if master_process and loss is not None:
                    # adding the results to history
                    # and logging them to tensorboard
                    for metric, value in results.items():
                        train_metrics[metric].append(value)

                        if value == float('inf'):
                            value = 1e30

                        writer.add_scalar('train/' + metric, value, step)

                loop.set_postfix(OrderedDict(**results, skip=skip))

        train_metrics = {
            'train_' + metric: mean(values) if len(values) > 0 else 0.0
            for metric, values in train_metrics.items()
        }

        with torch.no_grad():
            valid_metrics = zip(
                *evaluate(dataset=valid_dataset, num_steps=num_valid_steps))

        valid_loss, valid_acc, valid_ppl = [
            mean(values) if len(values) > 0 else 0.0
            for values in valid_metrics
        ]

        # switching back to training
        model.train()

        if master_process:
            results = {'epoch': epoch}

            results.update(train_metrics)

            results.update({
                'valid_loss': valid_loss,
                'valid_acc': valid_acc,
                'valid_ppl': valid_ppl
            })

            record_history(results)
            print_results(results)

            # converting ppl to a large number so tensorboard
            # will not throw any warnings during training
            if valid_ppl == float('inf'):
                valid_ppl = 1e30

            # logging to tensorboard
            writer.add_scalar('val/loss', valid_loss, step)
            writer.add_scalar('val/acc', valid_acc, step)
            writer.add_scalar('val/ppl', valid_ppl, step)

        if master_process:
            save_state(name='last')

        if valid_loss < best_valid_loss:
            patience = 0
            best_valid_loss = valid_loss

            if master_process:
                save_state(name='best')

        else:
            patience += 1
            if patience == args.patience:
                # terminate when max patience
                # level is hit
                break

        if step == args.total_steps:
            break

    if master_process:
        writer.close()

    with torch.no_grad():
        test_metrics = zip(
            *evaluate(dataset=test_dataset, num_steps=num_test_steps))

    test_loss, test_acc, test_ppl = [
        mean(values) if len(values) > 0 else 0.0 for values in test_metrics
    ]

    if master_process:
        logger.info('test loss: {:.4}'.format(test_loss))
Code Example #10
# -*- coding: utf-8 -*-

import pathlib

import numpy as np
import tensorflow as tf

from src.model import create_model
from src.tf_data import process_path, prepare_for_training

DATASET_ROOT = "C://Users//penny//git//dataset//cifar100//train"
# Not defined in the original snippet; 32x32 is the native CIFAR-100 size.
IMG_WIDTH, IMG_HEIGHT = 32, 32

if __name__ == '__main__':
    # 1. Define class names
    CLASS_NAMES = np.array(
        [item.name for item in pathlib.Path(DATASET_ROOT).glob('*')])

    # 2. Build train_dataset
    files_ds = tf.data.Dataset.list_files(DATASET_ROOT + '/*/*')
    xy_ds = files_ds.map(
        lambda x: process_path(x, CLASS_NAMES, IMG_WIDTH, IMG_HEIGHT),
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    train_ds = prepare_for_training(xy_ds)

    model = create_model(n_classes=100, base_model_trainable=False)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(lr=0.01),
        loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
        metrics=['accuracy'])
    model.fit(train_ds, steps_per_epoch=100)
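Because base_model_trainable=False freezes the backbone, a common follow-up, not in the original script, is a short fine-tuning pass: unfreeze the model and recompile with a much lower learning rate before training again.

    # Hypothetical fine-tuning pass, assuming a standard Keras model.
    model.trainable = True
    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-4),  # far lower LR than the head-training phase
        loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
        metrics=['accuracy'])
    model.fit(train_ds, steps_per_epoch=100)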
Code Example #11
train_ids = all_ids[0:validation_split_point]
validation_ids = all_ids[validation_split_point:test_split_point]
test_ids = all_ids[test_split_point:dataset_size]

title('Initialisation')
log(f'TRAIN set size: {len(train_ids)}')
log(f'VALIDATION set size: {len(validation_ids)}')
log(f'TEST set size: {len(test_ids)}\n')

# Data Generators
training_generator = DataGenerator(available_ids=train_ids, **params)
validation_generator = DataGenerator(available_ids=validation_ids, **params)
test_generator = DataGenerator(available_ids=test_ids, **params)

# Create keras model
model = create_model(len(class_names))
epochs_count = len(training_generator)
training_logger = TrainingLogger(epochs_count)

# Print model architecture
print(model.summary())
sys.stdout.flush()

# Train model on dataset
title('Training Model')
model.fit_generator(generator=training_generator,
                    validation_data=validation_generator,
                    verbose=2,
                    use_multiprocessing=(WORKERS > 0),
                    workers=WORKERS,
                    callbacks=[
Code Example #12
File: train.py Project: philip-dila/xlmr-finetuning
def main(cfg):
    """
    Performs training, validation and testing.
    """
    assert isdir(cfg.data_dir), \
        '`data_dir` must be a valid path.'

    cfg.cuda = torch.cuda.is_available() \
        and not cfg.no_cuda

    cfg.model_dir = os.getcwd()

    # setting random seed for reproducibility
    if cfg.seed: set_random_seed(cfg)

    device = torch.device('cuda' if cfg.cuda else 'cpu')

    os.makedirs(cfg.model_dir, exist_ok=True)

    label2id = create_label2id(cfg)
    cfg.num_labels = len(label2id)

    xlmr = create_pretrained(cfg.model_type, cfg.force_download)

    # creating dataset split loaders
    datasets = create_dataset(cfg, xlmr, label2id)

    train_dataset, valid_dataset = datasets

    def compute_loss(batch):
        """
        Computes the forward pass and returns the
        cross entropy loss.
        """
        inputs, labels = [
            torch.from_numpy(tensor).to(device).long() for tensor in batch
        ]

        logits = model(inputs)

        logits = logits.view(-1, logits.size(-1))
        labels = labels.view(-1)

        loss = torch.nn.functional.cross_entropy(logits,
                                                 labels,
                                                 ignore_index=-1)

        return loss

    def train_step(engine, batch):
        """
        Propagates the inputs forward and updates
        the parameters.
        """
        step = engine.state.iteration

        model.train()

        loss = compute_loss(batch)

        backward(loss)

        if cfg.clip_grad_norm is not None:
            clip_grad_norm(cfg.clip_grad_norm)

        if step % cfg.grad_accum_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

        # restoring the averaged loss across steps
        loss *= cfg.grad_accum_steps

        return loss.item()

    def eval_step(engine, batch):
        """
        Propagates the inputs forward without
        storing any gradients.
        """
        model.eval()

        with torch.no_grad():
            loss = compute_loss(batch)

        return loss.item()

    def backward(loss):
        """
        Backpropagates the loss in either mixed or
        normal precision mode.
        """
        if cfg.fp16:
            with amp.scale_loss(loss, optimizer) as sc:
                sc.backward()

        else:
            loss.backward()

    def clip_grad_norm(max_norm):
        """
        Applies gradient clipping.
        """
        if cfg.fp16:
            params = amp.master_params(optimizer)
        else:
            params = model.parameters()

        torch.nn.utils.clip_grad_norm_(params, max_norm)

    trainer = Engine(train_step)
    validator = Engine(eval_step)

    checkpoint = ModelCheckpoint(
        cfg.model_dir,
        cfg.model_type,
        n_saved=5,
        save_as_state_dict=True,
        score_function=lambda e: -e.state.metrics['loss'])

    last_ckpt_path = cfg.ckpt_path

    if last_ckpt_path is not None:
        msg = 'Loading state from {}'
        print(msg.format(basename(last_ckpt_path)))

        last_state = torch.load(last_ckpt_path, map_location=device)

    model = create_model(xlmr, len(label2id), cfg)
    model = model.to(device)

    del xlmr.model

    optimizer = create_optimizer(cfg, model)

    scheduler = create_scheduler(cfg, optimizer, len(train_dataset))

    # using apex if required and loading its state
    if cfg.fp16:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

        if last_ckpt_path is not None and \
                'amp' in last_state:
            amp.load_state_dict(last_state['amp'])

    if last_ckpt_path is not None:
        model.load_state_dict(last_state['model'])
        optimizer.load_state_dict(last_state['optimizer'])
        scheduler.load_state_dict(last_state['scheduler'])

    checkpoint_dict = {
        'model': model,
        'optimizer': optimizer,
        'scheduler': scheduler
    }

    if cfg.fp16: checkpoint_dict['amp'] = amp

    validator.add_event_handler(Events.COMPLETED, checkpoint, checkpoint_dict)

    metric = RunningAverage(output_transform=lambda x: x)
    metric.attach(trainer, 'loss')
    metric.attach(validator, 'loss')

    pbar = ProgressBar()
    pbar.attach(trainer, metric_names=['loss'])

    history_path = join(cfg.model_dir, 'history.json')
    history = collections.defaultdict(list)
    headers = ['epoch', 'train_loss', 'valid_loss']

    if exists(history_path):
        with open(history_path, 'r') as fh:
            history = json.load(fh)

    def record_history(results):
        """
        Records the results to the history.
        """
        for header in headers:
            history[header].append(results[header])

        with open(history_path, 'w') as fh:
            json.dump(history, fh)

    @trainer.on(Events.EPOCH_COMPLETED)
    def print_results(engine):
        """
        Logs the training results.
        """
        validator.run(valid_dataset)

        record_history({
            'epoch': engine.state.epoch,
            'train_loss': engine.state.metrics['loss'],
            'valid_loss': validator.state.metrics['loss']
        })

        data = list(zip(*[history[h] for h in headers]))
        table = tabulate(data, headers, floatfmt='.3f')

        print(table.split('\n')[-1])

    data = list(zip(*[history[h] for h in headers]))

    print()
    print(cfg.pretty())

    print()
    print('***** Running training *****')

    print()
    print(tabulate(data, headers, floatfmt='.3f'))

    trainer.run(train_dataset, cfg.max_epochs)
Code Example #13
def main(cfg):
    """
    Converts the model to onnx format.
    """
    cfg.cuda = not cfg.no_cuda and \
        torch.cuda.is_available()

    model_dir = abspath(dirname(cfg.ckpt_path))
    output_dir = os.getcwd()

    device = torch.device('cuda' if cfg.cuda else 'cpu')

    os.makedirs(output_dir, exist_ok=True)

    labels_path = join(model_dir, 'labels.json') \
        if cfg.labels_path is None else \
        cfg.labels_path

    with open(labels_path, 'r') as fh:
        label2id = json.load(fh)

    xlmr = create_pretrained(cfg.model_type, cfg.force_download)

    encode_fn = functools.partial(encode_example, xlmr=xlmr, label2id=label2id)

    model = create_model(xlmr, len(label2id), cfg)
    model.to(device)

    state_dict = torch.load(cfg.ckpt_path, map_location=device)

    model.load_state_dict(state_dict['model'])
    model.eval()

    sample_input = xlmr.encode('Ez egy teszt')
    sample_input = sample_input[None, :].to(device)

    output_path = join(output_dir, cfg.model_type + '.onnx')

    torch.onnx.export(model,
                      sample_input,
                      output_path,
                      export_params=True,
                      do_constant_folding=True,
                      input_names=['input'],
                      output_names=['output'],
                      dynamic_axes={
                          'input': {
                              0: 'batch_size',
                              1: 'sequence'
                          },
                          'output': {
                              0: 'batch_size',
                              1: 'sequence'
                          }
                      },
                      verbose=True)

    print()
    print('***** Export *****')
    print()

    print('Model exported to {}.'.format(output_dir))
    print()

    onnx_model = onnx.load(output_path)
    # only works with onnx 1.5 for some reason
    # 1.6 produces segmentation fault error
    onnx.checker.check_model(onnx_model)
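A quick sanity check on the exported graph, not part of the original script, is to compare it against the PyTorch model with onnxruntime, assuming that package is installed and reusing output_path, sample_input, and model from main above; the tolerances are illustrative.

import numpy as np
import onnxruntime as ort

session = ort.InferenceSession(output_path)

# Run the same sample through both models and compare the logits.
(onnx_logits,) = session.run(None, {'input': sample_input.cpu().numpy()})

with torch.no_grad():
    torch_logits = model(sample_input).cpu().numpy()

np.testing.assert_allclose(torch_logits, onnx_logits, rtol=1e-3, atol=1e-5)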
Code Example #14
    # The snippet begins mid-call; earlier augmentation arguments are elided.
    train_gen = ImageDataGenerator(rescale=1 / 255.0,
                                   shear_range=0.15,
                                   horizontal_flip=True,
                                   vertical_flip=True,
                                   fill_mode="nearest")

    train_data = train_gen.flow_from_directory(TRAIN_PATH,
                                               target_size=(150, 150),
                                               batch_size=64)

    val_gen = ImageDataGenerator(rescale=1 / 255.0)

    val_data = val_gen.flow_from_directory(VAL_PATH,
                                           target_size=(150, 150),
                                           batch_size=64)

    model = create_model(inputShape=(150, 150, 3))
    history = model.fit(train_data,
                        validation_data=val_data,
                        epochs=20)

    plt.plot(history.history['loss'], label='train loss')
    plt.plot(history.history['val_loss'], label='val loss')
    plt.legend()
    # save before showing: once the plt.show() window closes, the current
    # figure is discarded and a later savefig would write an empty image
    plt.savefig('LossVal_loss')
    plt.show()

    # plot the accuracy
    plt.plot(history.history['accuracy'], label='train acc')
    plt.plot(history.history['val_accuracy'], label='val acc')
    plt.legend()
    plt.show()
Code Example #15
File: demo.py Project: marcalph/slip
# animate
import os

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import librosa
import scipy
from src.features import (cmvn, read_mp3, plot_signal, remove_silence,
                          logmelspectrograms, plot_spectrogram)
from src.model import create_model, cachedir


filename = 'fr_example_china.mp3'


# lang2target (language -> class index mapping) is defined elsewhere in the project
model = create_model(
        num_freq_bins=40,
        num_labels=len(lang2target))
_ = model.load_weights(os.path.join(cachedir, "model", model.name))

def sig2logspec(signal):
    logmelspec = logmelspectrograms(signal.reshape(1,-1), 16000)
    logmelspec_smn = cmvn(logmelspec)
    return logmelspec_smn.numpy()[0]


def generate_demo_vid(filename, model=model, window = 16000*4, jump = 512):
    sound, rate = librosa.load(filename,sr=None)
    print(rate)
    print(len(sound))
    print((len(sound)-window)//jump)
    fig, axs = plt.subplots(2)
Code Example #16
import os
from datetime import datetime

import matplotlib.pyplot as plt
from keras.callbacks import TensorBoard, EarlyStopping
from keras.optimizers import Adam

from src.import_data import *  # provides get_data, HEIGHT, WIDTH
from src.model import create_model

BATCH_SIZE = 64
EPOCHS = 30

train_images, train_age_labels, train_gender_labels, test_images, test_age_labels, test_gender_labels = get_data(
)

model = create_model(HEIGHT, WIDTH, 8)

model.compile(optimizer=Adam(learning_rate=0.001),
              loss={
                  "age": "sparse_categorical_crossentropy",
                  "gender": "binary_crossentropy"
              },
              metrics={
                  "age": "accuracy",
                  "gender": "accuracy"
              })

callbacks = [
    EarlyStopping(monitor='val_loss', mode="min", verbose=1, patience=5),
    TensorBoard(log_dir=os.path.join(
        "..\\logs\\", str(datetime.now().strftime("%b_%d_%Y_%H_%M_%S"))),
                histogram_freq=1,
                profile_batch=0)
Code Example #17
def main():
    args = setup_interact_args()
    args.distributed = False

    device = torch.device('cuda' if args.cuda else 'cpu')

    model_dir = join(args.model_dir, args.model_name)

    state_dict = torch.load(join(model_dir, 'model.pt'), map_location=device)

    _, tokenizer = create_dataset(args=args)

    vocab_size = len(tokenizer)

    model = create_model(args, vocab_size)
    model = model.to(device)

    model.load_state_dict(state_dict['model'])
    model.eval()

    history = []

    select_fn = METHODS[args.method]

    special_ids = tokenizer.convert_tokens_to_ids([
        SP1,
        SP2,
        tokenizer.bos_token,
        tokenizer.eos_token,
        HST,
        RSP,
    ])

    @torch.no_grad()
    def respond(text):
        """
        Responds to the given text.
        """
        history.append(tokenizer.encode(text))

        inputs = transform_dialog(history[:args.max_hist],
                                  special_ids=special_ids)

        preds = decode(args=args,
                       model=model,
                       inputs=inputs,
                       tokenizer=tokenizer,
                       select_fn=select_fn,
                       device=device)

        history.append(preds)

        # last token is the end token
        return tokenizer.decode(preds[:-1])

    print('Type a sentence to translate. CTRL + C to escape.')

    while True:
        try:
            print()
            text = input()
            output = respond(text)
            print(output)
            print()

        except KeyboardInterrupt:
            break