Example #1
def main():
    # Read the config file
    with open('config/default.yml') as fin:
        config = yaml.load(fin, Loader=yaml.SafeLoader)

    # `device` is presumably a module-level global in the original project;
    # define it here so the snippet is self-contained
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Build the train and valid datasets
    train_config = config['dataset']['train']
    train_df = pd.read_csv(train_config['data_path'], sep='\t')
    train_df = train_df.sample(frac=1)  # shuffle the rows
    train, valid = train_test_split(train_df,
                                    test_size=config['train_valid_split'])
    train_dataset = build_dataloader(train, train_config, device=device)
    valid_dataset = build_dataloader(valid, train_config, device=device)

    # Build the model
    model_config = config['model']
    model = BertClassifier(model_config)
    model.to(device)
    optimizer = build_optimizer(model, config['optimizer'])

    # Compute the number of training steps
    num_train_steps = int(
        len(train_dataset) / train_dataset.batch_size * config['num_epochs'])
    num_warmup_steps = int(num_train_steps *
                           config['optimizer']['warmup_proportion'])
    scheduler = build_scheduler(optimizer, num_train_steps, num_warmup_steps)

    # Train
    trainer.do_train(model,
                     train_loader=train_dataset,
                     valid_loader=valid_dataset,
                     optimizer=optimizer,
                     scheduler=scheduler,
                     cfg=config)
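
Each example calls a project-specific `build_dataloader`, and the implementations differ from repo to repo. For the call shape in Example #1 (a DataFrame plus a config dict), a minimal sketch might look like the following; the 'label' column and the 'batch_size' config key are assumptions for illustration, and a text model like Example #1's BertClassifier would tokenize instead of using raw numeric columns.

import torch
from torch.utils.data import DataLoader, TensorDataset

def build_dataloader(df, cfg, device='cpu'):
    # Minimal sketch: wrap a DataFrame with numeric feature columns and a
    # 'label' column into a batched DataLoader (assumed column/config names)
    features = torch.tensor(df.drop(columns=['label']).values,
                            dtype=torch.float32, device=device)
    labels = torch.tensor(df['label'].values, dtype=torch.long, device=device)
    return DataLoader(TensorDataset(features, labels),
                      batch_size=cfg['batch_size'], shuffle=True)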
Example #2
def main():
    logger = get_logger()

    global_config = config['Global']

    # Initialize the device
    use_gpu = global_config['use_gpu']
    if global_config['local_rank'] == -1 or not use_gpu:
        device = torch.device(
            "cuda" if torch.cuda.is_available() and use_gpu else "cpu")
        global_config.update(
            {'n_gpu': torch.cuda.device_count() if use_gpu else 1})
    else:
        torch.cuda.set_device(global_config['local_rank'])
        device = torch.device('cuda', global_config['local_rank'])
        dist.init_process_group(backend='nccl')
        global_config.update({'n_gpu': 1})
    global_config.update({'device': device})
    logger.warning(
        f"\n\tProcess Rank:{global_config['local_rank']} \n"
        f"\tDevice: {device}\n"
        f"\tGpus: {global_config['n_gpu']}\n"
        f"\tDistributed: {bool(global_config['local_rank'] != -1)}\n"
        f"\t16-bits training: {global_config['fp16']}")

    rank_id = global_config['local_rank']
    set_seed(global_config['seed'], use_gpu)

    # Block child processes; the steps below run on the main process only
    if not is_main_process(rank_id):
        dist.barrier()

    post_process = build_post_process(config['PostProcess'], global_config)

    # Build the model
    arch_config = config.pop('Architecture')
    if hasattr(post_process, 'character'):
        char_num = len(getattr(post_process, 'character'))
        arch_config["Head"]['out_channels'] = char_num
    logger.info(f"\nModel Info:" f"\n{json.dumps(arch_config, indent=4)}")
    model = build_model(arch_config)
    state_dict = torch.load(global_config['pretrained_model'])
    model.load_state_dict(state_dict)

    # Load the training data
    if global_config['local_rank'] == 0:
        dist.barrier()
    logger.info(f"\nLoad train Data:"
                f"\n{json.dumps(config['Train'], indent=4)}")
    train_dataloader = build_dataloader(config, logger, 'Train')

    logger.info(f"\nLoad Eval Data:"
                f"\n{json.dumps(config['Eval'], indent=4)}")
    eval_dataloader = build_dataloader(config, logger, 'Eval')
    if global_config['local_rank'] == 0:
        dist.barrier()

    model.to(device)
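
Example #2 uses the common rank-0-first idiom: non-main ranks wait at a barrier while rank 0 prepares the data, and rank 0 then hits a second barrier to release them. A standalone sketch of that idiom (the `prepare_data` callable is a placeholder):

import torch.distributed as dist

def rank0_first(local_rank, prepare_data):
    # Run prepare_data on rank 0 first; other ranks wait, then run it too
    distributed = dist.is_available() and dist.is_initialized()
    # Non-main ranks block here until rank 0 has finished preparing data
    if distributed and local_rank != 0:
        dist.barrier()
    data = prepare_data()
    # Rank 0 reaches this barrier last, releasing the waiting ranks
    if distributed and local_rank == 0:
        dist.barrier()
    return data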
Example #3
def train(cfg, local_rank, distributed):

    num_classes = COCODataset(cfg.data.train[0], cfg.data.train[1]).num_classes
    model = EfficientDet(num_classes=num_classes, model_name=cfg.model.name)
    inp_size = model.config['inp_size']
    device = torch.device(cfg.device)
    model.to(device)

    optimizer = build_optimizer(model, **optimizer_kwargs(cfg))
    lr_scheduler = build_lr_scheduler(optimizer, **lr_scheduler_kwargs(cfg))

    use_mixed_precision = cfg.dtype == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
            find_unused_parameters=True)

    arguments = {}
    arguments["iteration"] = 0
    output_dir = cfg.output_dir
    save_to_disk = comm.get_rank() == 0
    checkpointer = Checkpointer(model, optimizer, lr_scheduler, output_dir,
                                save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.model.resume)
    arguments.update(extra_checkpoint_data)

    train_dataloader = build_dataloader(cfg,
                                        inp_size,
                                        is_train=True,
                                        distributed=distributed,
                                        start_iter=arguments["iteration"])

    test_period = cfg.test.test_period
    if test_period > 0:
        val_dataloader = build_dataloader(cfg,
                                          inp_size,
                                          is_train=False,
                                          distributed=distributed)
    else:
        val_dataloader = None

    checkpoint_period = cfg.solver.checkpoint_period
    log_period = cfg.solver.log_period

    do_train(cfg, model, train_dataloader, val_dataloader, optimizer,
             lr_scheduler, checkpointer, device, checkpoint_period,
             test_period, log_period, arguments)

    return model
Example #4
def eval_run(run_dir, batch_size=128, epoch=None, verbose=True,
             tqdm_leave=True):

  cfg = load_config(run_dir)
  datasets_dir = config.get('DATASETS_DIR')  # module-level config, distinct from the per-run cfg
  trn_dl = build_dataloader(datasets_dir, cfg.ds,
      cfg.split, 'train', batch_size, shuffle=False)
  tst_dl = build_dataloader(datasets_dir, cfg.ds,
      cfg.split, 'test', batch_size, shuffle=False)

  if epoch is not None:
    print(f'Evaluating {cfg.run} at epoch {epoch}')
    model = load_model(run_dir, cfg, epoch)
    trn_loss, trn_acc = eval_subset(model, trn_dl)
    tst_loss, tst_acc = eval_subset(model, tst_dl)
    print(
      f'{cfg.run}'
      f' loss=({trn_loss:.2f},{tst_loss:.2f})'
      f' acc=({trn_acc:.2f},{tst_acc:.2f})'
    )
    return

  trn_dir = join(run_dir, 'etrn')
  tst_dir = join(run_dir, 'etst')
  if isdir(trn_dir):
    shutil.rmtree(trn_dir)
  if isdir(tst_dir):
    shutil.rmtree(tst_dir)
  trn_writer = tf.summary.create_file_writer(trn_dir)
  tst_writer = tf.summary.create_file_writer(tst_dir)

  if verbose:
    print(f'Evaluating {cfg.run}')
  best_acc, best_epoch = 0, 0
  for epoch in trange(cfg.epochs, leave=tqdm_leave):
    model = load_model(run_dir, cfg, epoch)
    trn_loss, trn_acc = eval_subset(model, trn_dl)
    tst_loss, tst_acc = eval_subset(model, tst_dl)
    with trn_writer.as_default():
      tf.summary.scalar(f'loss/{cfg.ds}', trn_loss, epoch)
      tf.summary.scalar(f'acc/{cfg.ds}', trn_acc, epoch)
    with tst_writer.as_default():
      tf.summary.scalar(f'loss/{cfg.ds}', tst_loss, epoch)
      tf.summary.scalar(f'acc/{cfg.ds}', tst_acc, epoch)
    if tst_acc > best_acc:
      best_acc, best_epoch = tst_acc, epoch

  firsts = ['run', 'ds', 'split']
  columns = [k for k in sorted(cfg.keys()) if k not in firsts]
  columns = firsts + ['acc', 'epoch'] + columns
  data = dict(cfg)
  data['acc'] = best_acc
  data['epoch'] = best_epoch
  df = pd.DataFrame(data, columns=columns, index=[0])
  df.to_csv(f'{run_dir}/results.csv')
  if verbose:
    print(df.head())
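
Example #4's `eval_subset` helper is not shown. Under the Keras metrics API it plausibly averages loss and accuracy over one pass of a dataloader, roughly like this sketch (the exact helper may differ):

import tensorflow as tf

def eval_subset(model, dl):
    # Accumulate mean loss and accuracy over one full pass of the dataloader
    loss_m = tf.keras.metrics.SparseCategoricalCrossentropy()
    acc_m = tf.keras.metrics.SparseCategoricalAccuracy()
    for x, y_true in dl:
        y_pred = model(x, training=False)
        loss_m.update_state(y_true, y_pred)
        acc_m.update_state(y_true, y_pred)
    return loss_m.result().numpy(), acc_m.result().numpy()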
Example #5
    def __init__(self, opt):
        '''
        opt is an argparse.Namespace
        '''
        self.opt = opt
        self.device = self.opt.device

        train_dataloader, test_dataloader, self.train_num, self.test_num = build_dataloader(
            self.device)
        self.dataloader = {'train': train_dataloader, 'test': test_dataloader}

        self.net = PredNet(img_shape=self.opt.shape,
                           num_masks=self.opt.num_masks,
                           is_robot_state_used=1,
                           iter_num=-1,
                           k=900,
                           device=self.device)
        self.net.to(self.device)

        print('Net has', sum(param.numel() for param in self.net.parameters()),
              'parameters...')

        self.mse_loss = torch.nn.MSELoss()
        self.w_state = 1e-4  # TODO problems

        if self.opt.pretrained_model_path:
            self.load_weight()

        self.optimizer = torch.optim.Adam(self.net.parameters(),
                                          self.opt.learning_rate)
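
The `load_weight` method referenced above is not shown; a plausible minimal version simply restores the network's state dict from the configured path (the checkpoint layout is an assumption):

    def load_weight(self):
        # Assumed layout: the checkpoint file is a plain state_dict for self.net
        state = torch.load(self.opt.pretrained_model_path,
                           map_location=self.device)
        self.net.load_state_dict(state)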
Example #6
    def __init__(self, name, full_name, datasets_dir, run_dir, cfg):
        self.name = name
        self.writer = tf.summary.create_file_writer(join(run_dir, name))
        self.tasks = []
        batch_size = cfg.train_batch // len(cfg._dss)
        Task = namedtuple('Task', ('name', 'dl', 'loss', 'acc'))
        if name == 'trn':
            transform = cfg.dss_augment
            sampling = cfg.dss_sampling
            # batch_size = cfg.train_tbatch
            shuffle = True
        else:
            transform = False
            sampling = 'fixed'
            # batch_size = cfg.train_ebatch
            shuffle = False
        for ds in cfg._dss:
            dl = build_dataloader(datasets_dir=datasets_dir,
                                  ds=ds.name,
                                  split=ds.split,
                                  subset=full_name,
                                  transform=transform,
                                  sampling=sampling,
                                  cache=cfg.dss_cache,
                                  batch_size=batch_size,
                                  shuffle=shuffle,
                                  num_workers=cfg.dss_num_workers)
            loss = tf.keras.metrics.SparseCategoricalCrossentropy()
            acc = tf.keras.metrics.SparseCategoricalAccuracy()
            self.tasks.append(Task(ds.name, dl, loss, acc))

        self.dls = [task.dl for task in self.tasks]
Example #7
def train(config, experiment_name=None):
    num_classes = config.MODEL.NUM_CLASSES

    # dataloader for training
    train_period = 'train'
    train_loader = build_dataloader(cfg=config,
                                    period=train_period,
                                    loader_type='train')
    val_loader = build_dataloader(cfg=config,
                                  period=train_period,
                                  loader_type='val')

    # prepare model
    model = build_model(cfg=config)

    print('The loss type is', config.MODEL.LOSS_TYPE)
    loss_func = build_loss(config, num_classes)
    optimizer = build_optimizer(config, model)

    # Resume from a self-trained model; the starting epoch is parsed from
    # the checkpoint filename (e.g. 'model_120.pth' -> 120)
    if config.MODEL.PRETRAIN_CHOICE == 'self':
        start_epoch = int(
            config.MODEL.PRETRAIN_PATH.split('/')[-1].split('.')[0].split('_')[-1])
        print('Start epoch:', start_epoch)
        path_to_optimizer = config.MODEL.PRETRAIN_PATH.replace(
            'model', 'optimizer')
        print('Path to the checkpoint of optimizer:', path_to_optimizer)
        model.load_state_dict(torch.load(config.MODEL.PRETRAIN_PATH))
        optimizer.load_state_dict(torch.load(path_to_optimizer))

    scheduler = WarmUpMultiStepLR(optimizer, config.SOLVER.STEPS,
                                  config.SOLVER.GAMMA,
                                  config.SOLVER.WARMUP_FACTOR,
                                  config.SOLVER.WARMUP_ITERS,
                                  config.SOLVER.WARMUP_METHOD)

    print('------------------ Start Training -------------------')
    do_train(config, model, train_loader, val_loader, optimizer, scheduler,
             loss_func, experiment_name)
    print('---------------- Training Completed ---------------- ')
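
WarmUpMultiStepLR is a project class, but the schedule it names is standard: warmup to the base learning rate, then step decay at the given milestones. A sketch via LambdaLR, assuming linear warmup (the project's class may differ in details):

from bisect import bisect_right
from torch.optim.lr_scheduler import LambdaLR

def warmup_multistep(optimizer, steps, gamma, warmup_factor, warmup_iters):
    # Linear warmup from warmup_factor up to 1, then multiply by gamma at
    # each milestone in `steps` (assumed semantics, not the project class)
    def factor(it):
        warmup = 1.0
        if it < warmup_iters:
            alpha = it / warmup_iters
            warmup = warmup_factor * (1 - alpha) + alpha
        return warmup * gamma ** bisect_right(sorted(steps), it)
    return LambdaLR(optimizer, factor)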
Example #8
def train(cfg):
    model_dir = join(config.get('RESULTS_DIR'), cfg.exp_name, cfg.run)
    print(f"Trainig {cfg.run}")
    cfg.save_params(model_dir)

    datasets_dir = config.get('DATASETS_DIR')
    trn_dl = build_dataloader(datasets_dir, cfg.ds, cfg.split, 'train',
                              cfg.tbatch_size)
    etrn_dl = build_dataloader(datasets_dir, cfg.ds, cfg.split, 'train',
                               cfg.ebatch_size)
    etst_dl = build_dataloader(datasets_dir, cfg.ds, cfg.split, 'test',
                               cfg.ebatch_size)

    num_classes = 51 if cfg.ds == 'hmdb51' else 101
    ModelClass = models.get_model_class(cfg.model)
    model = ModelClass(cfg, num_classes)

    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
    optimizer = tf.keras.optimizers.SGD(learning_rate=cfg.lr)
    trn_loss_epoch = tf.keras.metrics.SparseCategoricalCrossentropy()
    trn_acc_epoch = tf.keras.metrics.SparseCategoricalAccuracy()
    tst_loss_epoch = tf.keras.metrics.SparseCategoricalCrossentropy()
    tst_acc_epoch = tf.keras.metrics.SparseCategoricalAccuracy()

    trn_writer = tf.summary.create_file_writer(join(model_dir, 'trn'))
    tst_writer = tf.summary.create_file_writer(join(model_dir, 'tst'))

    trn_eval_step = (etrn_dl, trn_loss_epoch, trn_acc_epoch)
    tst_eval_step = (etst_dl, tst_loss_epoch, tst_acc_epoch)
    trn_eval_epoch = (trn_loss_epoch, trn_acc_epoch, trn_writer)
    tst_eval_epoch = (tst_loss_epoch, tst_acc_epoch, tst_writer)

    weights_dir = join(model_dir, 'weights')
    for epoch in trange(cfg.epochs):
        for x, y_true in trn_dl:
            train_step(x, y_true, model, loss_fn, optimizer)
            eval_step(model, trn_eval_step, tst_eval_step)
        eval_epoch(epoch, cfg.ds, trn_eval_epoch, tst_eval_epoch)
        model.save_weights(join(weights_dir, f'{epoch:03d}.ckpt'))
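
Example #8 relies on a `train_step` helper that is not shown; the standard tf.GradientTape version would look roughly like this (a sketch, not the original code):

import tensorflow as tf

@tf.function
def train_step(x, y_true, model, loss_fn, optimizer):
    # One optimization step under the standard GradientTape pattern
    with tf.GradientTape() as tape:
        y_pred = model(x, training=True)
        loss = loss_fn(y_true, y_pred)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss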
Example #9
def test(config, experiment_name=None):
    # dataloader for test
    test_period = 'test'
    test_loader = build_dataloader(cfg=config,
                                   period=test_period,
                                   loader_type='test')

    # prepare model
    model = build_model(cfg=config)
    model.load_param(config.TEST.WEIGHT)

    print('------------------ Start Test -------------------')
    do_test(config, model, test_loader, experiment_name)
    print('---------------- Inference Completed -----------------')
Example #10
def main():
    global_config = config['Global']

    use_gpu = global_config['use_gpu']
    n_gpus = 1
    device = torch.device('cpu')
    if use_gpu:
        if torch.cuda.is_available():
            n_gpus = torch.cuda.device_count()
            device = torch.device('cuda')
        else:
            logger.warning("未发现可用于计算的GPU设备")

    # Build the dataset
    config['Eval']['loader'].update({
        'batch_size':
        config['Eval']['loader']['batch_size_per_card'] * n_gpus
    })
    dataloader = build_dataloader(config, device, logger, 'Eval')
    batch_size = config['Eval']['loader']['batch_size']
    logger.info(f'Eval data: {len(dataloader)} batches of {batch_size} samples each')

    post_process_class = build_post_process(config['PostProcess'],
                                            global_config)
    if hasattr(post_process_class, 'character'):
        config['Architecture']["Head"]['out_channels'] = len(
            getattr(post_process_class, 'character'))
    model = build_model(config['Architecture'])

    # Load the pretrained model
    state_dict = torch.load(global_config['pretrained_model'],
                            map_location=torch.device('cpu'))
    model.load_state_dict(state_dict)
    model.to(device)

    eval_class = build_metric(config['Metric'])
    metric = train_utils.eval(model, dataloader, post_process_class,
                              eval_class, device)
    logger.info('metric eval ***************')
    for k, v in metric.items():
        logger.info('{}:{}'.format(k, v))
Example #11
def build_tasks_eval(datasets_dir, run_dir, batch_size, cfg):
    """Builds tasks evaluation object."""
    TasksEval = namedtuple('TasksEval', ('etrn', 'etst'))
    Subset = namedtuple('Subset', ('tasks', 'writer'))
    Task = namedtuple('Task', ('name', 'dl', 'loss', 'acc'))
    subsets = []
    for alias, name in zip(('etrn', 'etst'), ('train', 'test')):
        tasks = []
        for ds in cfg._dss:
            dl = build_dataloader(datasets_dir=datasets_dir,
                                  ds=ds.name,
                                  split=ds.split,
                                  subset=name,
                                  batch_size=batch_size,
                                  cache=True)
            loss = tf.keras.metrics.SparseCategoricalCrossentropy()
            acc = tf.keras.metrics.SparseCategoricalAccuracy()
            tasks.append(Task(ds.name, dl, loss, acc))
        writer = tf.summary.create_file_writer(join(run_dir, alias))
        subsets.append(Subset(tasks, writer))
    tasks_eval = TasksEval(*subsets)
    return tasks_eval
Example #12
    def __init__(self, opt):
        self.opt = opt
        self.device = self.opt.device

        train_dataloader, valid_dataloader = build_dataloader(opt)
        self.dataloader = {
            'train': train_dataloader,
            'valid': valid_dataloader
        }

        self.net = network(self.opt.channels, self.opt.height, self.opt.width,
                           -1, self.opt.schedsamp_k, self.opt.use_state,
                           self.opt.num_masks, self.opt.model == 'STP',
                           self.opt.model == 'CDNA', self.opt.model == 'DNA',
                           self.opt.context_frames)
        self.net.to(self.device)
        self.mse_loss = nn.MSELoss()
        self.w_state = 1e-4
        if self.opt.pretrained_model:
            self.load_weight()
        self.optimizer = torch.optim.Adam(self.net.parameters(),
                                          self.opt.learning_rate)
Example #13
try:
    # Assumed try-body: reuse the cached segments saved below if present
    # (the published snippet starts at the except-clause)
    segments = np.load('./data/segment.npy')
except FileNotFoundError:
    signals, labels = get_ecg(PATH, length=LENGTH)
    segments = np.zeros((245990, 1001))
    k = 0

    for i, record in enumerate(signals):
        rp = qrs_detection(record, sample_rate=FS)
        seg = get_segments(record, rp, labels[i])
        if seg is not None:
            segments[k:k + seg.shape[0], :] = seg
            k += seg.shape[0]
    del signals, labels

    np.save('./data/segment.npy', segments)

X, y = segments[:, :-1], segments[:, -1][:, np.newaxis]
del segments

train, test = build_dataloader(X, y, resamp=RESAMP, batch_size=BATCH_SIZE)
del X, y

net = cnn_feed_lstm()
try:
    params = torch.load("../params/net_0.81.pkl")
    net.load_state_dict(params["model_state_dict"])
except Exception:
    # No usable checkpoint; start from random initialization
    pass

loss, val_score = learn(net, train, test, lr=LR, epoch=EPOCH)
plot(loss, val_score)
Example #14
def main(config):
    os.environ['CUDA_VISIBLE_DEVICES'] = config.GPU

    if not config.EVAL_MODE:
        sys.stdout = Logger(osp.join(config.OUTPUT, 'log_train.txt'))
    else:
        sys.stdout = Logger(osp.join(config.OUTPUT, 'log_test.txt'))
    print("==========\nConfig:{}\n==========".format(config))
    print("Currently using GPU {}".format(config.GPU))
    # Set random seed
    set_seed(config.SEED)

    # Build dataloader
    trainloader, queryloader, galleryloader, num_classes = build_dataloader(
        config)
    # Build model
    model, classifier = build_model(config, num_classes)
    # Build classification and pairwise loss
    criterion_cla, criterion_pair = build_losses(config)
    # Build optimizer
    parameters = list(model.parameters()) + list(classifier.parameters())
    if config.TRAIN.OPTIMIZER.NAME == 'adam':
        optimizer = optim.Adam(
            parameters,
            lr=config.TRAIN.OPTIMIZER.LR,
            weight_decay=config.TRAIN.OPTIMIZER.WEIGHT_DECAY)
    elif config.TRAIN.OPTIMIZER.NAME == 'adamw':
        optimizer = optim.AdamW(
            parameters,
            lr=config.TRAIN.OPTIMIZER.LR,
            weight_decay=config.TRAIN.OPTIMIZER.WEIGHT_DECAY)
    elif config.TRAIN.OPTIMIZER.NAME == 'sgd':
        optimizer = optim.SGD(parameters,
                              lr=config.TRAIN.OPTIMIZER.LR,
                              momentum=0.9,
                              weight_decay=config.TRAIN.OPTIMIZER.WEIGHT_DECAY,
                              nesterov=True)
    else:
        raise KeyError("Unknown optimizer: {}".format(
            config.TRAIN.OPTIMIZER.NAME))
    # Build lr_scheduler
    scheduler = lr_scheduler.MultiStepLR(
        optimizer,
        milestones=config.TRAIN.LR_SCHEDULER.STEPSIZE,
        gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE)

    start_epoch = config.TRAIN.START_EPOCH
    if config.MODEL.RESUME:
        print("Loading checkpoint from '{}'".format(config.MODEL.RESUME))
        checkpoint = torch.load(config.MODEL.RESUME)
        model.load_state_dict(checkpoint['state_dict'])
        start_epoch = checkpoint['epoch']

    model = nn.DataParallel(model).cuda()
    classifier = nn.DataParallel(classifier).cuda()

    if config.EVAL_MODE:
        print("Evaluate only")
        test(model, queryloader, galleryloader)
        return

    start_time = time.time()
    train_time = 0
    best_rank1 = -np.inf
    best_epoch = 0
    print("==> Start training")
    for epoch in range(start_epoch, config.TRAIN.MAX_EPOCH):
        start_train_time = time.time()
        train(epoch, model, classifier, criterion_cla, criterion_pair,
              optimizer, trainloader)
        train_time += round(time.time() - start_train_time)

        if ((epoch+1) > config.TEST.START_EVAL and config.TEST.EVAL_STEP > 0 and
                (epoch+1) % config.TEST.EVAL_STEP == 0) or (epoch+1) == config.TRAIN.MAX_EPOCH:
            print("==> Test")
            rank1 = test(model, queryloader, galleryloader)
            is_best = rank1 > best_rank1
            if is_best:
                best_rank1 = rank1
                best_epoch = epoch + 1

            state_dict = model.module.state_dict()
            save_checkpoint(
                {
                    'state_dict': state_dict,
                    'rank1': rank1,
                    'epoch': epoch,
                }, is_best,
                osp.join(config.OUTPUT,
                         'checkpoint_ep' + str(epoch + 1) + '.pth.tar'))
        scheduler.step()

    print("==> Best Rank-1 {:.1%}, achieved at epoch {}".format(
        best_rank1, best_epoch))

    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    train_time = str(datetime.timedelta(seconds=train_time))
    print(
        "Finished. Total elapsed time (h:m:s): {}. Training time (h:m:s): {}.".
        format(elapsed, train_time))