def train(net, loss_func, optimizer, train_data, valid_data,
          n_class, device, model_name, epochs=20):
    """
    Run the full training loop: for every epoch, train, checkpoint, validate.

    :param net: network to train
    :param loss_func: loss function
    :param optimizer: optimizer
    :param train_data: training data loader
    :param valid_data: validation data loader
    :param n_class: number of classes
    :param device: torch.device, CPU or GPU
    :param model_name: base name used when saving model weights
    :param epochs: number of epochs to run
    :return: None
    """
    for epoch in range(1, epochs + 1):
        get_logger().info('Epoch: {:02d}'.format(epoch))
        # One epoch of training.
        _epoch_train(net, loss_func, optimizer, train_data, n_class, device, epoch)
        # Weights are saved after every epoch; log where they went.
        get_logger().info(save_weight(net, model_name, epoch))
        # One epoch of validation.
        _epoch_valid(net, loss_func, valid_data, n_class, device, epoch)
def _epoch_valid(net, loss_func, data, n_class, device, i_epoch):
    """
    Run one validation epoch.

    :param net: network under validation
    :param loss_func: loss function
    :param data: validation data loader
    :param n_class: number of classes
    :param device: torch.device CPU or GPU
    :param i_epoch: 1-based epoch index, used only for the progress-bar label
    :return: (mean loss, mIoU of the epoch-wide confusion matrix,
              mean of per-batch mIoU values)
    """
    net.to(device)
    net.eval()  # switch to evaluation mode

    total_loss = 0.  # accumulated loss over the whole validation epoch
    total_cm = np.zeros((n_class, n_class))  # ndarray, epoch-wide confusion matrix
    total_batch_miou = 0.  # running sum of per-batch mIoU

    with torch.no_grad():  # no gradients needed during validation; saves memory
        bar_format = '{desc}{postfix}|{n_fmt}/{total_fmt}|{percentage:3.0f}%|{bar}|{elapsed}<{remaining}'
        # {desc}{progress %}[{current/total}{elapsed<remaining}{custom postfix}]
        tqdm_data = tqdm(data,
                         ncols=120,  # fixed bar width; required on linux, otherwise terminal default 80
                         bar_format=bar_format,  # progress-bar layout
                         desc='Epoch {:02d} Valid'.format(i_epoch))  # the {desc} part of the bar
        for i_batch, (im, lb) in enumerate(tqdm_data, start=1):
            im = im.to(device)  # [N,C,H,W] tensor, one validation batch of images
            lb = lb.to(device)  # [N,H,W] tensor, one validation batch of labels

            output = net(im)  # [N,C,H,W] tensor, forward pass for this batch
            loss = loss_func(output, lb.type(torch.long))  # loss for this batch
            batch_loss = loss.detach().item()  # detach for safety before reading the scalar
            total_loss += batch_loss  # accumulate batch loss

            # No backpropagation during validation.
            pred = torch.argmax(F.softmax(output, dim=1), dim=1)  # [N,H,W] tensor, dense prediction
            batch_cm = get_confusion_matrix(pred.cpu().numpy(),
                                            lb.cpu().numpy(),
                                            n_class)  # confusion matrix for this batch
            total_cm += batch_cm
            batch_miou = get_metrics(batch_cm, metrics='mean_iou')
            total_batch_miou += batch_miou

            tqdm_str = 'Loss={:.4f}|mIoU={:.4f}|bat_mIoU={:.4f}'  # progress-bar postfix
            tqdm_data.set_postfix_str(
                tqdm_str.format(total_loss / i_batch,
                                get_metrics(total_cm, metrics='mean_iou'),
                                total_batch_miou / i_batch))
            pass
        total_loss /= len(data)  # mean loss over the validation epoch
        mean_iou = get_metrics(total_cm, metrics='mean_iou')  # float, epoch-wide mIoU
        total_batch_miou /= len(data)

        # Log validation results.
        log_str = ('Valid Loss: {:.4f}|'
                   'Valid mIoU: {:.4f}|'
                   'Valid bat_mIoU: {:.4f}')
        log_str = log_str.format(total_loss, mean_iou, total_batch_miou)
        get_logger().info(log_str)
        return total_loss, mean_iou, total_batch_miou
# Example #3
# 0
def main():
    """Parse CLI args, build the runner and loaders, then dispatch on mode."""
    args = tools.get_args(parser)
    config = tools.get_config(args)
    tools.init(config)
    tb_logger, logger = tools.get_logger(config)
    tools.check_dist_init(config, logger)

    checkpoint = tools.get_checkpoint(config)
    runner = tools.get_model(config, checkpoint)
    loaders = tools.get_data_loader(config)

    if dist.is_master():
        logger.info(config)

    mode = args.mode
    if mode == 'train':
        train(config, runner, loaders, checkpoint, tb_logger)
    elif mode == 'evaluate':
        evaluate(runner, loaders)
    elif mode == 'calc_flops':
        # FLOPs / parameter counts are reported from the master rank only.
        if dist.is_master():
            logger.info('flops: {}'.format(
                tools.get_model_flops(config, runner.get_model())))
    elif mode == 'calc_params':
        if dist.is_master():
            logger.info('params: {}'.format(
                tools.get_model_parameters(runner.get_model())))
    else:
        # Any other mode samples a sub-model; this requires a checkpoint.
        assert checkpoint is not None
        from models.dmcp.utils import sample_model
        sample_model(config, runner.get_model())

    if dist.is_master():
        logger.info('Done')
def get_model(model_type, in_channels, n_class, device, load_weight=None):
    """
    Build a segmentation network, move it to the device, optionally load weights.

    :param model_type: architecture identifier
    :param in_channels: number of input image channels
    :param n_class: number of classes
    :param device: torch.device GPU or CPU
    :param load_weight: absolute path to an existing weight file; loaded when it exists
    :return: the constructed model
    """
    # DeepLabV3+ variants differ only in their backbone.
    deeplab_backbones = {
        'deeplabv3p_resnet50': 'resnet50',
        'deeplabv3p_resnet101': 'resnet101',
        'deeplabv3p_xception': 'xception',
    }
    if model_type in deeplab_backbones:
        model = DeepLabV3P(deeplab_backbones[model_type], in_channels, n_class)
    elif model_type == 'fcn8s':
        model = FCN8s(n_class)
    elif model_type == 'unet_resnet152':
        raise NotImplementedError
    else:
        raise ValueError('model name error!')
    get_logger().info('-' * 32 + str(model_type) + '-' * 32)

    model.to(device)

    if load_weight is None:
        get_logger().info('Load weight is not specified!')
    elif os.path.exists(load_weight):
        # A trained checkpoint exists on disk: load it onto the target device.
        get_logger().info(load_weight + ' exists! loading...')
        model.load_state_dict(torch.load(load_weight, map_location=device))
    else:
        get_logger().info(load_weight + ' can not be found!')
    return model
def test(net, data, device, resize_to=256, n_class=8, compare=False):
    """
    Run the test loop over a dataset and report mIoU.

    :param net: network under test
    :param data: test dataset, yielding (PIL image, PIL label) pairs
    :param device: torch.device GPU or CPU
    :param resize_to: edge size the cropped pair is resized to before inference
    :param n_class: number of classes
    :param compare: whether to also save side-by-side comparison figures
    :return: mIoU over the whole test set
    """
    net.to(device)
    net.eval()  # switch to evaluation mode
    total_cm = np.zeros((n_class, n_class))  # confusion matrix over the whole test run
    total_batch_miou = 0.  # running sum of per-image mIoU

    offset = 690  # crop away the top `offset` rows, keeping 690x3384
    pair_crop = PairCrop(offsets=(offset, None))  # crop to 690x3384
    pair_resize = PairResize(size=resize_to)
    pair_norm_to_tensor = PairNormalizeToTensor(norm=True)  # normalize and standardize

    with torch.no_grad():  # no gradients needed during testing; saves memory
        bar_format = '{desc}{postfix}|{n_fmt}/{total_fmt}|{percentage:3.0f}%|{bar}|{elapsed}<{remaining}'
        # {desc}{progress %}[{current/total}{elapsed<remaining}{custom postfix}]
        tqdm_data = tqdm(data, ncols=120, bar_format=bar_format, desc='Test')
        for i_batch, (im, lb) in enumerate(tqdm_data, start=1):
            im_t, lb_t = pair_crop(im, lb)  # PIL Image, PIL Image
            im_t, lb_t = pair_resize(im_t, lb_t)  # PIL Image, PIL Image
            im_t, lb_t = pair_norm_to_tensor(im_t,
                                             lb_t)  # [C,H,W] tensor, [H,W] tensor

            im_t = im_t.to(device)  # [C,H,W] tensor moved to the device
            im_t = im_t.unsqueeze(0)  # expand to [N,C,H,W] tensor
            output = net(im_t)  # model output, [N,C,H,W] tensor
            pred = torch.argmax(F.softmax(output, dim=1),
                                dim=1)  # [N,H,W] tensor

            pred = pred.unsqueeze(
                1)  # [N,C,H,W] tensor; F.interpolate expects [N,C,H,W]
            pred = pred.type(
                torch.float
            )  # cast to float; F.interpolate only supports float, not int/long
            pred = F.interpolate(pred,
                                 size=(lb.size[1] - offset, lb.size[0]),
                                 mode='nearest')  # upsample pred with nearest interpolation
            pred = pred.type(torch.uint8)  # cast back to an integer type
            pred = pred.squeeze(0).squeeze(0)  # [H,W] tensor
            pred = pred.cpu().numpy()  # [H,W] ndarray

            supplement = np.zeros((offset, lb.size[0]),
                                  dtype=np.uint8)  # [H,W] ndarray, pad as background
            pred = np.append(
                supplement, pred,
                axis=0)  # final prediction, [H,W] ndarray; concat along H to restore the cropped 690x3384
            batch_cm = get_confusion_matrix(pred, lb, n_class)  # confusion matrix for this image
            total_cm += batch_cm  # accumulate

            # BUGFIX: per-image mIoU was previously computed only inside the
            # `if compare:` branch, so with compare=False the displayed and
            # returned bat_mIoU average stayed at 0. Compute it for every image.
            batch_miou = get_metrics(batch_cm,
                                     metrics='mean_iou')  # mIoU of this image
            total_batch_miou += batch_miou

            if compare:  # render a 2x2 comparison figure
                fontsize = 16  # figure text size
                fig, ax = plt.subplots(2, 2, figsize=(20, 15))  # canvas
                ax = ax.flatten()

                ax[0].imshow(im)  # top-left: input image
                ax[0].set_title('Input Image', fontsize=fontsize)  # title

                ax[1].imshow(LaneSegDataset.decode_rgb(
                    np.asarray(lb)))  # top-right: ground truth
                ax[1].set_title('Grand Truth', fontsize=fontsize)  # title

                fig.suptitle('mIoU:{:.4f}'.format(batch_miou),
                             fontsize=fontsize)  # use this image's mIoU as the figure title

                mask = (pred != 0).astype(
                    np.uint8) * 255  # [H,W] ndarray, mask for alpha compositing

                pred = LaneSegDataset.decode_rgb(pred)  # [H,W,C=3] ndarray RGB
                ax[3].imshow(pred)  # bottom-right: prediction
                ax[3].set_title('Pred', fontsize=fontsize)  # title

                mask = mask[..., np.newaxis]  # [H,W,C=1] ndarray
                pred = np.append(pred, mask,
                                 axis=2)  # [H,W,C=4] ndarray, RGB+alpha becomes RGBA

                im = im.convert('RGBA')
                pred = Image.fromarray(pred).convert('RGBA')
                im_comp = Image.alpha_composite(im, pred)  # alpha compositing
                ax[2].imshow(im_comp)  # bottom-left: prediction overlaid on input
                ax[2].set_title('Pred over Input', fontsize=fontsize)  # title

                plt.subplots_adjust(left=0.01,
                                    bottom=0.01,
                                    right=0.99,
                                    top=0.99,
                                    wspace=0.01,
                                    hspace=0.01)  # tighten subplot margins
                # NOTE(review): output directory is hard-coded; consider a parameter.
                plt.savefig('/home/mist/imfolder/pred-{:s}.jpg'.format(
                    now_str()))  # save the figure
                plt.close(fig)
            tqdm_str = 'mIoU={:.4f}|bat_mIoU={:.4f}'  # progress-bar postfix
            tqdm_data.set_postfix_str(
                tqdm_str.format(get_metrics(total_cm, metrics='mean_iou'),
                                total_batch_miou / i_batch))
        mean_iou = get_metrics(total_cm, metrics='mean_iou')  # mIoU over the whole test run
        total_batch_miou /= len(data)

        logger = get_logger()
        msg = ('Test mIoU : {:.4f}|'
               'Test bat_mIoU : {:.4f}').format(mean_iou, total_batch_miou)
        logger.info(msg)
        return mean_iou
# Example #6
# 0
# Silence TensorFlow INFO/WARNING output.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

if __name__ == '__main__':
    # Read configures
    conf = cp.ConfigParser()
    # Project configures: the [default] section selects the recommender.
    conf.read('./CleverRec.properties', encoding='utf-8')
    configs = dict(conf.items('default'))
    recommender = configs['recommender']
    # Model configures: overlay the recommender-specific properties file.
    conf.read(os.path.join('./conf/', recommender + '.properties'),
              encoding='utf-8')
    configs.update(dict(conf.items('parameters')))

    # Get logger
    logger = get_logger(configs['log.dir'], recommender)
    logger.info('=' * 100)
    logger.info('Current model: %s' % recommender)

    # Read and preprocess data
    if configs['model_type'] == 'ranking':
        data = RankingPreprocess(configs, logger)
    else:
        data = RatingPreprocess(configs, logger)

    # tf settings
    # NOTE(review): ConfigParser values are strings, so even "False" is truthy
    # here — confirm whether this should use conf.getboolean instead.
    if configs['gpu.is_gpu']:
        os.environ['CUDA_VISIBLE_DEVICES'] = configs['gpu.id']
        tf_conf = tf.ConfigProto()
        # tf_conf is configured here; its use (e.g. in a Session) is not
        # visible in this chunk — presumably consumed further down the file.
        tf_conf.gpu_options.per_process_gpu_memory_fraction = float(
            configs['gpu.mem_frac'])
def _epoch_train(net, loss_func, optimizer, data, n_class, device, i_epoch):
    """
    Run one training epoch.

    :param net: network to train
    :param loss_func: loss function
    :param optimizer: optimizer
    :param data: training data loader
    :param n_class: number of classes
    :param device: torch.device CPU or GPU
    :param i_epoch: 1-based epoch index, used only for the progress-bar label
    :return: (mean loss, mIoU of the epoch-wide confusion matrix,
              mean of per-batch mIoU values)
    """
    net.to(device)
    net.train()  # switch to training mode

    total_loss = 0.  # accumulated loss over the whole training epoch
    total_cm = np.zeros((n_class, n_class))  # ndarray, epoch-wide confusion matrix
    total_batch_miou = 0.  # running sum of per-batch mIoU

    bar_format = '{desc}{postfix}|{n_fmt}/{total_fmt}|{percentage:3.0f}%|{bar}|{elapsed}<{remaining}'
    # {desc}{progress %}[{current/total}{elapsed<remaining}{custom postfix}]
    tqdm_data = tqdm(data,
                     ncols=120,  # fixed bar width; required on linux, otherwise terminal default 80
                     bar_format=bar_format,  # progress-bar layout
                     desc='Epoch {:02d} Train'.format(i_epoch))  # the {desc} part of the bar
    for i_batch, (im, lb) in enumerate(tqdm_data, start=1):
        im = im.to(device)  # [N,C,H,W] tensor, one training batch of images
        lb = lb.to(device)  # [N,H,W] tensor, one training batch of labels

        optimizer.zero_grad()  # clear gradients

        output = net(im)  # [N,C,H,W] tensor, forward pass for this batch

        loss = loss_func(output, lb.type(torch.long))  # loss for this batch
        batch_loss = loss.detach().item()  # gradients exist while training; detach before reading
        total_loss += batch_loss  # accumulate batch loss

        loss.backward()  # backpropagation
        optimizer.step()  # optimizer update

        pred = torch.argmax(F.softmax(output, dim=1), dim=1)  # [N,H,W] tensor, dense prediction (drops the C dim)
        batch_cm = get_confusion_matrix(pred.cpu().numpy(),
                                        lb.cpu().numpy(),
                                        n_class)  # confusion matrix for this batch
        total_cm += batch_cm
        batch_miou = get_metrics(batch_cm, metrics='mean_iou')
        total_batch_miou += batch_miou

        tqdm_str = 'Loss={:.4f}|mIoU={:.4f}|bat_mIoU={:.4f}'  # progress-bar postfix
        tqdm_data.set_postfix_str(
            tqdm_str.format(total_loss / i_batch,
                            get_metrics(total_cm, metrics='mean_iou'),
                            total_batch_miou / i_batch))
        pass
    total_loss /= len(data)  # float, mean loss over the epoch
    mean_iou = get_metrics(total_cm, metrics='mean_iou')  # float, epoch-wide mIoU
    total_batch_miou /= len(data)  # mean of the per-batch mIoU values

    # Log training results.
    log_str = ('Train Loss: {:.4f}|'
               'Train mIoU: {:.4f}|'
               'Train bat_mIoU: {:.4f}')
    log_str = log_str.format(total_loss, mean_iou, total_batch_miou)
    get_logger().info(log_str)
    return total_loss, mean_iou, total_batch_miou
# Example #8
# 0
def worker(gpu, ngpus_per_node, args_in):
    """
    Per-process entry point for SLURM-launched distributed training.

    Sets up a rank-specific logger, initializes the process group, resolves
    dataset-specific dimensions, then runs train/test (and optionally predict)
    for ``args.itr`` experiment repetitions.

    :param gpu: GPU index assigned to this process; None means "all GPUs"
    :param ngpus_per_node: GPUs per node, used to derive the global rank
    :param args_in: parsed arguments; deep-copied so mutations stay local
    """
    # init
    args = copy.deepcopy(args_in)
    jobid = os.environ["SLURM_JOBID"]
    procid = int(os.environ["SLURM_PROCID"])
    args.gpu = gpu

    # One log file per (job, proc, gpu); "all" when no specific GPU is set.
    if args.gpu is not None:
        logger_name = "{}.{}-{:d}-{:d}.search.log".format(
            args.name, jobid, procid, gpu)
    else:
        logger_name = "{}.{}-{:d}-all.search.log".format(
            args.name, jobid, procid)

    logger = tools.get_logger(os.path.join(args.path, logger_name))

    # With env:// rendezvous and no explicit rank, take it from the environment.
    if args.dist_url == "env://" and args.rank == -1:
        args.rank = int(os.environ["RANK"])

    if args.mp_dist:
        # For multiprocessing distributed training, rank needs to be the
        # global rank among all the processes
        args.rank = args.rank * ngpus_per_node + gpu

    args.print_params(logger.info)

    # get cuda device
    device = torch.device('cuda', gpu)

    # begin
    logger.info("Logger is set - training start")

    logger.info('back:{}, dist_url:{}, world_size:{}, rank:{}'.format(
        args.dist_backend, args.dist_url, args.world_size, args.rank))
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.dist_url,
                            world_size=args.world_size,
                            rank=args.rank)

    # Dataset registry: csv file name, target column 'T', and per feature-mode
    # (M/S/MS) the [enc_in, dec_in, c_out] dimensions unpacked below.
    data_parser = {
        'ETTh1': {
            'data': 'ETTh1.csv',
            'T': 'OT',
            'M': [7, 7, 7],
            'S': [1, 1, 1],
            'MS': [7, 7, 1]
        },
        'ETTh2': {
            'data': 'ETTh2.csv',
            'T': 'OT',
            'M': [7, 7, 7],
            'S': [1, 1, 1],
            'MS': [7, 7, 1]
        },
        'ETTm1': {
            'data': 'ETTm1.csv',
            'T': 'OT',
            'M': [7, 7, 7],
            'S': [1, 1, 1],
            'MS': [7, 7, 1]
        },
        'ETTm2': {
            'data': 'ETTm2.csv',
            'T': 'OT',
            'M': [7, 7, 7],
            'S': [1, 1, 1],
            'MS': [7, 7, 1]
        },
        'WTH': {
            'data': 'WTH.csv',
            'T': 'WetBulbCelsius',
            'M': [12, 12, 12],
            'S': [1, 1, 1],
            'MS': [12, 12, 1]
        },
        'ECL': {
            'data': 'ECL.csv',
            'T': 'MT_320',
            'M': [321, 321, 321],
            'S': [1, 1, 1],
            'MS': [321, 321, 1]
        },
        'Solar': {
            'data': 'solar_AL.csv',
            'T': 'POWER_136',
            'M': [137, 137, 137],
            'S': [1, 1, 1],
            'MS': [137, 137, 1]
        },
    }
    # Unknown dataset names silently keep whatever args already carries.
    if args.data in data_parser.keys():
        data_info = data_parser[args.data]
        args.data_path = data_info['data']
        args.target = data_info['T']
        args.enc_in, args.dec_in, args.c_out = data_info[args.features]

    # "3,2,1" -> [3, 2, 1]; stray spaces tolerated.
    args.s_layers = [
        int(s_l) for s_l in args.s_layers.replace(' ', '').split(',')
    ]
    args.detail_freq = args.freq
    args.freq = args.freq[-1:]  # keep only the last character of the freq code

    Exp = Exp_M_Informer

    for ii in range(args.itr):
        # setting record of experiments
        setting = '{}_{}_ft{}_sl{}_ll{}_pl{}_dm{}_nh{}_el{}_dl{}_df{}_at{}_fc{}_eb{}_dt{}_mx{}_{}_{}'.format(
            args.model, args.data, args.features, args.seq_len, args.label_len,
            args.pred_len, args.d_model, args.n_heads, args.e_layers,
            args.d_layers, args.d_ff, args.attn, args.factor, args.embed,
            args.distil, args.mix, args.des, ii)

        exp = Exp(args)  # set experiments
        logger.info(
            '>>>>>>>start training : {}>>>>>>>>>>>>>>>>>>>>>>>>>>'.format(
                setting))
        exp.train(ii, logger)

        logger.info(
            '>>>>>>>testing : {}<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'.format(
                setting))
        exp.test(setting, logger)

        if args.do_predict:
            logger.info(
                '>>>>>>>predicting : {}<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'.
                format(setting))
            exp.predict(setting, True)

        # Free cached GPU memory between experiment repetitions.
        torch.cuda.empty_cache()