Example 1
    def __init__(self, data_loader, epochs, save_epoch, model_path, numTransform, numRef):
        self.data_loader = data_loader
        self.epochs = epochs
        self.model_path = model_path
        self.save_epoch = save_epoch
        self.numTransform = numTransform
        self.numRef = numRef

        self.G = Generator(numTransform, numRef)
        self.D = Discriminator(numTransform, numRef)
        self.G_optim = optim.SGD(self.G.parameters(), lr=1e-3, momentum=0.9)
        self.D_optim = optim.SGD(self.D.parameters(), lr=1e-3, momentum=0.9)

        if self.gpu_mode:
            self.G.cuda()
            self.D.cuda()
            self.BCE_loss = nn.BCELoss().cuda()
            self.L1_Loss = nn.L1Loss().cuda()
        else:
            self.BCE_loss = nn.BCELoss()
            self.L1_Loss = nn.L1Loss()

        self.save_path = model_path + '/model_%d.weights'
        logdir = model_path + "/tmp"
        logger = LogWriter(logdir, sync_cycle=10000)

        with logger.mode("train"):
            self.log_D_real_loss = logger.scalar("D/real_loss")
            self.log_D_fake_loss = logger.scalar("D/fake_loss")
            self.log_D_total_loss = logger.scalar("D/total_loss")
            self.log_G_D_loss = logger.scalar("G/D_Loss")
            self.log_G_L1_loss = logger.scalar("G/L1_Loss")
            self.log_G_total_loss = logger.scalar("G/total_Loss")

        with logger.mode("test"):
            self.log_test_loss = logger.scalar("test/loss")
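The constructor above only creates the scalar writers (note that `self.gpu_mode` and the torch/optim/VisualDL imports are assumed to be defined elsewhere in the class and module). A minimal, hypothetical sketch of how the scalars might be fed during training; the loss names are placeholders, and only `add_record(step, value)` is the VisualDL call used above:

    def _log_step(self, step, d_real_loss, d_fake_loss, g_d_loss, g_l1_loss):
        # record each loss at the current training step (placeholder loss names)
        self.log_D_real_loss.add_record(step, float(d_real_loss))
        self.log_D_fake_loss.add_record(step, float(d_fake_loss))
        self.log_D_total_loss.add_record(step, float(d_real_loss + d_fake_loss))
        self.log_G_D_loss.add_record(step, float(g_d_loss))
        self.log_G_L1_loss.add_record(step, float(g_l1_loss))
        self.log_G_total_loss.add_record(step, float(g_d_loss + g_l1_loss))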
Example 2
mnist = mx.test_utils.get_mnist()
batch_size = 100

# Provide a folder to store the data for logs, models, images, etc. VisualDL's
# visualization is based on this folder.
logdir = "./tmp"

# Initialize a logger instance. The 'sync_cycle' parameter makes the writer flush its
# in-memory records to disk once every 10 write operations.
logger = LogWriter(logdir, sync_cycle=10)

# Mark the components with the 'train' label.
with logger.mode("train"):
    # scalar0 is used to record scalar metrics while MXNet is training. We will record accuracy.
    # In the visualization, we can see the accuracy is increasing as more training steps happen.
    scalar0 = logger.scalar("scalars/scalar0")
    image0 = logger.image("images/image0", 1)
    histogram0 = logger.histogram("histogram/histogram0", num_buckets=100)

# Record training steps
cnt_step = 0


# MXNet provides many callback interfaces. Here we define our own callback, which is
# called after every batch.
# https://mxnet.incubator.apache.org/api/python/callback/callback.html
def add_scalar():
    def _callback(param):
        with logger.mode("train"):
            global cnt_step
            # Here the value is the accuracy we want to record
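The snippet is cut off inside `_callback`. A possible completion sketch, assuming `param` is MXNet's `BatchEndParam` and that the tracked metric (e.g. accuracy) is read through `eval_metric.get_name_value()`; the resulting callback would then be passed to `Module.fit(..., batch_end_callback=add_scalar())`:

def add_scalar():
    def _callback(param):
        with logger.mode("train"):
            global cnt_step
            # read the current metric values (e.g. accuracy) and record them
            for name, value in param.eval_metric.get_name_value():
                scalar0.add_record(cnt_step, value)
            cnt_step += 1
    return _callback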
Example 3
def main():
    env = os.environ
    FLAGS.dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env
    if FLAGS.dist:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        import random
        local_seed = (99 + trainer_id)
        random.seed(local_seed)
        np.random.seed(local_seed)

    cfg = load_config(FLAGS.config)
    if 'architecture' in cfg:
        main_arch = cfg.architecture
    else:
        raise ValueError("'architecture' not specified in config file.")

    merge_config(FLAGS.opt)

    if 'log_iter' not in cfg:
        cfg.log_iter = 20

    # check whether use_gpu=True is set while running the CPU-only build of PaddlePaddle
    check_gpu(cfg.use_gpu)
    # check whether the installed PaddlePaddle version satisfies the requirement
    check_version()
    if not FLAGS.dist or trainer_id == 0:
        print_total_cfg(cfg)

    if cfg.use_gpu:
        devices_num = fluid.core.get_cuda_device_count()
    else:
        devices_num = int(os.environ.get('CPU_NUM', 1))

    if 'FLAGS_selected_gpus' in env:
        device_id = int(env['FLAGS_selected_gpus'])
    else:
        device_id = 0
    place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    scheduler = cfg.LearningRate['schedulers'][0]
    if isinstance(scheduler,
                  CosineDecayWithWarmup) and scheduler.max_iters is None:
        scheduler.max_iters = cfg.max_iters
    lr_builder = create('LearningRate')
    optim_builder = create('OptimizerBuilder')

    # build program
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = create(main_arch)
            if FLAGS.fp16:
                assert (getattr(model.backbone, 'norm_type', None)
                        != 'affine_channel'), \
                    '--fp16 currently does not support affine channel, ' \
                    ' please modify backbone settings to use batch norm'

            with mixed_precision_context(FLAGS.loss_scale, FLAGS.fp16) as ctx:
                inputs_def = cfg['TrainReader']['inputs_def']
                feed_vars, train_loader = model.build_inputs(**inputs_def)
                train_fetches = model.train(feed_vars)
                loss = train_fetches['loss']
                if FLAGS.fp16:
                    loss *= ctx.get_loss_scale_var()
                lr = lr_builder()
                optimizer = optim_builder(lr)
                optimizer.minimize(loss)
                if FLAGS.fp16:
                    loss /= ctx.get_loss_scale_var()

    # parse train fetches
    train_keys, train_values, _ = parse_fetches(train_fetches)
    train_values.append(lr)

    if FLAGS.eval:
        eval_prog = fluid.Program()
        with fluid.program_guard(eval_prog, startup_prog):
            with fluid.unique_name.guard():
                model = create(main_arch)
                inputs_def = cfg['EvalReader']['inputs_def']
                feed_vars, eval_loader = model.build_inputs(**inputs_def)
                fetches = model.eval(feed_vars)
        eval_prog = eval_prog.clone(True)

        eval_reader = create_reader(cfg.EvalReader)
        eval_loader.set_sample_list_generator(eval_reader, place)

        # parse eval fetches
        extra_keys = []
        if cfg.metric == 'COCO':
            extra_keys = ['im_info', 'im_id', 'im_shape']
        if cfg.metric == 'VOC':
            extra_keys = ['gt_bbox', 'gt_class', 'is_difficult']
        if cfg.metric == 'WIDERFACE':
            extra_keys = ['im_id', 'im_shape', 'gt_bbox']
        eval_keys, eval_values, eval_cls = parse_fetches(
            fetches, eval_prog, extra_keys)

    # compile program for multi-devices
    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_optimizer_ops = False
    # only enable sync_bn when running on multiple GPU devices
    sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn'
    build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \
        and cfg.use_gpu

    exec_strategy = fluid.ExecutionStrategy()
    # number of iterations after which CompiledProgram drops local execution scopes.
    # Set it to 1 to save memory, so that unused variables in local execution scopes
    # can be deleted after each iteration.
    exec_strategy.num_iteration_per_drop_scope = 1
    if FLAGS.dist:
        dist_utils.prepare_for_multi_process(exe, build_strategy, startup_prog,
                                             train_prog)
        exec_strategy.num_threads = 1

    exe.run(startup_prog)
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    if FLAGS.eval:
        compiled_eval_prog = fluid.compiler.CompiledProgram(eval_prog)

    fuse_bn = getattr(model.backbone, 'norm_type', None) == 'affine_channel'

    ignore_params = cfg.finetune_exclude_pretrained_params \
                 if 'finetune_exclude_pretrained_params' in cfg else []

    start_iter = 0
    if FLAGS.resume_checkpoint:
        checkpoint.load_checkpoint(exe, train_prog, FLAGS.resume_checkpoint)
        start_iter = checkpoint.global_step()
    elif cfg.pretrain_weights and fuse_bn and not ignore_params:
        checkpoint.load_and_fusebn(exe, train_prog, cfg.pretrain_weights)
    elif cfg.pretrain_weights:
        checkpoint.load_params(exe,
                               train_prog,
                               cfg.pretrain_weights,
                               ignore_params=ignore_params)

    train_reader = create_reader(cfg.TrainReader,
                                 (cfg.max_iters - start_iter) * devices_num,
                                 cfg)
    train_loader.set_sample_list_generator(train_reader, place)

    # whether output bbox is normalized in model output layer
    is_bbox_normalized = False
    if hasattr(model, 'is_bbox_normalized') and \
            callable(model.is_bbox_normalized):
        is_bbox_normalized = model.is_bbox_normalized()

    # if map_type is not set, default to '11point'; only used in VOC evaluation
    map_type = cfg.map_type if 'map_type' in cfg else '11point'

    train_stats = TrainingStats(cfg.log_smooth_window, train_keys)
    train_loader.start()
    start_time = time.time()
    end_time = time.time()

    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)
    time_stat = deque(maxlen=cfg.log_smooth_window)
    best_box_ap_list = [0.0, 0]  #[map, iter]

    # use tb-paddle to log data
    if FLAGS.use_tb:
        from tb_paddle import SummaryWriter
        tb_writer = SummaryWriter(FLAGS.tb_log_dir)
        tb_loss_step = 0
        tb_mAP_step = 0

    if FLAGS.use_vdl:
        from visualdl import LogWriter
        vdl_writer = LogWriter(FLAGS.vdl_log_dir, sync_cycle=5)

        with vdl_writer.mode("train"):
            scalars = [
                vdl_writer.scalar(loss_name) for loss_name in train_keys
            ]
            mAP_scalar = vdl_writer.scalar("mAP")
        vdl_loss_step = 0
        vdl_mAP_step = 0

    for it in range(start_iter, cfg.max_iters):
        start_time = end_time
        end_time = time.time()
        time_stat.append(end_time - start_time)
        time_cost = np.mean(time_stat)
        eta_sec = (cfg.max_iters - it) * time_cost
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        outs = exe.run(compiled_train_prog, fetch_list=train_values)
        stats = {k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1])}

        # use tb-paddle to log loss
        if FLAGS.use_tb:
            if it % cfg.log_iter == 0:
                for loss_name, loss_value in stats.items():
                    tb_writer.add_scalar(loss_name, loss_value, tb_loss_step)
                tb_loss_step += 1

        if FLAGS.use_vdl:
            if it % cfg.log_iter == 0:
                for loss_name, scalar in zip(train_keys, scalars):
                    loss_value = stats[loss_name]
                    scalar.add_record(vdl_loss_step, loss_value)
                vdl_loss_step += 1

        train_stats.update(stats)
        logs = train_stats.log()
        if it % cfg.log_iter == 0 and (not FLAGS.dist or trainer_id == 0):
            strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format(
                it, np.mean(outs[-1]), logs, time_cost, eta)
            logger.info(strs)


        if (it > 0 and it % cfg.snapshot_iter == 0 or it == cfg.max_iters - 1) \
           and (not FLAGS.dist or trainer_id == 0):
            save_name = str(it) if it != cfg.max_iters - 1 else "model_final"
            checkpoint.save(exe, train_prog, os.path.join(save_dir, save_name))

            if FLAGS.eval:
                # evaluation
                results = eval_run(exe, compiled_eval_prog, eval_loader,
                                   eval_keys, eval_values, eval_cls)
                resolution = None
                if 'mask' in results[0]:
                    resolution = model.mask_head.resolution
                box_ap_stats = eval_results(results, cfg.metric,
                                            cfg.num_classes, resolution,
                                            is_bbox_normalized,
                                            FLAGS.output_eval, map_type,
                                            cfg['EvalReader']['dataset'])

                # use tb_paddle to log mAP
                if FLAGS.use_tb:
                    tb_writer.add_scalar("mAP", box_ap_stats[0], tb_mAP_step)
                    tb_mAP_step += 1

                if FLAGS.use_vdl:
                    mAP_scalar.add_record(vdl_mAP_step, box_ap_stats[0])
                    vdl_mAP_step += 1

                if box_ap_stats[0] > best_box_ap_list[0]:
                    best_box_ap_list[0] = box_ap_stats[0]
                    best_box_ap_list[1] = it
                    checkpoint.save(exe, train_prog,
                                    os.path.join(save_dir, "best_model"))
                logger.info("Best test box ap: {}, in iter: {}".format(
                    best_box_ap_list[0], best_box_ap_list[1]))

    train_loader.reset()
Example 4
class Trainer(object):
    @classmethod
    def add_cmdline_argument(cls, parser):
        """ Add the cmdline arguments of trainer. """
        group = parser.add_argument_group("Trainer")
        group.add_argument(
            "--use_data_distributed",
            type=str2bool,
            default=False,
            help="Whether to use data distributed for parallel training.")
        group.add_argument(
            "--valid_metric_name",
            type=str,
            default="-loss",
            help=
            "The validation metric determining which checkpoint is the best.")
        group.add_argument("--num_epochs",
                           type=int,
                           default=10,
                           help="Total number of training epochs to perform.")
        group.add_argument(
            "--save_dir",
            type=str,
            required=True,
            help="The output directory where the model will be saved.")
        group.add_argument(
            "--batch_size",
            type=int,
            default=8,
            help="Total batch size for training/evaluation/inference.")
        group.add_argument(
            "--log_steps",
            type=int,
            default=100,
            help="The number of training steps to output current metrics "
            "on past training dataset.")
        group.add_argument(
            "--valid_steps",
            type=int,
            default=2000,
            help="The number of training steps to perform a evaluation "
            "on validation datasets.")
        group.add_argument(
            "--save_checkpoint",
            type=str2bool,
            default=True,
            help="Whether to save one checkpoints for each training epoch.")
        group.add_argument(
            "--save_summary",
            type=str2bool,
            default=False,
            help="Whether to save metrics summary for visualDL module.")
        DataLoader.add_cmdline_argument(group)
        return group

    def __init__(self, model, to_tensor, hparams, logger=None):
        # Use data distributed
        if hparams.use_data_distributed:
            strategy = parallel.prepare_context()
            if strategy is not None:
                parallel_model = parallel.DataParallel(model, strategy)
                model.before_backward_fn = parallel_model.scale_loss
                model.after_backward_fn = parallel_model.apply_collective_grads
                model = parallel_model

        self.model = model
        self.to_tensor = to_tensor

        self.is_decreased_valid_metric = hparams.valid_metric_name[0] == "-"
        self.valid_metric_name = hparams.valid_metric_name[1:]
        self.num_epochs = hparams.num_epochs
        self.save_dir = hparams.save_dir
        self.log_steps = hparams.log_steps
        self.valid_steps = hparams.valid_steps
        self.save_checkpoint = hparams.save_checkpoint
        self.save_summary = hparams.save_summary

        if not os.path.exists(self.save_dir):
            os.makedirs(self.save_dir)

        self.logger = logger or get_logger(
            os.path.join(self.save_dir, "trainer.log"), "trainer")

        if self.save_summary:
            from visualdl import LogWriter
            self.summary_logger = LogWriter(os.path.join(
                self.save_dir, "summary"),
                                            sync_cycle=10000)
            self.train_summary = {}
            self.valid_summary = {}

        self.batch_metrics_tracker = MetricsTracker()
        self.token_metrics_tracker = MetricsTracker()

        self.best_valid_metric = float(
            "inf" if self.is_decreased_valid_metric else "-inf")
        self.epoch = 0
        self.batch_num = 0

    def train_epoch(self,
                    train_iter,
                    valid_iter,
                    infer_iter=None,
                    infer_parse_dict=None):
        """
        Train an epoch.

        @param train_iter
        @type : DataLoader

        @param valid_iter
        @type : DataLoader

        @param infer_iter
        @type : DataLoader

        @param infer_parse_dict
        @type : dict of function
        """
        self.epoch += 1
        num_batches = len(train_iter)
        self.batch_metrics_tracker.clear()
        self.token_metrics_tracker.clear()
        times = []
        for batch_id, (batch, batch_size) in enumerate(train_iter, 1):
            batch = type(batch)(map(lambda kv: (kv[0], self.to_tensor(kv[1])),
                                    batch.items()))
            batch["epoch"] = self.epoch
            batch["num_steps"] = self.batch_num

            # Do a training iteration
            start_time = time.time()
            metrics = self.model(batch, is_training=True)
            token_num = metrics.pop("token_num", None)
            elapsed = time.time() - start_time
            times.append(elapsed)

            batch_metrics = {
                k: v
                for k, v in metrics.items() if "token" not in k
            }
            token_metrics = {k: v for k, v in metrics.items() if "token" in k}
            self.batch_metrics_tracker.update(batch_metrics, batch_size)
            self.token_metrics_tracker.update(token_metrics, token_num)
            self.batch_num += 1

            if self.log_steps and batch_id % self.log_steps == 0:
                batch_metrics_message = self.batch_metrics_tracker.value()
                token_metrics_message = self.token_metrics_tracker.value()
                message_prefix = f"[Train][{self.epoch}][{batch_id}/{num_batches}]"
                avg_time = f"AVG_Time-{sum(times[-self.log_steps:]) / self.log_steps:.3f}"
                message = "   ".join([
                    message_prefix, batch_metrics_message,
                    token_metrics_message, avg_time
                ])
                self.logger.info(message)

            if self.save_summary:
                with self.summary_logger.mode("train"):
                    for k, v in self.batch_metrics_tracker.items():
                        if k not in self.train_summary:
                            self.train_summary[k] = self.summary_logger.scalar(
                                k)
                        scalar = self.train_summary[k]
                        scalar.add_record(self.batch_num, v)
                    for k, v in self.token_metrics_tracker.items():
                        if k not in self.train_summary:
                            self.train_summary[k] = self.summary_logger.scalar(
                                k)
                        scalar = self.train_summary[k]
                        scalar.add_record(self.batch_num, v)

            if self.valid_steps and valid_iter is not None and \
                    batch_id % self.valid_steps == 0:
                self.evaluate(valid_iter)

        if valid_iter is not None:
            self.evaluate(valid_iter)

        if infer_iter is not None and infer_parse_dict is not None:
            self.infer(infer_iter, infer_parse_dict)

        return

    def infer(self, data_iter, parse_dict, num_batches=None):
        """
        Inference interface.

        @param : data_iter
        @type : DataLoader

        @param : parse_dict
        @type : dict of function

        @param : num_batches : the number of batch to infer
        @type : int/None
        """
        self.logger.info("Generation starts ...")
        infer_save_file = os.path.join(self.save_dir,
                                       f"infer_{self.epoch}.result.json")

        # Inference
        infer_results = []
        batch_cnt = 0
        begin_time = time.time()
        for batch, batch_size in tqdm(data_iter, total=num_batches):
            batch = type(batch)(map(lambda kv: (kv[0], self.to_tensor(kv[1])),
                                    batch.items()))

            result = self.model.infer(inputs=batch)
            batch_result = {}

            def to_list(batch):
                """ Parse list. """
                return batch.tolist()

            # parse
            for k in result:
                if k in parse_dict:
                    parse_fn = parse_dict[k]
                else:
                    parse_fn = to_list
                if result[k] is not None:
                    batch_result[k] = parse_fn(result[k])

            for vs in zip(*batch_result.values()):
                infer_result = {}
                for k, v in zip(batch_result.keys(), vs):
                    infer_result[k] = v
                infer_results.append(infer_result)

            batch_cnt += 1
            if batch_cnt == num_batches:
                break

        self.logger.info(f"Saved inference results to {infer_save_file}")
        with open(infer_save_file, "w") as fp:
            json.dump(infer_results, fp, indent=2)
        infer_metrics_tracker = evaluate_generation_result(infer_results)
        metrics_message = infer_metrics_tracker.summary()
        message_prefix = f"[Infer][{self.epoch}]"
        time_cost = f"TIME-{time.time() - begin_time:.3f}"
        message = "   ".join([message_prefix, metrics_message, time_cost])
        self.logger.info(message)
        return

    def evaluate(self, data_iter, need_save=True):
        """
        Evaluation interface

        @param : data_iter
        @type : DataLoader

        @param : need_save
        @type : bool
        """
        if isinstance(self.model, parallel.DataParallel):
            need_save = need_save and parallel.Env().local_rank == 0

        # Evaluation
        begin_time = time.time()
        batch_metrics_tracker = MetricsTracker()
        token_metrics_tracker = MetricsTracker()
        for batch, batch_size in data_iter:
            batch = type(batch)(map(lambda kv: (kv[0], self.to_tensor(kv[1])),
                                    batch.items()))
            metrics = self.model(batch, is_training=False)
            token_num = int(metrics.pop("token_num"))
            batch_metrics = {
                k: v
                for k, v in metrics.items() if "token" not in k
            }
            token_metrics = {k: v for k, v in metrics.items() if "token" in k}
            batch_metrics_tracker.update(batch_metrics, batch_size)
            token_metrics_tracker.update(token_metrics, token_num)
        batch_metrics_message = batch_metrics_tracker.summary()
        token_metrics_message = token_metrics_tracker.summary()
        message_prefix = f"[Valid][{self.epoch}]"
        time_cost = f"TIME-{time.time() - begin_time:.3f}"
        message = "   ".join([
            message_prefix, batch_metrics_message, token_metrics_message,
            time_cost
        ])
        self.logger.info(message)

        if need_save:
            # Check valid metric
            cur_valid_metric = batch_metrics_tracker.get(
                self.valid_metric_name)
            if self.is_decreased_valid_metric:
                is_best = cur_valid_metric < self.best_valid_metric
            else:
                is_best = cur_valid_metric > self.best_valid_metric
            if is_best:
                # Save current best model
                self.best_valid_metric = cur_valid_metric
                best_model_path = os.path.join(self.save_dir, "best.model")
                save(self.model, best_model_path)
                self.logger.info(
                    f"Saved best model to '{best_model_path}' with new best valid metric "
                    f"{self.valid_metric_name.upper()}-{self.best_valid_metric:.3f}"
                )

            # Save checkpoint
            if self.save_checkpoint:
                model_file = os.path.join(self.save_dir,
                                          f"epoch_{self.epoch}.model")
                save(self.model, model_file)

            if self.save_summary:
                with self.summary_logger.mode("valid"):
                    for k, v in batch_metrics_tracker.items():
                        if k not in self.valid_summary:
                            self.valid_summary[k] = self.summary_logger.scalar(
                                k)
                        scalar = self.valid_summary[k]
                        scalar.add_record(self.batch_num, v)
                    for k, v in token_metrics_tracker.items():
                        if k not in self.valid_summary:
                            self.valid_summary[k] = self.summary_logger.scalar(
                                k)
                        scalar = self.valid_summary[k]
                        scalar.add_record(self.batch_num, v)

        return
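A hypothetical wiring sketch for the Trainer above (the `model`, `to_tensor`, and the train/valid iterators are assumptions standing in for objects defined elsewhere in the original project):

import argparse

parser = argparse.ArgumentParser()
Trainer.add_cmdline_argument(parser)
hparams = parser.parse_args(["--save_dir", "./output", "--save_summary", "true"])

trainer = Trainer(model, to_tensor, hparams)      # model / to_tensor assumed
for _ in range(hparams.num_epochs):
    trainer.train_epoch(train_iter, valid_iter)   # data iterators assumed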
Example 5
import random
from visualdl import LogWriter

logdir = './temp'
logger = LogWriter(logdir, sync_cycle=10)
with logger.mode('train'):
    scalar0 = logger.scalar('scalar0')
for step in range(0, 1000):
    scalar0.add_record(step, random.random())
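The scalar written above can be read back with `LogReader`, using the reader API demonstrated in Example 6 below (a minimal sketch):

from visualdl import LogReader

reader = LogReader(logdir)
with reader.mode('train') as r:
    scalar0_reader = r.scalar('scalar0')
    # print the first few recorded steps and values
    print(scalar0_reader.ids()[:5], scalar0_reader.records()[:5])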
Example 6
class StorageTest(unittest.TestCase):
    def setUp(self):
        self.dir = "./tmp/storage_test"
        self.writer = LogWriter(self.dir, sync_cycle=1).as_mode("train")

    def test_scalar(self):
        print('test write')
        scalar = self.writer.scalar("model/scalar/min")
        # scalar.set_caption("model/scalar/min")
        for i in range(10):
            scalar.add_record(i, float(i))

        print('test read')
        self.reader = LogReader(self.dir)
        with self.reader.mode("train") as reader:
            scalar = reader.scalar("model/scalar/min")
            self.assertEqual(scalar.caption(), "train")
            records = scalar.records()
            ids = scalar.ids()
            self.assertTrue(
                np.equal(records, [float(i) for i in range(10 - 1)]).all())
            self.assertTrue(np.equal(ids, [float(i) for i in range(10)]).all())
            print('records', records)
            print('ids', ids)

    def test_image(self):
        tag = "layer1/layer2/image0"
        image_writer = self.writer.image(tag, 10, 1)
        num_passes = 10
        num_samples = 100
        shape = [10, 10, 3]

        for pass_ in range(num_passes):
            image_writer.start_sampling()
            for ins in range(num_samples):
                data = np.random.random(shape) * 256
                data = np.ndarray.flatten(data)
                image_writer.add_sample(shape, list(data))
            image_writer.finish_sampling()

        self.reader = LogReader(self.dir)
        with self.reader.mode("train") as reader:
            image_reader = reader.image(tag)
            self.assertEqual(image_reader.caption(), tag)
            self.assertEqual(image_reader.num_records(), num_passes)

            image_record = image_reader.record(0, 1)
            self.assertTrue(np.equal(image_record.shape(), shape).all())
            data = image_record.data()
            self.assertEqual(len(data), np.prod(shape))

            image_tags = reader.tags("image")
            self.assertTrue(image_tags)
            self.assertEqual(len(image_tags), 1)

    def test_check_image(self):
        '''
        check whether the storage will keep image data consistent
        '''
        print('check image')
        tag = "layer1/check/image1"
        image_writer = self.writer.image(tag, 10)

        image = Image.open("./dog.jpg")
        shape = [image.size[1], image.size[0], 3]
        origin_data = np.array(image.getdata()).flatten()

        self.reader = LogReader(self.dir)
        with self.reader.mode("train") as reader:

            image_writer.start_sampling()
            image_writer.add_sample(shape, list(origin_data))
            image_writer.finish_sampling()

            # read and check whether the original image will be displayed
            image_reader = reader.image(tag)
            image_record = image_reader.record(0, 0)
            data = image_record.data()
            shape = image_record.shape()

            PIL_image_shape = (shape[0] * shape[1], shape[2])
            data = np.array(data, dtype='uint8').reshape(PIL_image_shape)
            print('origin', origin_data.flatten())
            print('data', data.flatten())
            image = Image.fromarray(data.reshape(shape))
            # manually checked the image and found nothing wrong with the image storage.
            # image.show()

    def test_with_syntax(self):
        with self.writer.mode("train") as writer:
            scalar = writer.scalar("model/scalar/average")
            for i in range(10):
                scalar.add_record(i, float(i))

        self.reader = LogReader(self.dir)
        with self.reader.mode("train") as reader:
            scalar = reader.scalar("model/scalar/average")
            self.assertEqual(scalar.caption(), "train")

    def test_modes(self):
        store = LogWriter(self.dir, sync_cycle=1)

        scalars = []

        for i in range(10):
            with store.mode("mode-%d" % i) as writer:
                scalar = writer.scalar("add/scalar0")
                scalars.append(scalar)

        for scalar in scalars[:-1]:
            for i in range(10):
                scalar.add_record(i, float(i))
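The test class above omits its header; a minimal sketch of the imports it relies on and a standard runner (assuming a `dog.jpg` next to the script for `test_check_image`):

import unittest

import numpy as np
from PIL import Image
from visualdl import LogReader, LogWriter

# class StorageTest(unittest.TestCase): ... as above ...

if __name__ == '__main__':
    unittest.main()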
Example 7
        model,
        base_learning_rate=0.1,
        policy="step",
        stepsize=1,
        gamma=0.999,
    )


# create VisualDL logger
logdir = "/workspace"
logger = LogWriter(logdir, sync_cycle=100)

# Mark the components with the 'train' label.
with logger.mode("train"):
    # create scalar components under the 'scalars/' namespace
    scalar_caffe2_mnist_train_loss = logger.scalar(
        "scalars/scalar_caffe2_mnist_train_loss")
    scalar_caffe2_mnist_train_accuracy = logger.scalar(
        "scalars/scalar_caffe2_mnist_train_accuracy")
    histogram0 = logger.histogram("histogram/histogram0", num_buckets=50)
    histogram1 = logger.histogram("histogram/histogram1", num_buckets=50)

# Specify the data will be input in NCHW order
#  (i.e. [batch_size, num_channels, height, width])
arg_scope = {"order": "NCHW"}
# Create the model helper for the train model
train_model = model_helper.ModelHelper(name="mnist_train", arg_scope=arg_scope)
# Specify the input is from the train lmdb
data, label = AddInput(train_model,
                       batch_size=64,
                       db=os.path.join(data_folder, 'mnist-train-nchw-lmdb'),
                       db_type='lmdb')
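The Caffe2 snippet stops before the training loop. A hedged sketch of how the two scalar writers might be fed, assuming the standard Caffe2 workspace API and that the omitted model code produces 'loss' and 'accuracy' blobs:

from caffe2.python import workspace

# Hypothetical training loop; the blob names 'loss' and 'accuracy' are assumptions
# about the omitted part of the tutorial.
workspace.RunNetOnce(train_model.param_init_net)
workspace.CreateNet(train_model.net, overwrite=True)
for step in range(200):
    workspace.RunNet(train_model.net)
    scalar_caffe2_mnist_train_loss.add_record(
        step, float(workspace.FetchBlob('loss')))
    scalar_caffe2_mnist_train_accuracy.add_record(
        step, float(workspace.FetchBlob('accuracy')))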
Example 8
    # get data
    g, label, train_idx, valid_idx, test_idx, evaluator = get_graph_data(
                                                            d_name=d_name, 
                                                            mini_data=eval(args.mini_data))
    
    
    # create log writer
    log_writer = LogWriter(args.log_path, sync_cycle=10)
    with log_writer.mode("train") as logger:
        log_train_loss_epoch = logger.scalar("loss")
        log_train_rocauc_epoch = logger.scalar("rocauc")
    with log_writer.mode("valid") as logger:
        log_valid_loss_epoch = logger.scalar("loss")
        log_valid_rocauc_epoch = logger.scalar("rocauc")
    log_text = log_writer.text("text")
    log_time = log_writer.scalar("time")
    log_test_loss = log_writer.scalar("test_loss")
    log_test_rocauc = log_writer.scalar("test_rocauc")

    
    # training
    samples = [25, 10] # 2-hop sample size
    batch_size = args.batch_size
    sample_workers = 1
                        
    place = fluid.CUDAPlace(args.gpu_id) if args.use_gpu else fluid.CPUPlace()           
    train_program = fluid.Program()
    startup_program = fluid.Program()

    with fluid.program_guard(train_program, startup_program):
        gw = pgl.graph_wrapper.GraphWrapper(
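The snippet breaks off while building the train program. A hedged sketch of how the epoch-level writers created above might be fed later in the same script (the epoch loop and metric variables are placeholders, not part of the original code):

    # Hypothetical epoch loop; the metric values are placeholders.
    for epoch in range(args.epochs):
        log_train_loss_epoch.add_record(epoch, train_loss)
        log_train_rocauc_epoch.add_record(epoch, train_rocauc)
        log_valid_loss_epoch.add_record(epoch, valid_loss)
        log_valid_rocauc_epoch.add_record(epoch, valid_rocauc)
        log_time.add_record(epoch, epoch_seconds)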
Example 9
    img = img / 2 + 0.5  # unnormalize
    npimg = img.numpy()
    fig, ax = plt.subplots()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    # we can either show the image or save it locally
    # plt.show()
    fig.savefig('out' + str(np.random.randint(0, 10000)) + '.pdf')


logdir = "./workspace"
logger = LogWriter(logdir, sync_cycle=100)

# Mark the components with the 'train' label.
with logger.mode("train"):
    # create a scalar component under the 'scalars/' namespace
    scalar_pytorch_train_loss = logger.scalar(
        "scalars/scalar_pytorch_train_loss")
    image1 = logger.image("images/image1", 1)
    image2 = logger.image("images/image2", 1)
    histogram0 = logger.histogram("histogram/histogram0", num_buckets=100)

# get some random training images
dataiter = iter(trainloader)
images, labels = next(dataiter)  # use the built-in next() (Python 3)

# show images
imshow(torchvision.utils.make_grid(images))
# print labels
print(' '.join('%5s' % classes[labels[j]] for j in range(4)))


# Define a Convolution Neural Network
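The example ends just before the network definition. A hedged sketch of how `scalar_pytorch_train_loss` might be fed in a standard PyTorch training loop (the `net`, `criterion`, and `optimizer` objects stand in for the omitted definitions):

# Hypothetical training loop; net, criterion and optimizer come from the omitted code.
train_step = 0
for epoch in range(2):
    for inputs, targets in trainloader:
        optimizer.zero_grad()
        loss = criterion(net(inputs), targets)
        loss.backward()
        optimizer.step()
        # record the scalar training loss at each step
        scalar_pytorch_train_loss.add_record(train_step, loss.item())
        train_step += 1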
Example 10
def get_result(test_for):
    """
    get log from db and produce protobuf logs
    :return:
    """
    result_logs = bm.ViewVisualDLLog.objects.filter(test_for=test_for)
    if not result_logs:
        print("no {} results in latest paddle version".format(test_for))
        return
    paddle_version = result_logs[0].paddle_version if result_logs else ''
    version_path = os.path.join(conf.ROOT_PATH, 'visualdl_logs',
                                paddle_version)
    cmd = "if [ ! -d %s ]; then mkdir %s; fi" % (version_path, version_path)
    os.system(cmd)
    logdir = os.path.join(version_path, test_for)
    #logdir_des = conf.ROOT_PATH + '/visualdl_logs/latest'
    logdir_des = os.path.join(conf.ROOT_PATH, 'visualdl_logs', 'latest',
                              test_for)
    cmd = "if [ -e %s ]; then rm -rf %s; fi; mkdir %s" % (logdir, logdir,
                                                          logdir)
    os.system(cmd)

    log_writer = LogWriter(logdir, sync_cycle=1)

    def sample_log(result_log_dict, model, run_machine_type):
        """sample log from db log depends on model and run_machine_type"""
        if model == 'ocr':
            sample_ratio = 1
            if run_machine_type.startswith("MULTI_MACHINE_MULTI"):
                sample_ratio = 62
            elif run_machine_type.startswith("MULTI_MACHINE_ONE"):
                sample_ratio = 15
            elif run_machine_type.startswith("ONE"):
                sample_ratio = 15
            elif run_machine_type.startswith("FOUR"):
                sample_ratio = 15
            elif run_machine_type.startswith("MULTI_GPU"):
                sample_ratio = 15

            for k, v in result_log_dict.items():
                sample_list = [
                    v[index] for index in range(len(v))
                    if index % sample_ratio == 0
                ]
                result_log_dict[k] = [[index + 1, sample_list[index][1]]
                                      for index in range(len(sample_list))]

        return result_log_dict

    for log in result_logs:
        model = log.model
        test_for = log.test_for
        #code_from = log.code_from
        run_rpc_type = log.run_rpc_type.lower()
        run_machine_type = log.run_machine_type.lower()
        tag = "%s_%s_%s" % (test_for.split('_')[0], run_machine_type,
                            run_rpc_type)
        result_log_dict = json.loads(log.result_log)
        #sample_log_dict = sample_log(result_log_dict, model, run_machine_type)
        print("visualdl_paint cur is: %s_%s_%s" %
              (model, tag, log.cloud_job_id))
        for indicant, values in result_log_dict.items():
            with log_writer.mode(indicant) as mode_logger:
                val_tag = mode_logger.scalar("%s/%s" % (model, tag))
                for step, value in values:
                    if value != 'NaN':
                        val_tag.add_record(int(step), float(value))

    cmd = "rm -rf %s && cp -r %s %s" % (logdir_des, logdir, logdir_des)
    os.system(cmd)
Example 11
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

# create VisualDL logger
logdir = "/workspace"
logger = LogWriter(logdir, sync_cycle=100)

# Mark the components with the 'train' label.
with logger.mode("train"):
    # create a scalar component under the 'scalars/' namespace
    scalar_keras_train_loss = logger.scalar(
        "scalars/scalar_keras_mnist_train_loss")
    image_input = logger.image("images/input", 1)
    image0 = logger.image("images/image0", 1)
    image1 = logger.image("images/image1", 1)
    histogram0 = logger.histogram("histogram/histogram0", num_buckets=50)
    histogram1 = logger.histogram("histogram/histogram1", num_buckets=50)

train_step = 0


class LossHistory(keras.callbacks.Callback):
    def on_batch_end(self, batch, logs={}):
        global train_step

        # Scalar
        scalar_keras_train_loss.add_record(train_step, logs.get('loss'))
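The callback is cut off here; presumably it also advances `train_step` after recording. A hedged sketch of how such a callback might be attached to training (the data variables, `batch_size`, and `epochs` come from the omitted parts of the script):

# Hypothetical fit call; x_train/y_train/x_test/y_test and epochs are assumptions.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(x_test, y_test),
          callbacks=[LossHistory()])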