Example #1
    def PartialFcJob(labels: oft.Numpy.Placeholder(
        (batch_size, ), dtype=type_name_to_flow_type[label_type])):
        with flow.scope.placement(device_type, "0:0"):
            x = flow.get_variable(
                "x-weight",
                shape=(num_classes, 128),
                dtype=flow.float,
                initializer=flow.random_uniform_initializer(minval=-10,
                                                            maxval=10),
                trainable=True,
            )
        with flow.scope.placement(device_type, "0:0-3"):
            labels_distribute = flow.distribute.broadcast()
            weight_distribute = flow.distribute.split(0)
            (
                mapped_label,
                sampled_label,
                sampled_weight,
            ) = flow.distributed_partial_fc_sample(
                weight=x.with_distribute(weight_distribute),
                label=labels.with_distribute(labels_distribute),
                num_sample=num_sample,
            )
        with flow.scope.placement(device_type, "0:0"):
            sampled_weight = flow.identity(sampled_weight)
            loss = flow.math.square(sampled_weight)
            flow.optimizer.SGD(
                flow.optimizer.PiecewiseConstantScheduler([], [1e-4]), momentum=0
            ).minimize(loss)

            flow.watch(x, test_global_storage.Setter("x"))
            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
            flow.watch_diff(sampled_weight,
                            test_global_storage.Setter("sampled_weight_diff"))
        return x, mapped_label, sampled_label, sampled_weight
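In this job, flow.distributed_partial_fc_sample takes the model-parallel weight (split along dimension 0) and the broadcast labels and returns three tensors: mapped_label (the input labels re-indexed into the sampled class set), sampled_label (the indices of the classes that were kept), and sampled_weight (the corresponding rows of the weight matrix). Downstream computation therefore only touches num_sample classes instead of all num_classes.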
Example #2
 def test_partial_fc(test_case):
     p = flow.env.all_device_placement("cuda")
     w = flow.randn(50000, 128, placement=p, sbp=flow.sbp.broadcast)
     label = flow.randint(0, 50000, (512,), placement=p, sbp=flow.sbp.broadcast)
     num_sample = 5000
     out = flow.distributed_partial_fc_sample(w, label, num_sample)
     test_case.assertTrue(out[0].shape == flow.Size([512]))
     test_case.assertTrue(out[1].shape == flow.Size([5000]))
     test_case.assertTrue(out[2].shape == flow.Size([5000, 128]))
Example #3
 def test_partial_fc(test_case):
     p = flow.env.all_device_placement("cuda")
     w = flow.randn(
         50000, 128, placement=p, sbp=flow.sbp.broadcast, requires_grad=True
     )
     label = flow.randint(0, 50000, (512,), placement=p, sbp=flow.sbp.broadcast)
     num_sample = 5000
     out = flow.distributed_partial_fc_sample(w, label, num_sample)
     test_case.assertTrue(out[0].shape == flow.Size([512]))
     test_case.assertTrue(out[1].shape == flow.Size([5000]))
     test_case.assertTrue(out[2].shape == flow.Size([5000, 128]))
     # test gradient function
     sample_weight = out[2]
     sample_weight.sum().backward()
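The two tests above only check output shapes and gradient flow. For context, the sketch below shows how the three outputs are typically consumed under the same eager global API; the backbone feature tensor feat, its shape, and the plain CrossEntropyLoss are illustrative assumptions rather than part of the original tests.

import oneflow as flow

p = flow.env.all_device_placement("cuda")
# Full classification weight (num_classes x embedding_size) and a batch of labels.
w = flow.randn(50000, 128, placement=p, sbp=flow.sbp.broadcast, requires_grad=True)
label = flow.randint(0, 50000, (512,), placement=p, sbp=flow.sbp.broadcast)
# Hypothetical backbone output; in a real model this comes from the network.
feat = flow.randn(512, 128, placement=p, sbp=flow.sbp.broadcast)

mapped_label, sampled_label, sampled_weight = flow.distributed_partial_fc_sample(
    w, label, num_sample=5000
)

# Logits are computed only against the 5000 sampled classes; mapped_label
# re-indexes every ground-truth label into that sampled subset.
logits = flow.matmul(feat, sampled_weight, transpose_b=True)
loss = flow.nn.CrossEntropyLoss()(logits, mapped_label)
loss.backward()  # gradients reach w through sampled_weight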
Example #4
 def forward(self, x, label):
     x = flow.nn.functional.l2_normalize(input=x, dim=1, epsilon=1e-10)
     if self.partial_fc:
         (
             mapped_label,
             sampled_label,
             sampled_weight,
         ) = flow.distributed_partial_fc_sample(
             weight=self.weight, label=label, num_sample=self.total_num_sample,
         )
         label = mapped_label
         weight = sampled_weight
     else:
         weight = self.weight
     weight = flow.nn.functional.l2_normalize(input=weight, dim=1, epsilon=1e-10)
     x = flow.matmul(x, weight, transpose_b=True)
     if x.is_consistent:
         return x, label
     else:
         return x
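Note that in the consistent (global) case the method returns the label alongside the logits: when partial_fc is enabled, the label has been remapped into the sampled class set, so the caller must use the returned label rather than the original one when computing the loss.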
Example #5
    def get_symbol_train_job():
        if cfg.use_synthetic_data:
            (labels, images) = load_synthetic(cfg)
        else:
            labels, images = load_train_dataset(cfg)
        image_size = images.shape[2:]
        assert len(image_size) == 2, "The length of image size must be equal to 2."
        assert image_size[0] == image_size[1], "image_size[0] should be equal to image_size[1]."

        embedding = get_model(cfg.network, images, cfg)

        def _get_initializer():
            return flow.random_normal_initializer(mean=0.0, stddev=0.01)

        trainable = True

        if cfg.model_parallel and cfg.device_num_per_node > 1:
            logging.info("Training is using model parallelism now.")
            labels = labels.with_distribute(flow.distribute.broadcast())
            fc1_distribute = flow.distribute.broadcast()
            fc7_data_distribute = flow.distribute.split(1)
            fc7_model_distribute = flow.distribute.split(0)
        else:
            fc1_distribute = flow.distribute.split(0)
            fc7_data_distribute = flow.distribute.split(0)
            fc7_model_distribute = flow.distribute.broadcast()
        weight_regularizer = flow.regularizers.l2(0.0005)
        fc7_weight = flow.get_variable(
            name="fc7-weight",
            shape=(cfg.num_classes, embedding.shape[1]),
            dtype=embedding.dtype,
            initializer=_get_initializer(),
            regularizer=weight_regularizer,
            trainable=trainable,
            model_name="weight",
            distribute=fc7_model_distribute,
        )
        if cfg.partial_fc and cfg.model_parallel:
            logging.info(
                "Training is using model parallelism and optimized by partial_fc now."
            )

            size = cfg.device_num_per_node * cfg.num_nodes
            num_local = (cfg.num_classes + size - 1) // size
            num_sample = int(num_local * cfg.sample_rate)
            total_num_sample = num_sample * size
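            # e.g. (illustrative numbers, not from cfg): with num_classes=85744,
            # 8 devices in total and sample_rate=0.1, num_local=10718,
            # num_sample=1071 and total_num_sample=8568 classes are kept per step.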
            (
                mapped_label,
                sampled_label,
                sampled_weight,
            ) = flow.distributed_partial_fc_sample(
                weight=fc7_weight,
                label=labels,
                num_sample=total_num_sample,
            )
            labels = mapped_label
            fc7_weight = sampled_weight
        fc7_weight = flow.math.l2_normalize(input=fc7_weight,
                                            axis=1,
                                            epsilon=1e-10)
        fc1 = flow.math.l2_normalize(input=embedding, axis=1, epsilon=1e-10)
        fc7 = flow.matmul(a=fc1.with_distribute(fc1_distribute),
                          b=fc7_weight,
                          transpose_b=True)
        fc7 = fc7.with_distribute(fc7_data_distribute)

        if cfg.loss == "cosface":
            fc7 = (
                flow.combined_margin_loss(fc7, labels, m1=1, m2=0.0, m3=0.4) *
                64)
        elif cfg.loss == "arcface":
            fc7 = (
                flow.combined_margin_loss(fc7, labels, m1=1, m2=0.5, m3=0.0) *
                64)
        else:
            raise ValueError()

        fc7 = fc7.with_distribute(fc7_data_distribute)

        loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
            labels, fc7, name="softmax_loss")

        lr_scheduler = flow.optimizer.PiecewiseScalingScheduler(
            base_lr=cfg.lr,
            boundaries=cfg.lr_steps,
            scale=cfg.lr_scales,
            warmup=None)
        flow.optimizer.SGD(
            lr_scheduler,
            momentum=cfg.momentum if cfg.momentum > 0 else None,
        ).minimize(loss)

        return loss
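In the loss branch above, the margins follow the InsightFace combined-margin convention, where the target logit cos(theta) becomes cos(m1*theta + m2) - m3: m1=1, m2=0.0, m3=0.4 gives CosFace's additive cosine margin, m1=1, m2=0.5, m3=0.0 gives ArcFace's additive angular margin, and the factor 64 is the feature scale s.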
Example #6
    def get_symbol_train_job():
        if args.use_synthetic_data:
            (labels, images) = ofrecord_util.load_synthetic(args)
        else:
            labels, images = ofrecord_util.load_train_dataset(args)
        image_size = images.shape[1:-1]
        assert len(image_size) == 2, "The length of image size must be equal to 2."
        assert image_size[0] == image_size[1], "image_size[0] should be equal to image_size[1]."
        print("train image_size: ", image_size)
        embedding = eval(config.net_name).get_symbol(images)

        def _get_initializer():
            return flow.random_normal_initializer(mean=0.0, stddev=0.01)

        trainable = True
        if config.loss_name == "softmax":
            if args.model_parallel:
                print("Training is using model parallelism now.")
                labels = labels.with_distribute(flow.distribute.broadcast())
                fc1_distribute = flow.distribute.broadcast()
                fc7_data_distribute = flow.distribute.split(1)
                fc7_model_distribute = flow.distribute.split(0)
            else:
                fc1_distribute = flow.distribute.split(0)
                fc7_data_distribute = flow.distribute.split(0)
                fc7_model_distribute = flow.distribute.broadcast()

            fc7 = flow.layers.dense(
                inputs=embedding.with_distribute(fc1_distribute),
                units=config.num_classes,
                activation=None,
                use_bias=False,
                kernel_initializer=_get_initializer(),
                bias_initializer=None,
                trainable=trainable,
                name="fc7",
                model_distribute=fc7_model_distribute,
            )
            fc7 = fc7.with_distribute(fc7_data_distribute)
        elif config.loss_name == "margin_softmax":
            if args.model_parallel:
                print("Training is using model parallelism now.")
                labels = labels.with_distribute(flow.distribute.broadcast())
                fc1_distribute = flow.distribute.broadcast()
                fc7_data_distribute = flow.distribute.split(1)
                fc7_model_distribute = flow.distribute.split(0)
            else:
                fc1_distribute = flow.distribute.split(0)
                fc7_data_distribute = flow.distribute.split(0)
                fc7_model_distribute = flow.distribute.broadcast()
            fc7_weight = flow.get_variable(
                name="fc7-weight",
                shape=(config.num_classes, embedding.shape[1]),
                dtype=embedding.dtype,
                initializer=_get_initializer(),
                regularizer=None,
                trainable=trainable,
                model_name="weight",
                distribute=fc7_model_distribute,
            )
            if args.partial_fc and args.model_parallel:
                print(
                    "Training is using model parallelism and optimized by partial_fc now."
                )
                (
                    mapped_label,
                    sampled_label,
                    sampled_weight,
                ) = flow.distributed_partial_fc_sample(
                    weight=fc7_weight,
                    label=labels,
                    num_sample=args.total_num_sample,
                )
                labels = mapped_label
                fc7_weight = sampled_weight
            fc7_weight = flow.math.l2_normalize(input=fc7_weight,
                                                axis=1,
                                                epsilon=1e-10)
            fc1 = flow.math.l2_normalize(input=embedding,
                                         axis=1,
                                         epsilon=1e-10)
            fc7 = flow.matmul(a=fc1.with_distribute(fc1_distribute),
                              b=fc7_weight,
                              transpose_b=True)
            fc7 = fc7.with_distribute(fc7_data_distribute)
            fc7 = (flow.combined_margin_loss(fc7,
                                             labels,
                                             m1=config.loss_m1,
                                             m2=config.loss_m2,
                                             m3=config.loss_m3) *
                   config.loss_s)
            fc7 = fc7.with_distribute(fc7_data_distribute)
        else:
            raise NotImplementedError

        loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
            labels, fc7, name="softmax_loss")

        lr_scheduler = flow.optimizer.PiecewiseScalingScheduler(
            base_lr=args.lr,
            boundaries=args.lr_steps,
            scale=args.scales,
            warmup=None)
        flow.optimizer.SGDW(
            lr_scheduler,
            momentum=args.momentum if args.momentum > 0 else None,
            weight_decay=args.weight_decay).minimize(loss)

        return loss