# Free variables (batch_size, num_classes, num_sample, device_type, label_type,
# test_global_storage, type_name_to_flow_type) come from the enclosing test
# harness; oft is oneflow.typing.
def PartialFcJob(
    labels: oft.Numpy.Placeholder(
        (batch_size,), dtype=type_name_to_flow_type[label_type]
    )
):
    with flow.scope.placement(device_type, "0:0"):
        x = flow.get_variable(
            "x-weight",
            shape=(num_classes, 128),
            dtype=flow.float,
            initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
            trainable=True,
        )
    with flow.scope.placement(device_type, "0:0-3"):
        labels_distribute = flow.distribute.broadcast()
        weight_distribute = flow.distribute.split(0)
        (
            mapped_label,
            sampled_label,
            sampled_weight,
        ) = flow.distributed_partial_fc_sample(
            weight=x.with_distribute(weight_distribute),
            label=labels.with_distribute(labels_distribute),
            num_sample=num_sample,
        )
    with flow.scope.placement(device_type, "0:0"):
        sampled_weight = flow.identity(sampled_weight)
        loss = flow.math.square(sampled_weight)
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [1e-4]), momentum=0
        ).minimize(loss)

        flow.watch(x, test_global_storage.Setter("x"))
        flow.watch_diff(x, test_global_storage.Setter("x_diff"))
        flow.watch_diff(
            sampled_weight, test_global_storage.Setter("sampled_weight_diff")
        )
    return x, mapped_label, sampled_label, sampled_weight
def test_partial_fc(test_case):
    p = flow.env.all_device_placement("cuda")
    w = flow.randn(50000, 128, placement=p, sbp=flow.sbp.broadcast)
    label = flow.randint(0, 50000, (512,), placement=p, sbp=flow.sbp.broadcast)
    num_sample = 5000
    out = flow.distributed_partial_fc_sample(w, label, num_sample)
    test_case.assertTrue(out[0].shape == flow.Size([512]))
    test_case.assertTrue(out[1].shape == flow.Size([5000]))
    test_case.assertTrue(out[2].shape == flow.Size([5000, 128]))
def test_partial_fc(test_case):
    p = flow.env.all_device_placement("cuda")
    w = flow.randn(
        50000, 128, placement=p, sbp=flow.sbp.broadcast, requires_grad=True
    )
    label = flow.randint(0, 50000, (512,), placement=p, sbp=flow.sbp.broadcast)
    num_sample = 5000
    out = flow.distributed_partial_fc_sample(w, label, num_sample)
    test_case.assertTrue(out[0].shape == flow.Size([512]))
    test_case.assertTrue(out[1].shape == flow.Size([5000]))
    test_case.assertTrue(out[2].shape == flow.Size([5000, 128]))
    # test gradient function
    sample_weight = out[2]
    sample_weight.sum().backward()
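The three outputs above can be read as follows: out[0] (mapped_label) is the input label remapped into the index space of the sampled weight rows, out[1] (sampled_label) lists which original class indices were kept, and out[2] (sampled_weight) contains the corresponding rows of the weight matrix. A rough single-process sketch of that sampling behaviour, assuming positive classes are always kept and the remaining slots are filled with randomly chosen negative classes (the helper name and the use of NumPy are illustrative only, not OneFlow API):

import numpy as np

def partial_fc_sample_reference(weight, label, num_sample):
    # classes that appear in this batch are always kept
    positives = np.unique(label)
    negatives = np.setdiff1d(np.arange(weight.shape[0]), positives)
    pad = np.random.choice(negatives, num_sample - positives.size, replace=False)
    sampled_label = np.concatenate([positives, pad])   # kept class ids
    sampled_weight = weight[sampled_label]              # gathered weight rows
    # remap original labels into the index space of sampled_weight
    remap = {c: i for i, c in enumerate(sampled_label)}
    mapped_label = np.array([remap[c] for c in label])
    return mapped_label, sampled_label, sampled_weight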
def forward(self, x, label):
    x = flow.nn.functional.l2_normalize(input=x, dim=1, epsilon=1e-10)
    if self.partial_fc:
        (
            mapped_label,
            sampled_label,
            sampled_weight,
        ) = flow.distributed_partial_fc_sample(
            weight=self.weight,
            label=label,
            num_sample=self.total_num_sample,
        )
        label = mapped_label
        weight = sampled_weight
    else:
        weight = self.weight
    weight = flow.nn.functional.l2_normalize(input=weight, dim=1, epsilon=1e-10)
    x = flow.matmul(x, weight, transpose_b=True)
    if x.is_consistent:
        return x, label
    else:
        return x
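The forward pass above belongs to a margin-softmax head module whose constructor is not shown. A plausible minimal constructor, inferred from the attributes used in forward (self.weight, self.partial_fc, self.total_num_sample); the class name, parameter names, initialization scheme, and the total_num_sample formula are assumptions, not taken from the original code:

import oneflow as flow
import oneflow.nn as nn

class PartialFCHead(nn.Module):
    def __init__(self, embedding_size, num_classes, partial_fc=True, sample_rate=0.1):
        super().__init__()
        # fc7 weight matrix, one row per class (assumed init: scaled Gaussian)
        self.weight = nn.Parameter(flow.randn(num_classes, embedding_size) * 0.01)
        self.partial_fc = partial_fc
        # total number of classes sampled across all ranks (assumed formula)
        self.total_num_sample = int(num_classes * sample_rate)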
def get_symbol_train_job():
    if cfg.use_synthetic_data:
        (labels, images) = load_synthetic(cfg)
    else:
        labels, images = load_train_dataset(cfg)
    image_size = images.shape[2:]
    assert len(image_size) == 2, "The length of image size must be equal to 2."
    assert image_size[0] == image_size[1], "image_size[0] should be equal to image_size[1]."
    embedding = get_model(cfg.network, images, cfg)

    def _get_initializer():
        return flow.random_normal_initializer(mean=0.0, stddev=0.01)

    trainable = True
    if cfg.model_parallel and cfg.device_num_per_node > 1:
        logging.info("Training is using model parallelism now.")
        labels = labels.with_distribute(flow.distribute.broadcast())
        fc1_distribute = flow.distribute.broadcast()
        fc7_data_distribute = flow.distribute.split(1)
        fc7_model_distribute = flow.distribute.split(0)
    else:
        fc1_distribute = flow.distribute.split(0)
        fc7_data_distribute = flow.distribute.split(0)
        fc7_model_distribute = flow.distribute.broadcast()

    weight_regularizer = flow.regularizers.l2(0.0005)
    fc7_weight = flow.get_variable(
        name="fc7-weight",
        shape=(cfg.num_classes, embedding.shape[1]),
        dtype=embedding.dtype,
        initializer=_get_initializer(),
        regularizer=weight_regularizer,
        trainable=trainable,
        model_name="weight",
        distribute=fc7_model_distribute,
    )
    if cfg.partial_fc and cfg.model_parallel:
        logging.info(
            "Training is using model parallelism and optimized by partial_fc now."
        )
        size = cfg.device_num_per_node * cfg.num_nodes
        num_local = (cfg.num_classes + size - 1) // size
        num_sample = int(num_local * cfg.sample_rate)
        total_num_sample = num_sample * size
        (
            mapped_label,
            sampled_label,
            sampled_weight,
        ) = flow.distributed_partial_fc_sample(
            weight=fc7_weight,
            label=labels,
            num_sample=total_num_sample,
        )
        labels = mapped_label
        fc7_weight = sampled_weight

    fc7_weight = flow.math.l2_normalize(input=fc7_weight, axis=1, epsilon=1e-10)
    fc1 = flow.math.l2_normalize(input=embedding, axis=1, epsilon=1e-10)
    fc7 = flow.matmul(
        a=fc1.with_distribute(fc1_distribute), b=fc7_weight, transpose_b=True
    )
    fc7 = fc7.with_distribute(fc7_data_distribute)

    if cfg.loss == "cosface":
        fc7 = flow.combined_margin_loss(fc7, labels, m1=1, m2=0.0, m3=0.4) * 64
    elif cfg.loss == "arcface":
        fc7 = flow.combined_margin_loss(fc7, labels, m1=1, m2=0.5, m3=0.0) * 64
    else:
        raise ValueError()
    fc7 = fc7.with_distribute(fc7_data_distribute)

    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
        labels, fc7, name="softmax_loss"
    )
    lr_scheduler = flow.optimizer.PiecewiseScalingScheduler(
        base_lr=cfg.lr, boundaries=cfg.lr_steps, scale=cfg.lr_scales, warmup=None
    )
    flow.optimizer.SGD(
        lr_scheduler,
        momentum=cfg.momentum if cfg.momentum > 0 else None,
    ).minimize(loss)
    return loss
def get_symbol_train_job():
    if args.use_synthetic_data:
        (labels, images) = ofrecord_util.load_synthetic(args)
    else:
        labels, images = ofrecord_util.load_train_dataset(args)
    image_size = images.shape[1:-1]
    assert len(image_size) == 2, "The length of image size must be equal to 2."
    assert image_size[0] == image_size[1], "image_size[0] should be equal to image_size[1]."
    print("train image_size: ", image_size)
    embedding = eval(config.net_name).get_symbol(images)

    def _get_initializer():
        return flow.random_normal_initializer(mean=0.0, stddev=0.01)

    trainable = True
    if config.loss_name == "softmax":
        if args.model_parallel:
            print("Training is using model parallelism now.")
            labels = labels.with_distribute(flow.distribute.broadcast())
            fc1_distribute = flow.distribute.broadcast()
            fc7_data_distribute = flow.distribute.split(1)
            fc7_model_distribute = flow.distribute.split(0)
        else:
            fc1_distribute = flow.distribute.split(0)
            fc7_data_distribute = flow.distribute.split(0)
            fc7_model_distribute = flow.distribute.broadcast()
        fc7 = flow.layers.dense(
            inputs=embedding.with_distribute(fc1_distribute),
            units=config.num_classes,
            activation=None,
            use_bias=False,
            kernel_initializer=_get_initializer(),
            bias_initializer=None,
            trainable=trainable,
            name="fc7",
            model_distribute=fc7_model_distribute,
        )
        fc7 = fc7.with_distribute(fc7_data_distribute)
    elif config.loss_name == "margin_softmax":
        if args.model_parallel:
            print("Training is using model parallelism now.")
            labels = labels.with_distribute(flow.distribute.broadcast())
            fc1_distribute = flow.distribute.broadcast()
            fc7_data_distribute = flow.distribute.split(1)
            fc7_model_distribute = flow.distribute.split(0)
        else:
            fc1_distribute = flow.distribute.split(0)
            fc7_data_distribute = flow.distribute.split(0)
            fc7_model_distribute = flow.distribute.broadcast()
        fc7_weight = flow.get_variable(
            name="fc7-weight",
            shape=(config.num_classes, embedding.shape[1]),
            dtype=embedding.dtype,
            initializer=_get_initializer(),
            regularizer=None,
            trainable=trainable,
            model_name="weight",
            distribute=fc7_model_distribute,
        )
        if args.partial_fc and args.model_parallel:
            print(
                "Training is using model parallelism and optimized by partial_fc now."
            )
            (
                mapped_label,
                sampled_label,
                sampled_weight,
            ) = flow.distributed_partial_fc_sample(
                weight=fc7_weight,
                label=labels,
                num_sample=args.total_num_sample,
            )
            labels = mapped_label
            fc7_weight = sampled_weight
        fc7_weight = flow.math.l2_normalize(input=fc7_weight, axis=1, epsilon=1e-10)
        fc1 = flow.math.l2_normalize(input=embedding, axis=1, epsilon=1e-10)
        fc7 = flow.matmul(
            a=fc1.with_distribute(fc1_distribute), b=fc7_weight, transpose_b=True
        )
        fc7 = fc7.with_distribute(fc7_data_distribute)
        fc7 = (
            flow.combined_margin_loss(
                fc7, labels, m1=config.loss_m1, m2=config.loss_m2, m3=config.loss_m3
            )
            * config.loss_s
        )
        fc7 = fc7.with_distribute(fc7_data_distribute)
    else:
        raise NotImplementedError

    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
        labels, fc7, name="softmax_loss"
    )
    lr_scheduler = flow.optimizer.PiecewiseScalingScheduler(
        base_lr=args.lr, boundaries=args.lr_steps, scale=args.scales, warmup=None
    )
    flow.optimizer.SGDW(
        lr_scheduler,
        momentum=args.momentum if args.momentum > 0 else None,
        weight_decay=args.weight_decay,
    ).minimize(loss)
    return loss
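In this variant, args.total_num_sample is supplied directly rather than computed inside the job. One way to derive it, mirroring the per-rank computation in the config-driven job above (the argument names used to build world_size and the sample_ratio argument are assumptions about the argument parser, not taken from the original script):

# assumed argument names; only the arithmetic mirrors the earlier snippet
world_size = args.num_nodes * args.device_num_per_node
num_local = (config.num_classes + world_size - 1) // world_size
num_sample_per_rank = int(num_local * args.sample_ratio)
args.total_num_sample = num_sample_per_rank * world_size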