def custom_relu_static_pe(func, device, dtype, np_x, use_func=True): paddle.enable_static() paddle.set_device(device) places = static.cpu_places() if device is 'cpu' else static.cuda_places() with static.scope_guard(static.Scope()): with static.program_guard(static.Program()): x = static.data(name='X', shape=[None, 8], dtype=dtype) x.stop_gradient = False out = func(x) if use_func else paddle.nn.functional.relu(x) static.append_backward(out) exe = static.Executor() exe.run(static.default_startup_program()) # in static mode, x data has been covered by out compiled_prog = static.CompiledProgram( static.default_main_program()).with_data_parallel( loss_name=out.name, places=places) out_v = exe.run(compiled_prog, feed={'X': np_x}, fetch_list=[out.name]) paddle.disable_static() return out_v
def run_inference(drop_last): loader = paddle.io.DataLoader.from_generator(feed_list=[x], capacity=8, drop_last=drop_last) loader.set_batch_generator(batch_generator, static.cpu_places()) exe = static.Executor(paddle.CPUPlace()) prog = static.CompiledProgram(static.default_main_program()) prog = prog.with_data_parallel() result = [] for data in loader(): each_ret, = exe.run(prog, feed=data, fetch_list=[y]) result.extend(each_ret) return result
def test_relu2_static(device, dtype, use_custom=True): paddle.enable_static() paddle.set_device(device) with static.scope_guard(static.Scope()): with static.program_guard(static.Program()): x = static.data(name='X', shape=[None, 8], dtype=dtype) x.stop_gradient = False out = custom_relu_op_rf.relu2( x) if use_custom else paddle.nn.functional.relu(x) static.append_backward(out) print(static.default_main_program()) places = static.cuda_places() print(places) exe = static.Executor() compiled_prog = static.CompiledProgram( static.default_main_program()).with_data_parallel( loss_name=out.name, places=static.cuda_places()) x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) out, = exe.run(compiled_prog, feed={'X': x}, fetch_list=[out.name]) print(out)
image = static.data(name='image', shape=[None, 784], dtype='float32') label = static.data(name='label', shape=[None, 1], dtype='int64') # Define DataLoader loader = paddle.io.DataLoader.from_generator(feed_list=[image, label], capacity=16, iterable=ITERABLE) # Define network loss = simple_net(image, label) # Set data source of DataLoader # # If DataLoader is iterable, places must be given and the number of places must be the same with device number. # - If you are using GPU, call `paddle.static.cuda_places()` to get all GPU places. # - If you are using CPU, call `paddle.static.cpu_places()` to get all CPU places. # # If DataLoader is not iterable, places can be None. places = static.cuda_places() if USE_GPU else static.cpu_places() set_data_source(loader, places) exe = static.Executor(places[0]) exe.run(static.default_startup_program()) prog = static.CompiledProgram( static.default_main_program()).with_data_parallel(loss_name=loss.name) if loader.iterable: train_iterable(exe, prog, loss, loader) else: train_non_iterable(exe, prog, loss, loader)
def search_mobilenetv2(config, args, image_size, is_server=True): places = static.cuda_places() if args.use_gpu else static.cpu_places() place = places[0] if is_server: ### start a server and a client rl_nas = RLNAS(key='lstm', configs=config, is_sync=False, server_addr=(args.server_address, args.port), controller_batch_size=1, controller_decay_steps=1000, controller_decay_rate=0.8, lstm_num_layers=1, hidden_size=10, temperature=1.0) else: ### start a client rl_nas = RLNAS(key='lstm', configs=config, is_sync=False, server_addr=(args.server_address, args.port), lstm_num_layers=1, hidden_size=10, temperature=1.0, controller_batch_size=1, controller_decay_steps=1000, controller_decay_rate=0.8, is_server=False) image_shape = [3, image_size, image_size] if args.data == 'cifar10': transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])]) train_dataset = paddle.vision.datasets.Cifar10(mode='train', transform=transform, backend='cv2') val_dataset = paddle.vision.datasets.Cifar10(mode='test', transform=transform, backend='cv2') elif args.data == 'imagenet': train_dataset = imagenet_reader.ImageNetDataset(mode='train') val_dataset = imagenet_reader.ImageNetDataset(mode='val') for step in range(args.search_steps): archs = rl_nas.next_archs(1)[0][0] train_program = static.Program() test_program = static.Program() startup_program = static.Program() train_loader, avg_cost, acc_top1, acc_top5 = build_program( train_program, startup_program, image_shape, train_dataset, archs, args, places) test_loader, test_avg_cost, test_acc_top1, test_acc_top5 = build_program( test_program, startup_program, image_shape, val_dataset, archs, args, place, is_test=True) test_program = test_program.clone(for_test=True) exe = static.Executor(place) exe.run(startup_program) build_strategy = static.BuildStrategy() train_compiled_program = static.CompiledProgram( train_program).with_data_parallel(loss_name=avg_cost.name, build_strategy=build_strategy) for epoch_id in range(args.retain_epoch): for batch_id, data in enumerate(train_loader()): fetches = [avg_cost.name] s_time = time.time() outs = exe.run(train_compiled_program, feed=data, fetch_list=fetches)[0] batch_time = time.time() - s_time if batch_id % 10 == 0: _logger.info( 'TRAIN: steps: {}, epoch: {}, batch: {}, cost: {}, batch_time: {}ms' .format(step, epoch_id, batch_id, outs[0], batch_time)) reward = [] for batch_id, data in enumerate(test_loader()): test_fetches = [ test_avg_cost.name, test_acc_top1.name, test_acc_top5.name ] batch_reward = exe.run(test_program, feed=data, fetch_list=test_fetches) reward_avg = np.mean(np.array(batch_reward), axis=1) reward.append(reward_avg) _logger.info( 'TEST: step: {}, batch: {}, avg_cost: {}, acc_top1: {}, acc_top5: {}' .format(step, batch_id, batch_reward[0], batch_reward[1], batch_reward[2])) finally_reward = np.mean(np.array(reward), axis=0) _logger.info( 'FINAL TEST: avg_cost: {}, acc_top1: {}, acc_top5: {}'.format( finally_reward[0], finally_reward[1], finally_reward[2])) rl_nas.reward(np.float32(finally_reward[1]))
import os import paddle import paddle.static as static paddle.enable_static() os.environ['CPU_NUM'] = str(2) places = static.cpu_places() data = static.data(name="x", shape=[None, 1], dtype="float32") hidden = static.nn.fc(input=data, size=10) loss = paddle.mean(hidden) paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) build_strategy = static.BuildStrategy() build_strategy.enable_inplace = True build_strategy.memory_optimize = True build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce program = static.CompiledProgram(static.default_main_program()) program = program.with_data_parallel(loss_name=loss.name, build_strategy=build_strategy, places=places)
# failed by an exception. if not use_cuda: os.environ['CPU_NUM'] = str(2) exe = static.Executor(place) data = static.data(name='X', shape=[None, 1], dtype='float32') hidden = static.nn.fc(input=data, size=10) loss = paddle.mean(hidden) test_program = static.default_main_program().clone(for_test=True) paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) exe.run(static.default_startup_program()) compiled_train_prog = static.CompiledProgram( static.default_main_program()).with_data_parallel(loss_name=loss.name, places=parallel_places) # NOTE: if not set share_vars_from=compiled_train_prog, # the parameters used in test process are different with # the parameters used by train process compiled_test_prog = static.CompiledProgram(test_program).with_data_parallel( share_vars_from=compiled_train_prog, places=parallel_places) train_data = numpy.random.random(size=(10, 1)).astype('float32') loss_data, = exe.run(compiled_train_prog, feed={"X": train_data}, fetch_list=[loss.name]) test_data = numpy.random.random(size=(10, 1)).astype('float32') loss_data, = exe.run(compiled_test_prog, feed={"X": test_data}, fetch_list=[loss.name])
def search_mobilenetv2(config, args, image_size, is_server=True): places = static.cuda_places() if args.use_gpu else static.cpu_places() place = places[0] if is_server: ### start a server and a client rl_nas = RLNAS( key='ddpg', configs=config, is_sync=False, obs_dim=26, ### step + length_of_token server_addr=(args.server_address, args.port)) else: ### start a client rl_nas = RLNAS(key='ddpg', configs=config, is_sync=False, obs_dim=26, server_addr=(args.server_address, args.port), is_server=False) image_shape = [3, image_size, image_size] if args.data == 'cifar10': transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])]) train_dataset = paddle.vision.datasets.Cifar10(mode='train', transform=transform, backend='cv2') val_dataset = paddle.vision.datasets.Cifar10(mode='test', transform=transform, backend='cv2') elif args.data == 'imagenet': train_dataset = imagenet_reader.ImageNetDataset(mode='train') val_dataset = imagenet_reader.ImageNetDataset(mode='val') for step in range(args.search_steps): if step == 0: action_prev = [1. for _ in rl_nas.range_tables] else: action_prev = rl_nas.tokens[0] obs = [step] obs.extend(action_prev) archs = rl_nas.next_archs(obs=obs)[0][0] train_program = static.Program() test_program = static.Program() startup_program = static.Program() train_loader, avg_cost, acc_top1, acc_top5 = build_program( train_program, startup_program, image_shape, train_dataset, archs, args, places) test_loader, test_avg_cost, test_acc_top1, test_acc_top5 = build_program( test_program, startup_program, image_shape, val_dataset, archs, args, place, is_test=True) test_program = test_program.clone(for_test=True) exe = static.Executor(place) exe.run(startup_program) build_strategy = static.BuildStrategy() train_compiled_program = static.CompiledProgram( train_program).with_data_parallel(loss_name=avg_cost.name, build_strategy=build_strategy) for epoch_id in range(args.retain_epoch): for batch_id, data in enumerate(train_loader()): fetches = [avg_cost.name] s_time = time.time() outs = exe.run(train_compiled_program, feed=data, fetch_list=fetches)[0] batch_time = time.time() - s_time if batch_id % 10 == 0: _logger.info( 'TRAIN: steps: {}, epoch: {}, batch: {}, cost: {}, batch_time: {}ms' .format(step, epoch_id, batch_id, outs[0], batch_time)) reward = [] for batch_id, data in enumerate(test_loader()): test_fetches = [ test_avg_cost.name, test_acc_top1.name, test_acc_top5.name ] batch_reward = exe.run(test_program, feed=data, fetch_list=test_fetches) reward_avg = np.mean(np.array(batch_reward), axis=1) reward.append(reward_avg) _logger.info( 'TEST: step: {}, batch: {}, avg_cost: {}, acc_top1: {}, acc_top5: {}' .format(step, batch_id, batch_reward[0], batch_reward[1], batch_reward[2])) finally_reward = np.mean(np.array(reward), axis=0) _logger.info( 'FINAL TEST: avg_cost: {}, acc_top1: {}, acc_top5: {}'.format( finally_reward[0], finally_reward[1], finally_reward[2])) obs = np.expand_dims(obs, axis=0).astype('float32') actions = rl_nas.tokens obs_next = [step + 1] obs_next.extend(actions[0]) obs_next = np.expand_dims(obs_next, axis=0).astype('float32') if step == args.search_steps - 1: terminal = np.expand_dims([True], axis=0).astype(np.bool) else: terminal = np.expand_dims([False], axis=0).astype(np.bool) rl_nas.reward(np.expand_dims(np.float32(finally_reward[1]), axis=0), obs=obs, actions=actions.astype('float32'), obs_next=obs_next, terminal=terminal) if step == 2: sys.exit(0)
def test_search_result(tokens, image_size, args, config): places = static.cuda_places() if args.use_gpu else static.cpu_places() place = places[0] sa_nas = SANAS(config, server_addr=(args.server_address, args.port), search_steps=args.search_steps, is_server=True) image_shape = [3, image_size, image_size] if args.data == 'cifar10': transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])]) train_dataset = paddle.vision.datasets.Cifar10(mode='train', transform=transform, backend='cv2') val_dataset = paddle.vision.datasets.Cifar10(mode='test', transform=transform, backend='cv2') elif args.data == 'imagenet': train_dataset = imagenet_reader.ImageNetDataset(mode='train') val_dataset = imagenet_reader.ImageNetDataset(mode='val') archs = sa_nas.tokens2arch(tokens)[0] train_program = static.Program() test_program = static.Program() startup_program = static.Program() train_loader, avg_cost, acc_top1, acc_top5 = build_program( train_program, startup_program, image_shape, train_dataset, archs, args, places) current_flops = flops(train_program) print('current_flops: {}'.format(current_flops)) test_loader, test_avg_cost, test_acc_top1, test_acc_top5 = build_program( test_program, startup_program, image_shape, val_dataset, archs, args, place, is_test=True) test_program = test_program.clone(for_test=True) exe = static.Executor(place) exe.run(startup_program) build_strategy = static.BuildStrategy() train_compiled_program = static.CompiledProgram( train_program).with_data_parallel(loss_name=avg_cost.name, build_strategy=build_strategy) for epoch_id in range(args.retain_epoch): for batch_id, data in enumerate(train_loader()): fetches = [avg_cost.name] s_time = time.time() outs = exe.run(train_compiled_program, feed=data, fetch_list=fetches)[0] batch_time = time.time() - s_time if batch_id % 10 == 0: _logger.info( 'TRAIN: epoch: {}, batch: {}, cost: {}, batch_time: {}ms'. format(epoch_id, batch_id, outs[0], batch_time)) reward = [] for batch_id, data in enumerate(test_loader()): test_fetches = [ test_avg_cost.name, test_acc_top1.name, test_acc_top5.name ] batch_reward = exe.run(test_program, feed=data, fetch_list=test_fetches) reward_avg = np.mean(np.array(batch_reward), axis=1) reward.append(reward_avg) _logger.info( 'TEST: batch: {}, avg_cost: {}, acc_top1: {}, acc_top5: {}'. format(batch_id, batch_reward[0], batch_reward[1], batch_reward[2])) finally_reward = np.mean(np.array(reward), axis=0) _logger.info( 'FINAL TEST: avg_cost: {}, acc_top1: {}, acc_top5: {}'.format( finally_reward[0], finally_reward[1], finally_reward[2]))
# failed by an exception. if not use_cuda: os.environ['CPU_NUM'] = str(2) places = static.cpu_places() else: places = static.cuda_places() data = static.data(name='X', shape=[None, 1], dtype='float32') hidden = static.nn.fc(input=data, size=10) loss = paddle.mean(hidden) paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) exe.run(static.default_startup_program()) build_strategy = static.BuildStrategy() build_strategy.gradient_scale_strategy = \ static.BuildStrategy.GradientScaleStrategy.Customized compiled_prog = static.CompiledProgram( static.default_main_program()).with_data_parallel( loss_name=loss.name, build_strategy=build_strategy, places=places) dev_count = len(places) x = numpy.random.random(size=(10, 1)).astype('float32') loss_grad = numpy.ones((dev_count)).astype("float32") * 0.01 loss_grad_name = loss.name + "@GRAD" loss_data = exe.run(compiled_prog, feed={ "X": x, loss_grad_name: loss_grad }, fetch_list=[loss.name, loss_grad_name])
def final_test(config, args, image_size, token=None): assert token != None, "If you want to start a final experiment, you must input a token." places = static.cuda_places() if args.use_gpu else static.cpu_places() place = places[0] sa_nas = SANAS(config, server_addr=(args.server_address, args.port), is_server=True) image_shape = [3, image_size, image_size] archs = sa_nas.tokens2arch(token)[0] train_program = static.Program() test_program = static.Program() startup_program = static.Program() train_fetch_list, (data, label), train_loader = build_program(train_program, startup_program, image_shape, archs, args, is_train=True) current_params = count_parameters_in_MB( train_program.global_block().all_parameters(), 'cifar10') _logger.info('current_params: {}M'.format(current_params)) test_fetch_list, _, test_loader = build_program(test_program, startup_program, image_shape, archs, args, is_train=False) test_program = test_program.clone(for_test=True) exe = static.Executor(place) exe.run(startup_program) train_reader = reader.train_valid(batch_size=args.batch_size, is_train=True, is_shuffle=True) test_reader = reader.train_valid(batch_size=args.batch_size, is_train=False, is_shuffle=False) train_loader.set_batch_generator(train_reader, places=place) test_loader.set_batch_generator(test_reader, places=place) build_strategy = static.BuildStrategy() train_compiled_program = static.CompiledProgram( train_program).with_data_parallel(loss_name=train_fetch_list[0].name, build_strategy=build_strategy) valid_top1_list = [] for epoch_id in range(args.retain_epoch): train_top1 = train(train_compiled_program, exe, epoch_id, train_loader, train_fetch_list, args) _logger.info("TRAIN: Epoch {}, train_acc {:.6f}".format( epoch_id, train_top1)) valid_top1 = valid(test_program, exe, epoch_id, test_loader, test_fetch_list, args) _logger.info("TEST: Epoch {}, valid_acc {:.6f}".format( epoch_id, valid_top1)) valid_top1_list.append(valid_top1) output_dir = os.path.join('darts_output', str(epoch_id)) if not os.path.exists(output_dir): os.makedirs(output_dir) static.save_inference_model(output_dir, [data], test_fetch_list, exe)
def search(config, args, image_size, is_server=True): places = static.cuda_places() if args.use_gpu else static.cpu_places() place = places[0] if is_server: ### start a server and a client sa_nas = SANAS(config, server_addr=(args.server_address, args.port), search_steps=args.search_steps, is_server=True) else: ### start a client sa_nas = SANAS(config, server_addr=(args.server_address, args.port), init_temperature=init_temperature, is_server=False) image_shape = [3, image_size, image_size] for step in range(args.search_steps): archs = sa_nas.next_archs()[0] train_program = static.Program() test_program = static.Program() startup_program = static.Program() train_fetch_list, _, train_loader = build_program(train_program, startup_program, image_shape, archs, args, is_train=True) current_params = count_parameters_in_MB( train_program.global_block().all_parameters(), 'cifar10') _logger.info('step: {}, current_params: {}M'.format( step, current_params)) if current_params > float(3.77): continue test_fetch_list, _, test_loader = build_program(test_program, startup_program, image_shape, archs, args, is_train=False) test_program = test_program.clone(for_test=True) exe = static.Executor(place) exe.run(startup_program) train_reader = reader.train_valid(batch_size=args.batch_size, is_train=True, is_shuffle=True) test_reader = reader.train_valid(batch_size=args.batch_size, is_train=False, is_shuffle=False) train_loader.set_batch_generator(train_reader, places=place) test_loader.set_batch_generator(test_reader, places=place) build_strategy = static.BuildStrategy() train_compiled_program = static.CompiledProgram( train_program).with_data_parallel( loss_name=train_fetch_list[0].name, build_strategy=build_strategy) valid_top1_list = [] for epoch_id in range(args.retain_epoch): train_top1 = train(train_compiled_program, exe, epoch_id, train_loader, train_fetch_list, args) _logger.info("TRAIN: step: {}, Epoch {}, train_acc {:.6f}".format( step, epoch_id, train_top1)) valid_top1 = valid(test_program, exe, epoch_id, test_loader, test_fetch_list, args) _logger.info("TEST: Epoch {}, valid_acc {:.6f}".format( epoch_id, valid_top1)) valid_top1_list.append(valid_top1) sa_nas.reward(float(valid_top1_list[-1] + valid_top1_list[-2]) / 2)
def search_mobilenetv2_block(config, args, image_size): image_shape = [3, image_size, image_size] transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])]) if args.data == 'cifar10': train_dataset = paddle.vision.datasets.Cifar10(mode='train', transform=transform, backend='cv2') val_dataset = paddle.vision.datasets.Cifar10(mode='test', transform=transform, backend='cv2') elif args.data == 'imagenet': train_dataset = imagenet_reader.ImageNetDataset(mode='train') val_dataset = imagenet_reader.ImageNetDataset(mode='val') places = static.cuda_places() if args.use_gpu else static.cpu_places() place = places[0] if args.is_server: sa_nas = SANAS(config, server_addr=(args.server_address, args.port), search_steps=args.search_steps, is_server=True) else: sa_nas = SANAS(config, server_addr=(args.server_address, args.port), search_steps=args.search_steps, is_server=False) for step in range(args.search_steps): archs = sa_nas.next_archs()[0] train_program = static.Program() test_program = static.Program() startup_program = static.Program() with static.program_guard(train_program, startup_program): data_shape = [None] + image_shape data = static.data(name='data', shape=data_shape, dtype='float32') label = static.data(name='label', shape=[None, 1], dtype='int64') if args.data == 'cifar10': paddle.assign(paddle.reshape(label, [-1, 1]), label) train_loader = paddle.io.DataLoader(train_dataset, places=places, feed_list=[data, label], drop_last=True, batch_size=args.batch_size, return_list=False, shuffle=True, use_shared_memory=True, num_workers=4) val_loader = paddle.io.DataLoader(val_dataset, places=place, feed_list=[data, label], drop_last=False, batch_size=args.batch_size, return_list=False, shuffle=False) data = conv_bn_layer(input=data, num_filters=32, filter_size=3, stride=2, padding='SAME', act='relu6', name='mobilenetv2_conv1') data = archs(data)[0] data = conv_bn_layer(input=data, num_filters=1280, filter_size=1, stride=1, padding='SAME', act='relu6', name='mobilenetv2_last_conv') data = F.adaptive_avg_pool2d(data, output_size=[1, 1], name='mobilenetv2_last_pool') output = static.nn.fc( x=data, size=args.class_dim, weight_attr=ParamAttr(name='mobilenetv2_fc_weights'), bias_attr=ParamAttr(name='mobilenetv2_fc_offset')) softmax_out = F.softmax(output) cost = F.cross_entropy(softmax_out, label=label) avg_cost = paddle.mean(cost) acc_top1 = paddle.metric.accuracy(input=softmax_out, label=label, k=1) acc_top5 = paddle.metric.accuracy(input=softmax_out, label=label, k=5) test_program = train_program.clone(for_test=True) optimizer = paddle.optimizer.Momentum( learning_rate=0.1, momentum=0.9, weight_decay=paddle.regularizer.L2Decay(1e-4)) optimizer.minimize(avg_cost) current_flops = flops(train_program) print('step: {}, current_flops: {}'.format(step, current_flops)) if current_flops > int(321208544): continue exe = static.Executor(place) exe.run(startup_program) build_strategy = static.BuildStrategy() train_compiled_program = static.CompiledProgram( train_program).with_data_parallel(loss_name=avg_cost.name, build_strategy=build_strategy) for epoch_id in range(args.retain_epoch): for batch_id, data in enumerate(train_loader()): fetches = [avg_cost.name] s_time = time.time() outs = exe.run(train_compiled_program, feed=data, fetch_list=fetches)[0] batch_time = time.time() - s_time if batch_id % 10 == 0: _logger.info( 'TRAIN: steps: {}, epoch: {}, batch: {}, cost: {}, batch_time: {}ms' .format(step, epoch_id, batch_id, outs[0], batch_time)) reward = [] for batch_id, data in enumerate(val_loader()): test_fetches = [avg_cost.name, acc_top1.name, acc_top5.name] batch_reward = exe.run(test_program, feed=data, fetch_list=test_fetches) reward_avg = np.mean(np.array(batch_reward), axis=1) reward.append(reward_avg) _logger.info( 'TEST: step: {}, batch: {}, avg_cost: {}, acc_top1: {}, acc_top5: {}' .format(step, batch_id, batch_reward[0], batch_reward[1], batch_reward[2])) finally_reward = np.mean(np.array(reward), axis=0) _logger.info( 'FINAL TEST: avg_cost: {}, acc_top1: {}, acc_top5: {}'.format( finally_reward[0], finally_reward[1], finally_reward[2])) sa_nas.reward(float(finally_reward[1]))