def __init__(self, resource_conf, down_rate):
    """Set up the Hyperband budgets and read the search space from config.

    Args:
        resource_conf: maximum budget for a single configuration, i.e. the
            maximum number of iterations one configuration may receive.
        down_rate: configuration downsampling rate (eta); the original
            comment cites 3 as the usual value.
    """
    # maximum budget for a single configuration
    self.R = resource_conf
    # configuration downsampling rate
    self.eta = down_rate

    # Controls how many runs there are: the largest s with eta ** s <= R.
    # floor(log(R, eta)) alone is one short for some exact powers because
    # of floating-point round-off (e.g. log(243, 3) evaluates to 4.999...),
    # so the estimate is checked against exact powers and corrected.
    s_max = floor(log(self.R, self.eta))
    if self.eta ** (s_max + 1) <= self.R:
        s_max += 1
    elif self.eta ** s_max > self.R:
        s_max -= 1
    self.s_max = s_max

    # maximum budget for all configurations
    self.B = (self.s_max + 1) * self.R

    # list of results
    self.results = []

    # bookkeeping for results; -np.inf is the portable spelling of the
    # np.NINF alias that was removed in NumPy 2.0
    self.counter = 0
    self.best_acc = -np.inf
    self.best_counter = -1

    # hyperparameter search space for the workload
    self.hp_model_arch = cfg_para.hyperband_model_type_list
    self.hp_batch_size = cfg_para.hyperband_batch_size_list
    self.hp_opt = cfg_para.hyperband_optimizer_list
    self.hp_learn_rate = cfg_para.hyperband_learn_rate_list
    self.hp_activation = cfg_para.hyperband_activation_list
    self.hp_random_seed = cfg_para.hyperband_random_seed

    # training dataset and its image geometry / class count
    self.hp_dataset = cfg_para.hyperband_train_dataset
    (self.img_width, self.img_height, self.num_channel,
     self.num_class) = load_dataset_para(self.hp_dataset)
def evaluate_pack_model(tf_sess, feature_ph, label_ph, pack_model):
    """Run every evaluation op in ``pack_model`` and collect the results.

    Args:
        tf_sess: session used to run the evaluation ops.
        feature_ph: placeholder fed with the evaluation features.
        label_ph: placeholder fed with the evaluation labels.
        pack_model: iterable of evaluation ops, one per packed model.

    Returns:
        A list with one accuracy value per op, in the order of ``pack_model``.
        For ``'imagenet'`` each value is the mean of the per-batch results;
        otherwise it is the result of a single run over the whole eval set.
    """
    print("start to evaluate")

    hyperband_dataset = cfg_para.hyperband_train_dataset
    img_width, img_height, _, _ = load_dataset_para(hyperband_dataset)
    feature_input, label_input = load_eval_dataset(hyperband_dataset)

    if hyperband_dataset != 'imagenet':
        # The whole evaluation set is fed in one run.
        return [
            tf_sess.run(eval_op,
                        feed_dict={
                            feature_ph: feature_input,
                            label_ph: label_input
                        }) for eval_op in pack_model
        ]

    # For imagenet, feature_input is a directory that is listed and loaded
    # batch by batch; samples past the last full batch are not evaluated.
    imagenet_batch_size_eval = 50
    num_batch_eval = label_input.shape[0] // imagenet_batch_size_eval
    test_image_list = sorted(os.listdir(feature_input))

    acc_pack = []
    for eval_op in pack_model:
        # Reset per model: a single accumulator shared across models would
        # leak earlier models' sums into every later model's average.
        acc_sum = 0
        for n in range(num_batch_eval):
            batch_offset = n * imagenet_batch_size_eval
            batch_end = batch_offset + imagenet_batch_size_eval
            eval_batch_list = test_image_list[batch_offset:batch_end]
            eval_feature_batch = load_imagenet_raw(feature_input,
                                                   eval_batch_list,
                                                   img_height, img_width)
            eval_label_batch = label_input[batch_offset:batch_end]
            acc_sum += tf_sess.run(eval_op,
                                   feed_dict={
                                       feature_ph: eval_feature_batch,
                                       label_ph: eval_label_batch
                                   })
        acc_pack.append(acc_sum / num_batch_eval)

    return acc_pack
def train_pack():
    """Build all configured models on shared placeholders and train them together.

    Every model listed in ``cfg_para.multi_*`` is built on the same feature
    and label placeholders, and all train ops are run in a single
    ``sess.run`` per step, with batches of ``max(batch_size_list)`` samples.
    Prints the overall training time and the average step time; the first
    step of each epoch is excluded from the step timing.
    """
    print('start training pack')

    rand_seed_pack = cfg_para.multi_rand_seed
    model_type_list = cfg_para.multi_model_type
    optimizer_list = cfg_para.multi_opt
    num_layer_list = cfg_para.multi_num_layer
    activation_list = cfg_para.multi_activation
    batch_size_list = cfg_para.multi_batch_size
    learning_rate_list = cfg_para.multi_learning_rate

    # Padding is only requested when the models use different batch sizes.
    is_batch_padding = len(set(batch_size_list)) != 1

    num_epoch = cfg_para.multi_num_epoch
    train_dataset = cfg_para.multi_train_dataset
    # NOTE(review): every other setting here comes from ``multi_*`` but this
    # flag reads ``single_use_tb_timeline`` -- confirm that is intended.
    use_tf_timeline = cfg_para.single_use_tb_timeline
    max_batch_size = max(batch_size_list)

    #################################################
    # load dataset
    #################################################
    img_width, img_height, num_channel, num_class = load_dataset_para(
        train_dataset)
    train_feature_input, train_label_input = load_train_dataset(train_dataset)

    #########################
    # build packed model
    #########################
    features = tf.placeholder(tf.float32,
                              [None, img_width, img_height, num_channel])
    labels = tf.placeholder(tf.int64, [None, num_class])

    # One distinct name per model, drawn without replacement from the pool.
    model_name_abbr = np.random.choice(rand_seed_pack,
                                       len(model_type_list),
                                       replace=False).tolist()
    train_op_pack = []
    for midx, mt in enumerate(model_type_list):
        dm = ModelImporter(mt,
                           str(model_name_abbr.pop()),
                           num_layer_list[midx],
                           img_height,
                           img_width,
                           num_channel,
                           num_class,
                           batch_size_list[midx],
                           optimizer_list[midx],
                           learning_rate_list[midx],
                           activation_list[midx],
                           batch_padding=is_batch_padding)
        model_entity = dm.get_model_entity()
        model_logit = model_entity.build(features, is_training=True)
        train_op_pack.append(model_entity.train(model_logit, labels))

    #########################
    # train packed model
    #########################
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    step_time = 0
    step_count = 0

    if train_dataset == 'imagenet':
        # For imagenet the feature input is a directory of image files.
        image_list = sorted(os.listdir(train_feature_input))

    overall_time_start = timer()
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        # Samples past the last full batch are not used.
        num_batch = train_label_input.shape[0] // max_batch_size
        for e in range(num_epoch):
            for i in range(num_batch):
                print('epoch %d / %d, step %d / %d' %
                      (e + 1, num_epoch, i + 1, num_batch))

                # Step 0 of every epoch is left out of the timing.
                if i != 0:
                    start_time = timer()

                batch_offset = i * max_batch_size
                batch_end = (i + 1) * max_batch_size
                if train_dataset == 'imagenet':
                    batch_list = image_list[batch_offset:batch_end]
                    train_feature_batch = load_imagenet_raw(
                        train_feature_input, batch_list, img_height,
                        img_width)
                else:
                    train_feature_batch = train_feature_input[
                        batch_offset:batch_end]
                train_label_batch = train_label_input[batch_offset:batch_end]

                feed = {
                    features: train_feature_batch,
                    labels: train_label_batch
                }
                if use_tf_timeline:
                    profile_path = cfg_path.profile_path
                    run_options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                    sess.run(train_op_pack,
                             feed_dict=feed,
                             options=run_options,
                             run_metadata=run_metadata)
                    trace = timeline.Timeline(
                        step_stats=run_metadata.step_stats)
                    trace_path = (profile_path + '/' +
                                  '-'.join(map(str, set(model_type_list))) +
                                  '-' + str(len(model_type_list)) +
                                  '-'.join(map(str, set(batch_size_list))) +
                                  '-' + str(i) + '.json')
                    # Context manager so the trace file is flushed and
                    # closed on every step instead of leaking the handle.
                    with open(trace_path, 'w') as trace_file:
                        trace_file.write(
                            trace.generate_chrome_trace_format(
                                show_dataflow=True, show_memory=True))
                else:
                    sess.run(train_op_pack, feed_dict=feed)

                if i != 0:
                    end_time = timer()
                    dur_time = end_time - start_time
                    print("step time:", dur_time)
                    step_time += dur_time
                    step_count += 1

    overall_time_end = timer()
    overall_time = overall_time_end - overall_time_start

    # No step is timed when there is at most one batch per epoch (or no
    # epoch); report 0.0 instead of raising ZeroDivisionError.
    avg_step_ms = (step_time / step_count * 1000) if step_count else 0.0
    print(
        f'overall training time (s):{overall_time}, average step time (ms):{avg_step_ms}'
    )
def train_sequential():
    """Build every configured model, then train them one after another.

    Each model gets its own feature/label placeholders, registered in the
    module globals under ``features<idx>`` / ``labels<idx>``. Each train op
    is then handed to ``train_model`` in a separate ``Process`` that is
    joined before the next one starts. Prints the total wall-clock time.
    """
    print('start training sequential')

    seed_pool = cfg_para.multi_rand_seed
    model_types = cfg_para.multi_model_type
    optimizers = cfg_para.multi_opt
    layer_counts = cfg_para.multi_num_layer
    activations = cfg_para.multi_activation
    batch_sizes = cfg_para.multi_batch_size
    learning_rates = cfg_para.multi_learning_rate
    dataset_name = cfg_para.multi_train_dataset

    # dataset geometry and class count
    img_width, img_height, num_channel, num_class = load_dataset_para(
        dataset_name)

    # One placeholder pair per model, stored in the module namespace so
    # that train_model can look them up by key.
    module_ns = globals()
    model_total = len(model_types)
    for slot in range(model_total):
        module_ns[f'features{slot}'] = tf.placeholder(
            tf.float32, [None, img_width, img_height, num_channel])
        module_ns[f'labels{slot}'] = tf.placeholder(tf.int64,
                                                    [None, num_class])

    # Distinct model names drawn without replacement from the pool.
    name_pool = np.random.choice(seed_pool, model_total,
                                 replace=False).tolist()

    train_ops = []
    for slot, model_type in enumerate(model_types):
        importer = ModelImporter(model_type,
                                 str(name_pool.pop()),
                                 layer_counts[slot],
                                 img_width,
                                 img_height,
                                 num_channel,
                                 num_class,
                                 batch_sizes[slot],
                                 optimizers[slot],
                                 learning_rates[slot],
                                 activations[slot],
                                 batch_padding=False)
        entity = importer.get_model_entity()
        logit = entity.build(module_ns[f'features{slot}'], is_training=True)
        train_ops.append(entity.train(logit, module_ns[f'labels{slot}']))

    # Train the models strictly one at a time.
    clock_begin = timer()
    for slot, op in enumerate(train_ops):
        worker = Process(target=train_model,
                         args=(op, batch_sizes[slot], model_types[slot],
                               slot, module_ns))
        worker.start()
        worker.join()
    dur_time = timer() - clock_begin

    print(f'total training time(s): {dur_time}')
def train_model(train_step_arg, batch_size_arg, model_type_arg, tidx_arg,
                global_args):
    """Train one already-built model for the configured number of epochs.

    Args:
        train_step_arg: train op to run at every step.
        batch_size_arg: number of samples per step.
        model_type_arg: model type, used only in the trace file name.
        tidx_arg: index of the model; selects its placeholders.
        global_args: mapping that holds the placeholders under the keys
            ``'features<tidx_arg>'`` and ``'labels<tidx_arg>'``.
    """
    train_dataset = cfg_para.multi_train_dataset
    num_epoch = cfg_para.multi_num_epoch
    use_tf_timeline = cfg_para.multi_use_tb_timeline
    use_cpu = cfg_para.multi_use_cpu
    train_device = '/cpu:0' if use_cpu else '/gpu:0'

    # Only the image size is needed here (for loading raw imagenet files).
    img_width, img_height, _, _ = load_dataset_para(train_dataset)
    train_feature_input, train_label_input = load_train_dataset(train_dataset)

    # The placeholders of this model do not change between steps.
    features_ph = global_args['features' + str(tidx_arg)]
    labels_ph = global_args['labels' + str(tidx_arg)]

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    if train_dataset == 'imagenet':
        # For imagenet the feature input is a directory of image files.
        image_list = sorted(os.listdir(train_feature_input))

    with tf.device(train_device):
        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            # Samples past the last full batch are not used.
            num_batch = train_label_input.shape[0] // batch_size_arg
            for e in range(num_epoch):
                for i in range(num_batch):
                    print('epoch %d / %d, step %d / %d' %
                          (e + 1, num_epoch, i + 1, num_batch))

                    batch_offset = i * batch_size_arg
                    batch_end = (i + 1) * batch_size_arg
                    if train_dataset == 'imagenet':
                        batch_list = image_list[batch_offset:batch_end]
                        feature_batch = load_imagenet_raw(
                            train_feature_input, batch_list, img_height,
                            img_width)
                    else:
                        feature_batch = train_feature_input[
                            batch_offset:batch_end]
                    label_batch = train_label_input[batch_offset:batch_end]

                    feed = {
                        features_ph: feature_batch,
                        labels_ph: label_batch
                    }
                    if use_tf_timeline:
                        profile_path = cfg_path.profile_path
                        run_options = tf.RunOptions(
                            trace_level=tf.RunOptions.FULL_TRACE)
                        run_metadata = tf.RunMetadata()
                        sess.run(train_step_arg,
                                 feed_dict=feed,
                                 options=run_options,
                                 run_metadata=run_metadata)
                        trace = timeline.Timeline(
                            step_stats=run_metadata.step_stats)
                        trace_path = (profile_path + '/' +
                                      str(model_type_arg) + '-' +
                                      str(batch_size_arg) + '-' + str(i) +
                                      '.json')
                        # Context manager so the trace file is flushed and
                        # closed on every step instead of leaking the handle.
                        with open(trace_path, 'w') as trace_file:
                            trace_file.write(
                                trace.generate_chrome_trace_format(
                                    show_dataflow=True, show_memory=True))
                    else:
                        sess.run(train_step_arg, feed_dict=feed)
def train_single():
    """Build, train and evaluate the single model described by ``cfg_para.single_*``.

    Trains for ``single_num_epoch`` epochs, evaluates once on the evaluation
    set after training, and prints the evaluation accuracy, the overall
    time and the average step time. The first step of each epoch is
    excluded from the step timing.
    """
    print('start training single')

    rand_seed = cfg_para.single_rand_seed
    num_epoch = cfg_para.single_num_epoch
    model_type = cfg_para.single_model_type
    num_layer = cfg_para.single_num_layer
    learning_rate = cfg_para.single_learning_rate
    activation = cfg_para.single_activation
    batch_size = cfg_para.single_batch_size
    optimizer = cfg_para.single_opt
    train_dataset = cfg_para.single_train_dataset
    use_tf_timeline = cfg_para.single_use_tb_timeline
    use_cpu = cfg_para.single_use_cpu
    train_device = '/cpu:0' if use_cpu else '/gpu:0'

    ##########################################
    # load dataset
    ##########################################
    img_width, img_height, num_channel, num_class = load_dataset_para(
        train_dataset)
    train_feature_input, train_label_input = load_train_dataset(train_dataset)
    eval_feature_input, eval_label_input = load_eval_dataset(train_dataset)

    ##########################################
    # build model
    ##########################################
    feature_ph = tf.placeholder(tf.float32,
                                [None, img_width, img_height, num_channel])
    label_ph = tf.placeholder(tf.int64, [None, num_class])

    model_name_abbr = np.random.choice(rand_seed, 1, replace=False).tolist()
    dm = ModelImporter(model_type,
                       str(model_name_abbr.pop()),
                       num_layer,
                       img_height,
                       img_width,
                       num_channel,
                       num_class,
                       batch_size,
                       optimizer,
                       learning_rate,
                       activation,
                       batch_padding=False)
    model_entity = dm.get_model_entity()
    model_logit = model_entity.build(feature_ph, is_training=True)
    train_op = model_entity.train(model_logit, label_ph)
    eval_op = model_entity.evaluate(model_logit, label_ph)

    ##########################################
    # train model
    ##########################################
    step_time = 0
    step_count = 0

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    if train_dataset == 'imagenet':
        # For imagenet the feature input is a directory of image files.
        image_list = sorted(os.listdir(train_feature_input))

    overall_time_start = timer()
    with tf.device(train_device):
        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            # Samples past the last full batch are not used.
            num_batch = train_label_input.shape[0] // batch_size
            for e in range(num_epoch):
                for i in range(num_batch):
                    print('epoch %d / %d, step %d / %d' %
                          (e + 1, num_epoch, i + 1, num_batch))

                    # Step 0 of every epoch is left out of the timing.
                    if i != 0:
                        start_time = timer()

                    batch_offset = i * batch_size
                    batch_end = (i + 1) * batch_size
                    if train_dataset == 'imagenet':
                        batch_list = image_list[batch_offset:batch_end]
                        train_feature_batch = load_imagenet_raw(
                            train_feature_input, batch_list, img_height,
                            img_width)
                    else:
                        train_feature_batch = train_feature_input[
                            batch_offset:batch_end]
                    train_label_batch = train_label_input[
                        batch_offset:batch_end]

                    feed = {
                        feature_ph: train_feature_batch,
                        label_ph: train_label_batch
                    }
                    if use_tf_timeline:
                        profile_path = cfg_path.profile_path
                        run_options = tf.RunOptions(
                            trace_level=tf.RunOptions.FULL_TRACE)
                        run_metadata = tf.RunMetadata()
                        sess.run(train_op,
                                 feed_dict=feed,
                                 options=run_options,
                                 run_metadata=run_metadata)
                        trace = timeline.Timeline(
                            step_stats=run_metadata.step_stats)
                        trace_path = (profile_path + '/' + str(model_type) +
                                      '-' + str(batch_size) + '-' + str(i) +
                                      '.json')
                        # Context manager so the trace file is flushed and
                        # closed on every step instead of leaking the handle.
                        with open(trace_path, 'w') as trace_file:
                            trace_file.write(
                                trace.generate_chrome_trace_format(
                                    show_dataflow=True, show_memory=True))
                    else:
                        sess.run(train_op, feed_dict=feed)

                    if i != 0:
                        end_time = timer()
                        dur_time = end_time - start_time
                        print("step time:", dur_time)
                        step_time += dur_time
                        step_count += 1

            # Evaluate once, after all epochs, while the session is open.
            acc_avg = sess.run(eval_op,
                               feed_dict={
                                   feature_ph: eval_feature_input,
                                   label_ph: eval_label_input
                               })

    print('evaluation accuracy:{}'.format(acc_avg))

    overall_time_end = timer()
    overall_time = overall_time_end - overall_time_start

    # No step is timed when there is at most one batch per epoch (or no
    # epoch); report 0.0 instead of raising ZeroDivisionError.
    avg_step_ms = (step_time / step_count * 1000) if step_count else 0.0
    print(
        f'overall training time (s):{overall_time}, average step time (ms):{avg_step_ms}'
    )
def train_model(job_id):
    """Build and train the model at position ``job_id`` of the ``cfg_para.multi_*`` lists.

    Args:
        job_id: index into the ``multi_*`` configuration lists; also used
            as the model name passed to ``ModelImporter``.

    Returns:
        A string reporting the average step time in milliseconds for this
        job. The first step of each epoch is excluded from the average; if
        no step was timed the reported average is 0.0.
    """
    model_type = cfg_para.multi_model_type[job_id]
    num_layer = cfg_para.multi_num_layer[job_id]
    activation = cfg_para.multi_activation[job_id]
    batch_size = cfg_para.multi_batch_size[job_id]
    learning_rate = cfg_para.multi_learning_rate[job_id]
    optimizer = cfg_para.multi_opt[job_id]

    num_epoch = cfg_para.multi_num_epoch
    train_dataset = cfg_para.multi_train_dataset
    use_tf_timeline = cfg_para.multi_use_tb_timeline
    use_cpu = cfg_para.multi_use_cpu
    train_device = '/cpu:0' if use_cpu else '/gpu:0'

    # Label that identifies this job in the returned summary.
    model_name = (f'{job_id}-{model_type}-{num_layer}-{batch_size}-'
                  f'{learning_rate}-{optimizer}-{num_epoch}-{train_dataset}')

    ##########################################
    # load dataset
    ##########################################
    img_width, img_height, num_channel, num_class = load_dataset_para(
        train_dataset)
    train_feature_input, train_label_input = load_train_dataset(train_dataset)

    ##########################################
    # build model
    ##########################################
    features = tf.placeholder(tf.float32,
                              [None, img_width, img_height, num_channel])
    labels = tf.placeholder(tf.int64, [None, num_class])

    dm = ModelImporter(model_type,
                       str(job_id),
                       num_layer,
                       img_height,
                       img_width,
                       num_channel,
                       num_class,
                       batch_size,
                       optimizer,
                       learning_rate,
                       activation,
                       batch_padding=False)
    model_entity = dm.get_model_entity()
    model_logit = model_entity.build(features, is_training=True)
    train_op = model_entity.train(model_logit, labels)

    ##########################################
    # train model
    ##########################################
    step_time = 0
    step_count = 0

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    if train_dataset == 'imagenet':
        # For imagenet the feature input is a directory of image files.
        image_list = sorted(os.listdir(train_feature_input))

    with tf.device(train_device):
        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            # Samples past the last full batch are not used.
            num_batch = train_label_input.shape[0] // batch_size
            for e in range(num_epoch):
                for i in range(num_batch):
                    print('epoch %d / %d, step %d / %d' %
                          (e + 1, num_epoch, i + 1, num_batch))

                    # Step 0 of every epoch is left out of the timing.
                    if i != 0:
                        start_time = timer()

                    batch_offset = i * batch_size
                    batch_end = (i + 1) * batch_size
                    if train_dataset == 'imagenet':
                        batch_list = image_list[batch_offset:batch_end]
                        train_feature_batch = load_imagenet_raw(
                            train_feature_input, batch_list, img_height,
                            img_width)
                    else:
                        train_feature_batch = train_feature_input[
                            batch_offset:batch_end]
                    train_label_batch = train_label_input[
                        batch_offset:batch_end]

                    feed = {
                        features: train_feature_batch,
                        labels: train_label_batch
                    }
                    if use_tf_timeline:
                        profile_path = cfg_path.profile_path
                        run_options = tf.RunOptions(
                            trace_level=tf.RunOptions.FULL_TRACE)
                        run_metadata = tf.RunMetadata()
                        sess.run(train_op,
                                 feed_dict=feed,
                                 options=run_options,
                                 run_metadata=run_metadata)
                        trace = timeline.Timeline(
                            step_stats=run_metadata.step_stats)
                        trace_path = (profile_path + '/' + str(model_type) +
                                      '-' + str(batch_size) + '-' + str(i) +
                                      '.json')
                        # Context manager so the trace file is flushed and
                        # closed on every step instead of leaking the handle.
                        with open(trace_path, 'w') as trace_file:
                            trace_file.write(
                                trace.generate_chrome_trace_format(
                                    show_dataflow=True, show_memory=True))
                    else:
                        sess.run(train_op, feed_dict=feed)

                    if i != 0:
                        end_time = timer()
                        dur_time = end_time - start_time
                        print("step time:", dur_time)
                        step_time += dur_time
                        step_count += 1

    # No step is timed when there is at most one batch per epoch (or no
    # epoch); report 0.0 instead of raising ZeroDivisionError.
    avg_step_ms = (step_time / step_count * 1000) if step_count else 0.0
    step_time_result = f'average step time (ms) of {model_name}: {avg_step_ms}'

    return step_time_result