Beispiel #1
0
def evaluate_pack_model(tf_sess, feature_ph, label_ph, pack_model):
    """Evaluate every model of a pack and return one accuracy per model.

    The evaluation dataset is the one named by
    ``cfg_para.hyperband_train_dataset``.

    Args:
        tf_sess: Session used to run the evaluation ops.
        feature_ph: Placeholder fed with the evaluation features.
        label_ph: Placeholder fed with the evaluation labels.
        pack_model: Iterable of evaluation ops, one per packed model.

    Returns:
        A list with one accuracy value per op in ``pack_model``, in the same
        order. For 'imagenet' each value is the mean of the per-batch results
        over batches of 50 images; samples beyond the last full batch are
        not evaluated.
    """
    print("start to evaluate")
    hyperband_dataset = cfg_para.hyperband_train_dataset
    img_width, img_height, _, _ = load_dataset_para(hyperband_dataset)
    feature_input, label_input = load_eval_dataset(hyperband_dataset)

    acc_pack = list()

    if hyperband_dataset == 'imagenet':
        # For imagenet, feature_input is a directory that is listed and
        # loaded batch by batch rather than an in-memory array.
        imagenet_batch_size_eval = 50
        num_batch_eval = label_input.shape[0] // imagenet_batch_size_eval
        test_image_list = sorted(os.listdir(feature_input))
        for eval_op in pack_model:
            # The accumulator must start from zero for every model; sharing it
            # across models would leak earlier models' accuracy into later ones.
            acc_sum = 0
            for n in range(num_batch_eval):
                batch_offset = n * imagenet_batch_size_eval
                batch_end = (n + 1) * imagenet_batch_size_eval
                eval_batch_list = test_image_list[batch_offset:batch_end]
                eval_feature_batch = load_imagenet_raw(feature_input, eval_batch_list, img_height, img_width)
                eval_label_batch = label_input[batch_offset:batch_end]
                acc_batch = tf_sess.run(eval_op, feed_dict={feature_ph: eval_feature_batch,
                                                            label_ph: eval_label_batch})
                acc_sum += acc_batch
            acc_avg = acc_sum / num_batch_eval
            acc_pack.append(acc_avg)
    else:
        # Other datasets are fed to each evaluation op in a single run.
        for eval_op in pack_model:
            acc_avg = tf_sess.run(eval_op, feed_dict={feature_ph: feature_input, label_ph: label_input})
            acc_pack.append(acc_avg)

    return acc_pack
Beispiel #2
0
def train_pack():
    """Train all models listed in the ``cfg_para.multi_*`` settings as one pack.

    Every model is built on the same feature/label placeholders and all train
    ops are executed together in a single ``sess.run`` per step. Batches are
    cut with the largest configured batch size. When the timeline option is
    enabled, a Chrome trace file is written under ``cfg_path.profile_path``
    for every step.

    Prints the overall wall-clock time and the average step time in
    milliseconds. The first step of every epoch is not timed. If no step was
    timed, the average is reported as ``nan`` instead of raising
    ``ZeroDivisionError``.
    """
    print('start training pack')

    rand_seed_pack = cfg_para.multi_rand_seed

    model_type_list = cfg_para.multi_model_type
    optimizer_list = cfg_para.multi_opt
    num_layer_list = cfg_para.multi_num_layer
    activation_list = cfg_para.multi_activation
    batch_size_list = cfg_para.multi_batch_size
    learning_rate_list = cfg_para.multi_learning_rate

    # Padding is requested only when the packed models use different batch sizes.
    is_batch_padding = len(set(batch_size_list)) != 1

    num_epoch = cfg_para.multi_num_epoch
    train_dataset = cfg_para.multi_train_dataset
    # NOTE(review): this reads the *single* timeline flag although the rest of
    # the function uses the multi_* settings - confirm this is intended.
    use_tf_timeline = cfg_para.single_use_tb_timeline

    max_batch_size = max(batch_size_list)

    #################################################
    # load dataset
    #################################################

    img_width, img_height, num_channel, num_class = load_dataset_para(
        train_dataset)
    train_feature_input, train_label_input = load_train_dataset(train_dataset)

    #########################
    # build packed model
    #########################

    features = tf.placeholder(tf.float32,
                              [None, img_width, img_height, num_channel])
    labels = tf.placeholder(tf.int64, [None, num_class])

    # Draw one distinct name seed per model.
    model_name_abbr = np.random.choice(rand_seed_pack,
                                       len(model_type_list),
                                       replace=False).tolist()
    train_op_pack = list()

    for midx, mt in enumerate(model_type_list):
        dm = ModelImporter(mt,
                           str(model_name_abbr.pop()),
                           num_layer_list[midx],
                           img_height,
                           img_width,
                           num_channel,
                           num_class,
                           batch_size_list[midx],
                           optimizer_list[midx],
                           learning_rate_list[midx],
                           activation_list[midx],
                           batch_padding=is_batch_padding)

        model_entity = dm.get_model_entity()
        model_logit = model_entity.build(features, is_training=True)
        train_op = model_entity.train(model_logit, labels)
        train_op_pack.append(train_op)

    #########################
    # train packed model
    #########################

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    step_time = 0
    step_count = 0

    if train_dataset == 'imagenet':
        # For imagenet, train_feature_input is a directory of image files.
        image_list = sorted(os.listdir(train_feature_input))

    overall_time_start = timer()

    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        num_batch = train_label_input.shape[0] // max_batch_size

        for e in range(num_epoch):
            for i in range(num_batch):
                print('epoch %d / %d, step %d / %d' %
                      (e + 1, num_epoch, i + 1, num_batch))

                # The first step of each epoch is excluded from the timing.
                if i != 0:
                    start_time = timer()

                batch_offset = i * max_batch_size
                batch_end = (i + 1) * max_batch_size
                if train_dataset == 'imagenet':
                    batch_list = image_list[batch_offset:batch_end]
                    train_feature_batch = load_imagenet_raw(
                        train_feature_input, batch_list, img_height, img_width)
                else:
                    train_feature_batch = train_feature_input[
                        batch_offset:batch_end]

                train_label_batch = train_label_input[batch_offset:batch_end]

                feed_dict = {
                    features: train_feature_batch,
                    labels: train_label_batch
                }

                if use_tf_timeline:
                    profile_path = cfg_path.profile_path
                    run_options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()

                    sess.run(train_op_pack,
                             feed_dict=feed_dict,
                             options=run_options,
                             run_metadata=run_metadata)

                    trace = timeline.Timeline(
                        step_stats=run_metadata.step_stats)
                    trace_path = (profile_path + '/' +
                                  '-'.join(map(str, set(model_type_list))) +
                                  '-' + str(len(model_type_list)) +
                                  '-'.join(map(str, set(batch_size_list))) +
                                  '-' + str(i) + '.json')
                    # Close the trace file deterministically; one file is
                    # written per step, so leaking handles adds up quickly.
                    with open(trace_path, 'w') as trace_file:
                        trace_file.write(
                            trace.generate_chrome_trace_format(
                                show_dataflow=True, show_memory=True))
                else:
                    sess.run(train_op_pack, feed_dict=feed_dict)

                if i != 0:
                    end_time = timer()
                    dur_time = end_time - start_time
                    print("step time:", dur_time)
                    step_time += dur_time
                    step_count += 1

    overall_time_end = timer()
    overall_time = overall_time_end - overall_time_start
    # No step is timed when every epoch has at most one batch.
    if step_count:
        avg_step_time_ms = step_time / step_count * 1000
    else:
        avg_step_time_ms = float('nan')
    print(
        f'overall training time (s):{overall_time}, average step time (ms):{avg_step_time_ms}'
    )
Beispiel #3
0
    def hyperband_original(self, hyper_params, epochs):
        """Train one configuration in its own graph and return its accuracy.

        Args:
            hyper_params: Indexable configuration read as
                ``[0]`` architecture string ``"<type>-<num_layers>"``,
                ``[1]`` batch size, ``[2]`` optimizer, ``[3]`` learning rate,
                ``[4]`` activation.
            epochs: Number of passes over the training data.

        Returns:
            The evaluation result of the trained model. For 'imagenet' this
            is the mean of the per-batch results over batches of 50 images;
            otherwise it is the result of one evaluation run on the whole
            evaluation set.
        """
        train_feature_input, train_label_input = load_train_dataset(self.hp_dataset)
        eval_feature_input, eval_label_input = load_eval_dataset(self.hp_dataset)

        graph = tf.Graph()
        with graph.as_default():
            features = tf.placeholder(tf.float32, [None, self.img_width, self.img_height, self.num_channel])
            labels = tf.placeholder(tf.int64, [None, self.num_class])

            # Seed from the clock and draw a random id used as the model
            # instance name.
            dt = datetime.now()
            np.random.seed(dt.microsecond)
            net_instnace = np.random.randint(sys.maxsize)

            model_arch = hyper_params[0]
            model_type = model_arch.split('-')[0]
            model_layer = int(model_arch.split('-')[1])
            batch_size = hyper_params[1]
            opt = hyper_params[2]
            learning_rate = hyper_params[3]
            activation = hyper_params[4]

            print("\n** model: {} | batch size: {} | opt: {} | model layer: {} | learn rate: {} | act: {} **"
                  .format(model_type, batch_size, opt, model_layer, learning_rate, activation))

            dm = ModelImporter(model_type,
                               str(net_instnace),
                               model_layer,
                               self.img_height,
                               self.img_width,
                               self.num_channel,
                               self.num_class,
                               batch_size,
                               opt,
                               learning_rate,
                               activation,
                               batch_padding=False)
            model_entity = dm.get_model_entity()
            model_logit = model_entity.build(features, is_training=True)
            train_op = model_entity.train(model_logit, labels)
            eval_op = model_entity.evaluate(model_logit, labels)

        if self.hp_dataset == 'imagenet':
            # For imagenet, train_feature_input is a directory of image files.
            image_list = sorted(os.listdir(train_feature_input))

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True

        with tf.Session(graph=graph, config=config) as sess:
            sess.run(tf.global_variables_initializer())
            num_batch = train_label_input.shape[0] // batch_size
            for e in range(epochs):
                for i in range(num_batch):
                    batch_offset = i * batch_size
                    batch_end = (i + 1) * batch_size

                    if self.hp_dataset == 'imagenet':
                        batch_list = image_list[batch_offset:batch_end]
                        # Load from the image directory that batch_list was
                        # listed from, not from the dataset name.
                        train_feature_batch = load_imagenet_raw(train_feature_input,
                                                                batch_list,
                                                                self.img_height,
                                                                self.img_width)
                    else:
                        train_feature_batch = train_feature_input[batch_offset:batch_end]

                    train_label_batch = train_label_input[batch_offset:batch_end]

                    sess.run(train_op, feed_dict={features: train_feature_batch, labels: train_label_batch})

            if self.hp_dataset == 'imagenet':
                acc_sum = 0
                imagenet_batch_size_eval = 50
                num_batch_eval = eval_label_input.shape[0] // imagenet_batch_size_eval
                test_image_list = sorted(os.listdir(eval_feature_input))
                for n in range(num_batch_eval):
                    batch_offset = n * imagenet_batch_size_eval
                    batch_end = (n + 1) * imagenet_batch_size_eval
                    test_batch_list = test_image_list[batch_offset:batch_end]
                    test_feature_batch = load_imagenet_raw(eval_feature_input,
                                                           test_batch_list,
                                                           self.img_height,
                                                           self.img_width)
                    test_label_batch = eval_label_input[batch_offset:batch_end]
                    acc_batch = sess.run(eval_op, feed_dict={features: test_feature_batch, labels: test_label_batch})
                    acc_sum += acc_batch
                acc_avg = acc_sum / num_batch_eval
            else:
                acc_avg = sess.run(eval_op, feed_dict={features: eval_feature_input, labels: eval_label_input})

        print(f'Accuracy: {acc_avg}')
        return acc_avg
Beispiel #4
0
    def hyperband_pack_knn(self, confs, epochs):
        """Train a pack of configurations with different batch sizes together.

        All models share one pair of placeholders and are built with
        ``batch_padding=True``. Training runs the remaining train ops together
        on batches cut with the largest batch size still in ``batch_size_set``,
        and drops a model's train op once its entity reports completion.

        Args:
            confs: Sequence of configurations, each read as
                ``[0]`` architecture string ``"<type>-<num_layers>"``,
                ``[1]`` batch size, ``[2]`` optimizer, ``[3]`` learning rate,
                ``[4]`` activation.
            epochs: Passed to each model entity via ``set_desire_epochs``.

        Returns:
            The list returned by ``evaluate_pack_model`` for the packed
            evaluation ops.
        """
        train_feature_input, train_label_input = load_train_dataset(self.hp_dataset)

        features = tf.placeholder(tf.float32, [None, self.img_width, self.img_height, self.num_channel])
        labels = tf.placeholder(tf.int64, [None, self.num_class])

        # Seed from the clock and draw one random id per configuration; the
        # ids are used as model instance names.
        dt = datetime.now()
        np.random.seed(dt.microsecond)
        net_instnace = np.random.randint(sys.maxsize, size=len(confs))

        desire_epochs = epochs

        entity_pack = list()
        train_pack = list()
        eval_pack = list()
        batch_size_set = set()

        for cidx, cf in enumerate(confs):
            model_arch = cf[0]
            model_type = model_arch.split('-')[0]
            model_layer = int(model_arch.split('-')[1])
            batch_size = cf[1]
            batch_size_set.add(batch_size)
            opt = cf[2]
            learning_rate = cf[3]
            activation = cf[4]

            # Number of full batches of this model's own batch size in the
            # training set.
            desire_steps = train_label_input.shape[0] // batch_size

            dm = ModelImporter(model_type,
                               str(net_instnace[cidx]),
                               model_layer,
                               self.img_height,
                               self.img_width,
                               self.num_channel,
                               self.num_class,
                               batch_size, opt,
                               learning_rate,
                               activation,
                               batch_padding=True)

            model_entity = dm.get_model_entity()
            model_entity.set_desire_epochs(desire_epochs)
            model_entity.set_desire_steps(desire_steps)
            model_logit = model_entity.build(features, is_training=True)
            train_op = model_entity.train(model_logit, labels)
            eval_op = model_entity.evaluate(model_logit, labels)
            entity_pack.append(model_entity)
            train_pack.append(train_op)
            eval_pack.append(eval_op)

        if self.hp_dataset == 'imagenet':
            # For imagenet, train_feature_input is a directory of image files.
            image_list = sorted(os.listdir(train_feature_input))

        config = tf.ConfigProto()
        config.allow_soft_placement = True
        config.gpu_options.allow_growth = True

        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            max_bs = max(batch_size_set)

            complete_flag = False

            # Each iteration of the while loop restarts a pass over the
            # training data from the first batch, using the current max_bs.
            # NOTE(review): if num_steps is 0 the inner loop never runs and
            # train_pack never shrinks - confirm inputs rule this out.
            while len(train_pack) != 0:
                num_steps = train_label_input.shape[0] // max_bs
                for i in range(num_steps):
                    print('step %d / %d' % (i + 1, num_steps))
                    batch_offset = i * max_bs
                    batch_end = (i + 1) * max_bs

                    if self.hp_dataset == 'imagenet':
                        batch_list = image_list[batch_offset:batch_end]
                        train_feature_batch = load_imagenet_raw(train_feature_input,
                                                                batch_list,
                                                                self.img_height,
                                                                self.img_width)
                    else:
                        train_feature_batch = train_feature_input[batch_offset:batch_end]

                    train_label_batch = train_label_input[batch_offset:batch_end]

                    sess.run(train_pack, feed_dict={features: train_feature_batch, labels: train_label_batch})
                    # NOTE(review): completed entities stay in entity_pack, so
                    # they are checked again on later steps; if
                    # is_complete_train() keeps returning True, the remove()
                    # below would raise ValueError - verify against the
                    # entity implementation.
                    for me in entity_pack:
                        me.set_current_step()
                        if me.is_complete_train():
                            print("model has been trained completely:{}".format(me.get_model_instance_name()))
                            # Run the op returned by set_batch_size with the
                            # full training-set size, then stop training
                            # this model.
                            sess.run(me.set_batch_size(train_label_input.shape[0]))
                            train_pack.remove(me.get_train_op())
                            complete_flag = True

                    if len(train_pack) == 0:
                        break

                    if complete_flag:
                        # A model finished: drop the current largest batch
                        # size and restart the pass with the next largest.
                        # NOTE(review): max() raises ValueError if the set is
                        # empty here, e.g. when remaining models shared the
                        # discarded batch size - confirm.
                        batch_size_set.discard(max_bs)
                        max_bs = max(batch_size_set)
                        complete_flag = False
                        break

            acc_pack = evaluate_pack_model(sess, features, labels, eval_pack)

        print(f'Accuracy: {acc_pack}')
        return acc_pack
Beispiel #5
0
    def hyperband_pack_bs(self, batch_size, confs, epochs):
        """Train a pack of configurations that all use the same batch size.

        Every configuration is built on shared placeholders with
        ``batch_padding=False``; all train ops run together once per step.

        Args:
            batch_size: Batch size used for every model and for slicing the
                training data.
            confs: Sequence of configurations, each read as
                ``[0]`` architecture string ``"<type>-<num_layers>"``,
                ``[2]`` optimizer, ``[3]`` learning rate, ``[4]`` activation
                (index 1 is not read here).
            epochs: Number of passes over the training data.

        Returns:
            The list returned by ``evaluate_pack_model`` for the packed
            evaluation ops.
        """
        train_feature_input, train_label_input = load_train_dataset(self.hp_dataset)

        feature_shape = [None, self.img_width, self.img_height, self.num_channel]
        features = tf.placeholder(tf.float32, feature_shape)
        labels = tf.placeholder(tf.int64, [None, self.num_class])

        # Clock-seeded random ids, one per configuration, used as model names.
        np.random.seed(datetime.now().microsecond)
        instance_ids = np.random.randint(sys.maxsize, size=len(confs))

        train_ops = []
        eval_ops = []

        for position, conf in enumerate(confs):
            arch_text = conf[0]
            model_type = arch_text.split('-')[0]
            model_layer = int(arch_text.split('-')[1])
            opt = conf[2]
            learning_rate = conf[3]
            activation = conf[4]

            importer = ModelImporter(model_type,
                                     str(instance_ids[position]),
                                     model_layer,
                                     self.img_height,
                                     self.img_width,
                                     self.num_channel,
                                     self.num_class,
                                     batch_size,
                                     opt,
                                     learning_rate,
                                     activation,
                                     batch_padding=False)

            entity = importer.get_model_entity()
            logit = entity.build(features, is_training=True)
            train_ops.append(entity.train(logit, labels))
            eval_ops.append(entity.evaluate(logit, labels))

        # For imagenet the features are a directory of files to list and load.
        if self.hp_dataset == 'imagenet':
            image_list = sorted(os.listdir(train_feature_input))

        session_config = tf.ConfigProto()
        session_config.allow_soft_placement = True
        session_config.gpu_options.allow_growth = True

        with tf.Session(config=session_config) as sess:
            sess.run(tf.global_variables_initializer())
            steps_per_epoch = train_label_input.shape[0] // batch_size

            for _ in range(epochs):
                for step in range(steps_per_epoch):
                    lo = step * batch_size
                    hi = (step + 1) * batch_size

                    if self.hp_dataset == 'imagenet':
                        feature_batch = load_imagenet_raw(train_feature_input,
                                                          image_list[lo:hi],
                                                          self.img_height,
                                                          self.img_width)
                    else:
                        feature_batch = train_feature_input[lo:hi]

                    label_batch = train_label_input[lo:hi]

                    sess.run(train_ops, feed_dict={features: feature_batch, labels: label_batch})

            acc_pack = evaluate_pack_model(sess, features, labels, eval_ops)

        print(f'Accuracy: {acc_pack}')
        return acc_pack
Beispiel #6
0
def train_model(train_step_arg, batch_size_arg, model_type_arg, tidx_arg,
                global_args):
    """Run the training loop for one already-built model.

    Dataset, epoch count, timeline flag and device choice come from the
    ``cfg_para.multi_*`` settings.

    Args:
        train_step_arg: Training op executed once per batch.
        batch_size_arg: Number of samples per batch.
        model_type_arg: Model type; used only in the trace file name.
        tidx_arg: Index appended to ``'features'`` / ``'labels'`` to look up
            this model's placeholders in ``global_args``.
        global_args: Mapping holding the placeholders under the keys
            ``'features<tidx>'`` and ``'labels<tidx>'``.

    When the timeline option is enabled, one Chrome trace file per step is
    written under ``cfg_path.profile_path``.
    """

    train_dataset = cfg_para.multi_train_dataset
    num_epoch = cfg_para.multi_num_epoch
    use_tf_timeline = cfg_para.multi_use_tb_timeline
    use_cpu = cfg_para.multi_use_cpu

    train_device = '/cpu:0' if use_cpu else '/gpu:0'

    img_width, img_height, num_channel, num_class = load_dataset_para(
        train_dataset)
    train_feature_input, train_label_input = load_train_dataset(train_dataset)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    if train_dataset == 'imagenet':
        # For imagenet, train_feature_input is a directory of image files.
        image_list = sorted(os.listdir(train_feature_input))

    with tf.device(train_device):
        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            num_batch = train_label_input.shape[0] // batch_size_arg

            for e in range(num_epoch):
                for i in range(num_batch):
                    print('epoch %d / %d, step %d / %d' %
                          (e + 1, num_epoch, i + 1, num_batch))

                    batch_offset = i * batch_size_arg
                    batch_end = (i + 1) * batch_size_arg
                    if train_dataset == 'imagenet':
                        batch_list = image_list[batch_offset:batch_end]
                        feature_batch = load_imagenet_raw(
                            train_feature_input, batch_list, img_height,
                            img_width)
                    else:
                        feature_batch = train_feature_input[
                            batch_offset:batch_end]

                    label_batch = train_label_input[batch_offset:batch_end]

                    # Same feed for the profiled and the plain run.
                    feed_dict = {
                        global_args['features' + str(tidx_arg)]:
                        feature_batch,
                        global_args['labels' + str(tidx_arg)]:
                        label_batch
                    }

                    if use_tf_timeline:
                        profile_path = cfg_path.profile_path
                        run_options = tf.RunOptions(
                            trace_level=tf.RunOptions.FULL_TRACE)
                        run_metadata = tf.RunMetadata()
                        sess.run(train_step_arg,
                                 feed_dict=feed_dict,
                                 options=run_options,
                                 run_metadata=run_metadata)
                        trace = timeline.Timeline(
                            step_stats=run_metadata.step_stats)
                        trace_path = (profile_path + '/' +
                                      str(model_type_arg) + '-' +
                                      str(batch_size_arg) + '-' + str(i) +
                                      '.json')
                        # Close the file right after writing; a new trace is
                        # produced every step, so open handles must not pile up.
                        with open(trace_path, 'w') as trace_file:
                            trace_file.write(
                                trace.generate_chrome_trace_format(
                                    show_dataflow=True, show_memory=True))
                    else:
                        sess.run(train_step_arg, feed_dict=feed_dict)
Beispiel #7
0
def profile_single_model(job):
    """Train one model for a single pass and print its average step time.

    Relies on names defined outside this function: ``train_dataset``,
    ``img_width``, ``img_height``, ``num_channel``, ``num_class``,
    ``train_feature``, ``train_label`` and ``train_img_path``.

    Args:
        job: Indexable job description read as
            ``[0]`` architecture string ``"<type>-<num_layers>"``,
            ``[1]`` batch size, ``[2]`` optimizer, ``[3]`` activation,
            ``[4]`` learning rate.

    Prints ``'Job <name>: <avg step time in ms>'``. The first step is not
    timed; if no step was timed the average is reported as ``nan`` instead
    of raising ``ZeroDivisionError``.
    """
    job_model_arch = job[0]
    job_model_type = job_model_arch.split('-')[0]
    job_num_layer = int(job_model_arch.split('-')[1])
    job_batch_size = job[1]
    job_opt = job[2]
    job_activation = job[3]
    job_learn_rate = job[4]

    # Seed from the clock and draw a random id used as the instance name.
    dt = datetime.now()
    np.random.seed(dt.microsecond)
    net_instnace = np.random.randint(sys.maxsize)

    model_name = '{0}-{1}-{2}-{3}-{4}-{5}-{6}-{7}'.format(
        net_instnace, job_model_type, job_num_layer, job_batch_size,
        job_learn_rate, job_opt, job_activation, train_dataset)

    features = tf.placeholder(tf.float32,
                              [None, img_width, img_height, num_channel])
    labels = tf.placeholder(tf.int64, [None, num_class])

    dm = ModelImporter(job_model_type,
                       str(net_instnace),
                       job_num_layer,
                       img_height,
                       img_width,
                       num_channel,
                       num_class,
                       job_batch_size,
                       job_opt,
                       job_learn_rate,
                       job_activation,
                       batch_padding=True)

    model_entity = dm.get_model_entity()
    model_logit = model_entity.build(features, is_training=True)
    train_step = model_entity.train(model_logit, labels)

    step_time = 0
    step_count = 0

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    if train_dataset == 'imagenet':
        image_list = sorted(os.listdir(train_img_path))

    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        num_batch = train_label.shape[0] // job_batch_size
        for i in range(num_batch):
            print('step %d / %d' % (i + 1, num_batch))

            # The first step is excluded from the timing.
            if i != 0:
                start_time = timer()

            batch_offset = i * job_batch_size
            batch_end = (i + 1) * job_batch_size

            if train_dataset == 'imagenet':
                batch_list = image_list[batch_offset:batch_end]
                train_feature_batch = load_imagenet_raw(
                    train_img_path, batch_list, img_height, img_width)
            else:
                train_feature_batch = train_feature[batch_offset:batch_end]

            train_label_batch = train_label[batch_offset:batch_end]

            sess.run(train_step,
                     feed_dict={
                         features: train_feature_batch,
                         labels: train_label_batch
                     })

            if i != 0:
                end_time = timer()
                dur_time = end_time - start_time
                print("step time:", dur_time)
                step_time += dur_time
                step_count += 1

    # With fewer than two batches nothing was timed; avoid dividing by zero.
    if step_count:
        avg_step_time = step_time / step_count * 1000
    else:
        avg_step_time = float('nan')
    print('Job {}: {}'.format(model_name, avg_step_time))
Beispiel #8
0
def profile_pack_model(job_a, job_b):
    """Train two models packed together for one pass and print the step time.

    Both models are built on the same placeholders with
    ``batch_padding=True`` and their train ops run in one ``sess.run`` per
    step. Batches are cut with the larger of the two batch sizes.

    Relies on names defined outside this function: ``train_dataset``,
    ``img_width``, ``img_height``, ``num_channel``, ``num_class``,
    ``train_feature``, ``train_label`` and ``train_img_path``.

    Args:
        job_a: First job, read as ``[0]`` architecture string
            ``"<type>-<num_layers>"``, ``[1]`` batch size, ``[2]`` optimizer,
            ``[3]`` activation, ``[4]`` learning rate.
        job_b: Second job, same layout as ``job_a``.

    Prints ``'Pack <name_a> and <name_b>: <avg step time in ms>'``. The first
    step is not timed; if no step was timed the average is reported as
    ``nan`` instead of raising ``ZeroDivisionError``.
    """
    job_model_arch_a = job_a[0]
    job_model_type_a = job_model_arch_a.split('-')[0]
    job_num_layer_a = int(job_model_arch_a.split('-')[1])
    job_batch_size_a = job_a[1]
    job_opt_a = job_a[2]
    job_activation_a = job_a[3]
    job_learn_rate_a = job_a[4]

    model_name_a = '{0}-{1}-{2}-{3}-{4}-{5}-{6}'.format(
        job_model_type_a, job_num_layer_a, job_batch_size_a, job_learn_rate_a,
        job_opt_a, job_activation_a, train_dataset)

    job_model_arch_b = job_b[0]
    job_model_type_b = job_model_arch_b.split('-')[0]
    job_num_layer_b = int(job_model_arch_b.split('-')[1])
    job_batch_size_b = job_b[1]
    job_opt_b = job_b[2]
    job_activation_b = job_b[3]
    job_learn_rate_b = job_b[4]

    model_name_b = '{0}-{1}-{2}-{3}-{4}-{5}-{6}'.format(
        job_model_type_b, job_num_layer_b, job_batch_size_b, job_learn_rate_b,
        job_opt_b, job_activation_b, train_dataset)

    max_batch_size = max(job_batch_size_a, job_batch_size_b)

    # Seed from the clock and draw one random instance id per model.
    dt = datetime.now()
    np.random.seed(dt.microsecond)
    net_instnace = np.random.randint(sys.maxsize, size=2)

    features = tf.placeholder(tf.float32,
                              [None, img_width, img_height, num_channel])
    labels = tf.placeholder(tf.int64, [None, num_class])

    dm_a = ModelImporter(job_model_type_a,
                         str(net_instnace[0]),
                         job_num_layer_a,
                         img_height,
                         img_width,
                         num_channel,
                         num_class,
                         job_batch_size_a,
                         job_opt_a,
                         job_learn_rate_a,
                         job_activation_a,
                         batch_padding=True)
    model_entity_a = dm_a.get_model_entity()
    model_logit_a = model_entity_a.build(features, is_training=True)
    train_step_a = model_entity_a.train(model_logit_a, labels)

    dm_b = ModelImporter(job_model_type_b,
                         str(net_instnace[1]),
                         job_num_layer_b,
                         img_height,
                         img_width,
                         num_channel,
                         num_class,
                         job_batch_size_b,
                         job_opt_b,
                         job_learn_rate_b,
                         job_activation_b,
                         batch_padding=True)
    model_entity_b = dm_b.get_model_entity()
    model_logit_b = model_entity_b.build(features, is_training=True)
    train_step_b = model_entity_b.train(model_logit_b, labels)

    step_time = 0
    step_count = 0

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    if train_dataset == 'imagenet':
        image_list = sorted(os.listdir(train_img_path))

    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        num_batch = train_label.shape[0] // max_batch_size
        for i in range(num_batch):
            print('step %d / %d' % (i + 1, num_batch))

            # The first step is excluded from the timing.
            if i != 0:
                start_time = timer()

            batch_offset = i * max_batch_size
            batch_end = (i + 1) * max_batch_size

            if train_dataset == 'imagenet':
                batch_list = image_list[batch_offset:batch_end]
                train_feature_batch = load_imagenet_raw(
                    train_img_path, batch_list, img_height, img_width)
            else:
                train_feature_batch = train_feature[batch_offset:batch_end]

            train_label_batch = train_label[batch_offset:batch_end]

            sess.run([train_step_a, train_step_b],
                     feed_dict={
                         features: train_feature_batch,
                         labels: train_label_batch
                     })

            if i != 0:
                end_time = timer()
                dur_time = end_time - start_time
                print("step time:", dur_time)
                step_time += dur_time
                step_count += 1

    # With fewer than two batches nothing was timed; avoid dividing by zero.
    if step_count:
        avg_step_time = step_time / step_count * 1000
    else:
        avg_step_time = float('nan')
    print(f'Pack {model_name_a} and {model_name_b}: {avg_step_time}')
Beispiel #9
0
def train_single():
    """Train a single model described by the ``cfg_para.single_*`` settings.

    The model is built through ``ModelImporter``, trained for
    ``single_num_epoch`` epochs over the full batches of the training set
    (a trailing partial batch is dropped by the integer division), evaluated
    once on the evaluation set, and the results are printed: evaluation
    accuracy, overall wall-clock time and average step time.

    The first step of every epoch is not timed, so it does not contribute to
    the average step time. When ``single_use_tb_timeline`` is set, a Chrome
    trace JSON file is written into ``cfg_path.profile_path`` for every step.

    Returns:
        None. All results are printed to stdout.
    """
    print('start training single')
    rand_seed = cfg_para.single_rand_seed
    num_epoch = cfg_para.single_num_epoch

    model_type = cfg_para.single_model_type
    num_layer = cfg_para.single_num_layer
    learning_rate = cfg_para.single_learning_rate
    activation = cfg_para.single_activation
    batch_size = cfg_para.single_batch_size
    optimizer = cfg_para.single_opt

    train_dataset = cfg_para.single_train_dataset
    use_tf_timeline = cfg_para.single_use_tb_timeline
    use_cpu = cfg_para.single_use_cpu

    train_device = '/cpu:0' if use_cpu else '/gpu:0'

    ##########################################
    # load dataset
    ##########################################

    img_width, img_height, num_channel, num_class = load_dataset_para(
        train_dataset)
    train_feature_input, train_label_input = load_train_dataset(train_dataset)
    eval_feature_input, eval_label_input = load_eval_dataset(train_dataset)

    ##########################################
    # build model
    ##########################################

    feature_ph = tf.placeholder(tf.float32,
                                [None, img_width, img_height, num_channel])
    label_ph = tf.placeholder(tf.int64, [None, num_class])

    # Pick one entry of the configured seed pool to use as the model name.
    model_name_abbr = np.random.choice(rand_seed, 1, replace=False).tolist()

    dm = ModelImporter(model_type,
                       str(model_name_abbr.pop()),
                       num_layer,
                       img_height,
                       img_width,
                       num_channel,
                       num_class,
                       batch_size,
                       optimizer,
                       learning_rate,
                       activation,
                       batch_padding=False)

    model_entity = dm.get_model_entity()
    model_logit = model_entity.build(feature_ph, is_training=True)
    train_op = model_entity.train(model_logit, label_ph)
    eval_op = model_entity.evaluate(model_logit, label_ph)

    ##########################################
    # train model
    ##########################################

    step_time = 0
    step_count = 0

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    if train_dataset == 'imagenet':
        # For imagenet the "features" are a directory of image files that are
        # loaded batch by batch instead of an in-memory array.
        image_list = sorted(os.listdir(train_feature_input))

    if use_tf_timeline:
        # Loop-invariant profiling settings; only the metadata object has to
        # be fresh for every step.
        profile_path = cfg_path.profile_path
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)

    overall_time_start = timer()
    with tf.device(train_device):
        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            num_batch = train_label_input.shape[0] // batch_size

            for e in range(num_epoch):
                for i in range(num_batch):
                    print('epoch %d / %d, step %d / %d' %
                          (e + 1, num_epoch, i + 1, num_batch))

                    # The first step of each epoch is left out of the timing.
                    if i != 0:
                        start_time = timer()

                    batch_offset = i * batch_size
                    batch_end = (i + 1) * batch_size
                    if train_dataset == 'imagenet':
                        batch_list = image_list[batch_offset:batch_end]
                        train_feature_batch = load_imagenet_raw(
                            train_feature_input, batch_list, img_height,
                            img_width)
                    else:
                        train_feature_batch = train_feature_input[
                            batch_offset:batch_end]

                    train_label_batch = train_label_input[
                        batch_offset:batch_end]

                    feed = {
                        feature_ph: train_feature_batch,
                        label_ph: train_label_batch
                    }

                    if use_tf_timeline:
                        run_metadata = tf.RunMetadata()
                        sess.run(train_op,
                                 feed_dict=feed,
                                 options=run_options,
                                 run_metadata=run_metadata)
                        trace = timeline.Timeline(
                            step_stats=run_metadata.step_stats)
                        # The name only carries the step index, so a later
                        # epoch overwrites the trace of the same step.
                        trace_path = (f'{profile_path}/{model_type}-'
                                      f'{batch_size}-{i}.json')
                        # Close the file right away instead of leaking one
                        # open handle per training step.
                        with open(trace_path, 'w') as trace_file:
                            trace_file.write(
                                trace.generate_chrome_trace_format(
                                    show_dataflow=True, show_memory=True))
                    else:
                        sess.run(train_op, feed_dict=feed)

                    if i != 0:
                        end_time = timer()
                        dur_time = end_time - start_time
                        print("step time:", dur_time)
                        step_time += dur_time
                        step_count += 1

            # NOTE(review): the eval features are fed directly; for 'imagenet'
            # the training features are a directory path, so confirm that
            # load_eval_dataset returns an array in that case.
            acc_avg = sess.run(eval_op,
                               feed_dict={
                                   feature_ph: eval_feature_input,
                                   label_ph: eval_label_input
                               })

    print('evaluation accuracy:{}'.format(acc_avg))

    # Taken after evaluation, so the overall time includes the eval pass.
    overall_time_end = timer()
    overall_time = overall_time_end - overall_time_start

    # No step is timed when every epoch has at most one batch; report NaN
    # instead of failing with ZeroDivisionError after training has finished.
    if step_count:
        avg_step_time_ms = step_time / step_count * 1000
    else:
        avg_step_time_ms = float('nan')

    print(
        f'overall training time (s):{overall_time}, average step time (ms):{avg_step_time_ms}'
    )
Beispiel #10
0
def train_model(job_id):
    """Train the ``job_id``-th model of the ``cfg_para.multi_*`` settings.

    The per-job hyperparameters (model type, layer count, activation, batch
    size, learning rate, optimizer) are read from the ``multi_*`` lists at
    index ``job_id``. The model is trained for ``multi_num_epoch`` epochs over
    the full batches of the training set (a trailing partial batch is dropped
    by the integer division).

    The first step of every epoch is not timed. When ``multi_use_tb_timeline``
    is set, a Chrome trace JSON file is written into ``cfg_path.profile_path``
    for every step.

    Args:
        job_id: Index into the ``cfg_para.multi_*`` lists; also used as the
            model name passed to ``ModelImporter``.

    Returns:
        A string reporting the average step time in milliseconds for this
        job (``nan`` when no step was timed).
    """
    model_type_list = cfg_para.multi_model_type
    num_layer_list = cfg_para.multi_num_layer
    activation_list = cfg_para.multi_activation
    batch_size_list = cfg_para.multi_batch_size
    learning_rate_list = cfg_para.multi_learning_rate
    optimizer_list = cfg_para.multi_opt

    model_type = model_type_list[job_id]
    num_layer = num_layer_list[job_id]
    activation = activation_list[job_id]
    batch_size = batch_size_list[job_id]
    learning_rate = learning_rate_list[job_id]
    optimizer = optimizer_list[job_id]

    num_epoch = cfg_para.multi_num_epoch
    train_dataset = cfg_para.multi_train_dataset
    use_tf_timeline = cfg_para.multi_use_tb_timeline
    use_cpu = cfg_para.multi_use_cpu

    train_device = '/cpu:0' if use_cpu else '/gpu:0'

    model_name = '{0}-{1}-{2}-{3}-{4}-{5}-{6}-{7}'.format(
        job_id, model_type, num_layer, batch_size, learning_rate, optimizer,
        num_epoch, train_dataset)

    ##########################################
    # load dataset
    ##########################################

    img_width, img_height, num_channel, num_class = load_dataset_para(
        train_dataset)
    train_feature_input, train_label_input = load_train_dataset(train_dataset)

    ##########################################
    # build model
    ##########################################

    features = tf.placeholder(tf.float32,
                              [None, img_width, img_height, num_channel])
    labels = tf.placeholder(tf.int64, [None, num_class])

    dm = ModelImporter(model_type,
                       str(job_id),
                       num_layer,
                       img_height,
                       img_width,
                       num_channel,
                       num_class,
                       batch_size,
                       optimizer,
                       learning_rate,
                       activation,
                       batch_padding=False)

    model_entity = dm.get_model_entity()
    model_logit = model_entity.build(features, is_training=True)
    train_op = model_entity.train(model_logit, labels)

    ##########################################
    # train model
    ##########################################

    step_time = 0
    step_count = 0

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    if train_dataset == 'imagenet':
        # For imagenet the "features" are a directory of image files that are
        # loaded batch by batch instead of an in-memory array.
        image_list = sorted(os.listdir(train_feature_input))

    if use_tf_timeline:
        # Loop-invariant profiling settings; only the metadata object has to
        # be fresh for every step.
        profile_path = cfg_path.profile_path
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)

    with tf.device(train_device):
        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            num_batch = train_label_input.shape[0] // batch_size

            for e in range(num_epoch):
                for i in range(num_batch):
                    print('epoch %d / %d, step %d / %d' %
                          (e + 1, num_epoch, i + 1, num_batch))

                    # The first step of each epoch is left out of the timing.
                    if i != 0:
                        start_time = timer()

                    batch_offset = i * batch_size
                    batch_end = (i + 1) * batch_size
                    if train_dataset == 'imagenet':
                        batch_list = image_list[batch_offset:batch_end]
                        train_feature_batch = load_imagenet_raw(
                            train_feature_input, batch_list, img_height,
                            img_width)
                    else:
                        train_feature_batch = train_feature_input[
                            batch_offset:batch_end]

                    train_label_batch = train_label_input[
                        batch_offset:batch_end]

                    feed = {
                        features: train_feature_batch,
                        labels: train_label_batch
                    }

                    if use_tf_timeline:
                        run_metadata = tf.RunMetadata()
                        sess.run(train_op,
                                 feed_dict=feed,
                                 options=run_options,
                                 run_metadata=run_metadata)

                        trace = timeline.Timeline(
                            step_stats=run_metadata.step_stats)
                        # The name only carries the step index, so a later
                        # epoch overwrites the trace of the same step.
                        trace_path = (f'{profile_path}/{model_type}-'
                                      f'{batch_size}-{i}.json')
                        # Close the file right away instead of leaking one
                        # open handle per training step.
                        with open(trace_path, 'w') as trace_file:
                            trace_file.write(
                                trace.generate_chrome_trace_format(
                                    show_dataflow=True, show_memory=True))
                    else:
                        sess.run(train_op, feed_dict=feed)

                    if i != 0:
                        end_time = timer()
                        dur_time = end_time - start_time
                        print("step time:", dur_time)
                        step_time += dur_time
                        step_count += 1

    # No step is timed when every epoch has at most one batch; report NaN
    # instead of failing with ZeroDivisionError after training has finished.
    if step_count:
        avg_step_time_ms = step_time / step_count * 1000
    else:
        avg_step_time_ms = float('nan')

    step_time_result = f'average step time (ms) of {model_name}: {avg_step_time_ms}'
    return step_time_result