def hyperparameter_search(hp_domain, n_iter, n_replicas, save_dir, model):
    """Run a random hyperparameter search on a DNN model.

    Starts ``n_replicas`` threads that each run ``_hp_search_replica``, waits
    for all of them to finish, then walks ``save_dir`` looking for
    ``model_<digits>_<digits>`` directories and picks the stored result with
    the highest ``'acc'``.

    Args:
        hp_domain: forwarded unchanged to every replica.
        n_iter: forwarded unchanged to every replica.
        n_replicas: number of replica threads to start.
        save_dir: directory that is walked recursively for result folders.
        model: forwarded unchanged to every replica.

    Returns:
        The best results dict found, with ``'model_name'`` set to the name of
        the directory it was loaded from. If no result is found, the
        placeholder ``{'acc': -inf, 'model_name': None, 'hp': None}``.
    """
    dataset = utils.load_cifar(USE_CIFAR100)
    start_time = time.time()
    gpu_devices = utils.get_available_gpus()

    # Everything except the replica id is shared by all replicas.
    shared_kwargs = {'start_time': start_time, 'gpu_devices': gpu_devices, 'hp_domain': hp_domain,
                     'save_dir': save_dir, 'dataset': dataset, 'n_iter': n_iter, 'model': model}

    threads = []
    for rep_id in range(n_replicas):
        thread = threading.Thread(target=_hp_search_replica, kwargs=dict(shared_kwargs, rep_id=rep_id))
        thread.start()
        threads.append(thread)

    for thread in threads:
        thread.join()

    # Look for best hyperparameter set we found
    model_dir_pattern = re.compile(r'model_([0-9]+)_([0-9]+)')
    best_results = {'acc': float('-inf'), 'model_name': None, 'hp': None}
    for root, dirs, _files in os.walk(save_dir):
        for d in dirs:
            if model_dir_pattern.match(d) is None:
                continue
            results_path = os.path.join(root, d, 'results.npy')
            if not os.path.isfile(results_path):
                # A model directory without stored results cannot be ranked; skip it instead of crashing.
                continue
            # The stored object is indexed like a dict below, i.e. it is a pickled object array.
            # NumPy >= 1.16.3 refuses to load those unless allow_pickle=True.
            # NOTE: unpickling runs arbitrary code - only point save_dir at files this search wrote.
            results = np.load(results_path, allow_pickle=True).tolist()
            if best_results['acc'] < results['acc']:
                best_results = results
                best_results['model_name'] = d

    print('\n\nHyperparameter search done!\n\tbest_acc=%4f\n\tbest_model_name=%s' % (best_results['acc'], best_results['model_name']) + '\n' + '#' * 100)
    return best_results
Esempio n. 2
0
    def __init__(self, in_steps, out_steps, img_shape, devices, filters=None, num_blocks=1, mode="train",
                 starter_learning_rate=0.001, decay_step=500, decay_rate=1.0, verbose_step=1):
        """Store the model configuration and set all graph handles to None.

        :param in_steps: stored as ``self.in_steps``.
        :param out_steps: stored as ``self.out_steps``.
        :param img_shape: pair unpacked into ``(img_height, img_width)``.
        :param devices: sequence of devices; its length becomes ``self.gpus``.
        :param filters: stored as ``self.filters``; defaults to ``[32, 1]``.
        :param num_blocks: stored and passed to ``get_block_idx_per_gpu``.
        :param mode: stored as ``self.mode``.
        :param starter_learning_rate: stored as ``self.starter_learning_rate``.
        :param decay_step: stored as ``self.decay_step``.
        :param decay_rate: stored as ``self.decay_rate``.
        :param verbose_step: stored as ``self.verbose_step``.
        """
        # A list literal as the default would be one object shared by every
        # instance; build a fresh list per instance instead.
        if filters is None:
            filters = [32, 1]

        self.in_steps = in_steps
        self.out_steps = out_steps
        self.img_height, self.img_width = img_shape
        self.num_blocks = num_blocks
        self.input_shape = [self.img_height, self.img_width, 1]
        self.filters = filters

        self.starter_learning_rate = starter_learning_rate
        self.devices = devices
        self.gpus = len(devices)
        self.decay_step = decay_step
        self.decay_rate = decay_rate
        self.global_step = None
        self.learning_rate = None
        self.keep_rate = None
        self.verbose_step = verbose_step
        self.mode = mode

        # Graph handles, unset at construction.
        self.x = None
        self.pw = None
        self.y = None
        self.y_pw = None
        self.y_hat = None

        self.is_training = None
        self.loss = None
        self.loss_1 = None

        # Keep only as many detected GPU names as devices were requested.
        self.gpus_names = get_available_gpus()[:self.gpus]
        self.block_idx_per_gpu = get_block_idx_per_gpu(self.num_blocks, self.gpus)
Esempio n. 3
0
def arg_parse():
    """Parse the training script's command-line options.

    Example invocation:
        python train.py --mode train --data_dir sequence_data --gpus 4 --num_blocks 4 --batch_size 1 --train_epochs 20 --lr 0.001 --verbose_step 10

    :return: the ``argparse.Namespace`` produced by ``parse_args()``.
    """
    cli = argparse.ArgumentParser()

    # (flag, add_argument keyword arguments), registered in this order.
    option_specs = (
        ("--mode", {"default": "train"}),
        ("--data_dir", {"default": gen_data.sequence_data_dir, "help": "data dir"}),
        ("--gpus", {"default": len(get_available_gpus()), "type": int, "help": "number of gpus"}),
        ("--num_blocks", {"default": 1, "type": int, "help": "depth of network"}),
        ("--batch_size", {"default": 1, "type": int, "help": "batch size"}),
        ("--train_epochs", {"default": 50, "type": int, "help": "train epochs"}),
        ("--lr", {"default": 0.00002, "type": float, "help": "starter learning rate"}),
        ("--verbose_step", {"default": 10, "type": int, "help": "verbose step"}),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)

    return cli.parse_args()
Esempio n. 4
0
    def __init__(self,
                 stage,
                 croplen,
                 nclasses,
                 optimization=None,
                 momentum=None,
                 reuse_variable=False):
        """Record the model settings and initialise all runtime state.

        :param stage: stored as ``self.stage``.
        :param croplen: stored as ``self.croplen``.
        :param nclasses: stored as ``self.nclasses``.
        :param optimization: stored as ``self._optimization``.
        :param momentum: stored as ``self._momentum``.
        :param reuse_variable: stored as ``self._reuse``.
        """
        # Caller-supplied configuration.
        self.stage, self.croplen, self.nclasses = stage, croplen, nclasses
        self._optimization, self._momentum = optimization, momentum
        self._reuse = reuse_variable

        # Input pipeline handles, unset at construction.
        self.dataloader = None
        self.queue_coord = None
        self.queue_threads = None

        # Containers and ops that are filled in later.
        self.summaries = []
        self.towers = []
        self._train = None
        self._accum = None
        self._init = None

        # Fixed initial switches.
        self.small_chunk = 1
        self.nccl = False
        self.replica = False
        self._nesterov = False

        # Per the original note, a CPU device is appended later if this list is empty.
        self.devices = digits.get_available_gpus()
        self.gpus = len(self.devices)
Esempio n. 5
0
    def train(self, xs1, xs2, scores):
        """Build the training ops, one tower per GPU when GPUs are available.

        With GPUs, ``xs1``, ``xs2`` and ``scores`` are split along axis 0 into
        one shard per GPU. Each tower computes the summed squared difference
        between its predictions and its scores, the tower gradients are
        combined with ``average_gradients`` and the reported loss is the mean
        of the tower losses. Without GPUs a single summed squared-difference
        loss is built over the whole batch.

        :param xs1: first input tensor, passed to ``self._get_prediction``.
        :param xs2: second input tensor, passed to ``self._get_prediction``.
        :param scores: target tensor compared against the predictions.
        :return: tuple ``(loss, train_op, global_step, summaries)``.
        """
        global_step = tf.train.get_or_create_global_step()
        lr = noam_scheme(self.context.lr, global_step,
                         self.context.warmup_steps)
        optimizer = tf.train.AdamOptimizer(lr)
        gpus = get_available_gpus()

        if gpus:
            num_gpu = len(gpus)
            # tf.split below needs the batch to divide evenly across towers.
            assert self.context.hparams.batch_size % num_gpu == 0

            xs1s = tf.split(xs1, num_gpu, axis=0)
            xs2s = tf.split(xs2, num_gpu, axis=0)
            scoress = tf.split(scores, num_gpu, axis=0)

            tower_grads = []
            losses = []
            with tf.variable_scope(tf.get_variable_scope()):
                list_predictions = []
                for i in range(num_gpu):
                    with tf.device("/gpu:%d" % i):
                        with tf.name_scope("tower_%d" % i):
                            predictions = self._get_prediction(
                                xs1s[i], xs2s[i])
                            list_predictions.append(predictions)
                            # square loss
                            partial_loss = tf.reduce_sum(tf.squared_difference(
                                predictions, scoress[i]),
                                                         name="loss")
                            losses.append(partial_loss)
                            # Later towers reuse the variables created by the first one.
                            tf.get_variable_scope().reuse_variables()
                            grad = get_gradients_by_loss_and_optimizer(
                                partial_loss, optimizer)
                            tower_grads.append(grad)
                predictions = tf.concat(list_predictions, axis=0)
            loss = tf.reduce_mean(losses)
            grads_and_vars = average_gradients(tower_grads)
        else:
            predictions = self._get_prediction(xs1, xs2)
            loss = tf.reduce_sum(tf.squared_difference(predictions, scores),
                                 name="loss")
            grads_and_vars = get_gradients_by_loss_and_optimizer(
                loss, optimizer)
        train_op = optimizer.apply_gradients(grads_and_vars,
                                             global_step=global_step)

        for g, v in grads_and_vars:
            if g is None:
                # A variable that does not influence the loss has no gradient;
                # tf.summary.histogram cannot take None, so skip it.
                continue
            tf.summary.histogram(v.name, v)
            tf.summary.histogram(v.name + '_grad', g)
        tf.summary.scalar("pred_avg", tf.reduce_mean(predictions))
        tf.summary.scalar("label_avg", tf.reduce_mean(scores))

        tf.summary.scalar('lr', lr)
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("global_step", global_step)

        summaries = tf.summary.merge_all()
        return loss, train_op, global_step, summaries
Esempio n. 6
0
    def train(self, inputs, targets):
        """Assemble the training ops, with one tower per GPU when GPUs exist.

        The loss function is looked up in ``self._loss_func_dict`` under
        ``self._context.loss_func``, falling back to ``self._get_loss``.

        :param inputs: iterable of input tensors; with GPUs each one is split
            along axis 0 into one shard per tower.
        :param targets: target tensor, split along axis 0 the same way.
        :return: tuple ``(loss, train_op, global_step, summaries)``.
        """
        step = tf.train.get_or_create_global_step()
        learning_rate = noam_scheme(self._context.lr, step,
                                    self._context.warmup_steps)
        opt = tf.train.AdamOptimizer(learning_rate)
        devices = get_available_gpus()
        compute_loss = self._loss_func_dict.get(self._context.loss_func,
                                                self._get_loss)

        if not devices:
            # Single-device path: one loss over the whole batch.
            loss = tf.reduce_mean(compute_loss(inputs, targets))
            grads_and_vars = get_gradients_by_loss_and_optimizer(loss, opt)
        else:
            n_towers = len(devices)
            assert self._context.hparams.batch_size % n_towers == 0

            # Shard every input tensor, then regroup the shards per tower.
            per_input_shards = [
                tf.split(tensor, n_towers, axis=0) for tensor in inputs
            ]
            tower_inputs = [[shards[k] for shards in per_input_shards]
                            for k in range(n_towers)]
            tower_targets = tf.split(targets, n_towers, axis=0)

            tower_grads, losses = [], []
            with tf.variable_scope(tf.get_variable_scope()):
                for k in range(n_towers):
                    with tf.device("/gpu:%d" % k), \
                            tf.name_scope("tower_%d" % k):
                        tower_loss = compute_loss(tower_inputs[k],
                                                  tower_targets[k])
                        losses.append(tower_loss)
                        # Towers after the first reuse the same variables.
                        tf.get_variable_scope().reuse_variables()
                        tower_grads.append(
                            get_gradients_by_loss_and_optimizer(
                                tower_loss, opt))
            loss = tf.reduce_mean(losses)
            grads_and_vars = average_gradients(tower_grads)

        train_op = opt.apply_gradients(grads_and_vars, global_step=step)

        for grad, var in grads_and_vars:
            if grad is not None:  # variables without a gradient are skipped
                tf.summary.histogram(var.name, var)
                tf.summary.histogram(var.name + '_grad', grad)
        tf.summary.scalar("pred_avg", tf.reduce_mean(self._outputs))
        tf.summary.scalar("infr_avg", tf.reduce_mean(self._inferences))
        tf.summary.scalar("label_avg", tf.reduce_mean(targets))

        tf.summary.scalar('lr', learning_rate)
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("global_step", step)

        merged = tf.summary.merge_all()
        return loss, train_op, step, merged
Esempio n. 7
0
    def __init__(self, target_bos_id, target_eos_id, params):
        """Store the token ids and params, and record the available GPUs.

        :param target_bos_id: stored as ``self.target_bos_id``.
        :param target_eos_id: stored as ``self.target_eos_id``.
        :param params: stored as ``self.params``; its ``learning_rate`` is printed.
        """
        self.params = params
        self.target_bos_id, self.target_eos_id = target_bos_id, target_eos_id

        # GPU bookkeeping: the device list, its size, and a cursor starting at 0.
        self.available_gpus = get_available_gpus()
        self.total_gpu_num = len(self.available_gpus)
        self.current_gpu_index = 0

        print("learning_rate:", self.params.learning_rate)
Esempio n. 8
0
def get_session():
    """Reset the default graph and return a new ``tf.Session``.

    The session is configured with one inter-op and one intra-op thread.
    The available GPUs are printed as a side effect.
    """
    tf.reset_default_graph()
    thread_limits = dict(inter_op_parallelism_threads=1,
                         intra_op_parallelism_threads=1)
    sess = tf.Session(config=tf.ConfigProto(**thread_limits))
    print('AVAILABLE GPUS:', utils.get_available_gpus())
    return sess
Esempio n. 9
0
def get_model(pretrained):
    """
    Build a model suited to the machine configuration.

    When ``get_available_gpus()`` compares greater than 1 the base model is
    wrapped with ``multi_gpu_model``; otherwise the base model is returned.

    :param pretrained: weights path passed to ``load_weights``; a falsy value
        skips loading.
    :return: the built model, wrapped for multi-GPU use when applicable.
    """
    # NOTE(review): this comparison only works if get_available_gpus() returns
    # a number here - confirm against its definition.
    if get_available_gpus() > 1:
        model = build_model()
        print(pretrained)
        if pretrained:
            # Fixed: this used `my_model`, which is not defined anywhere in this
            # function; weights belong on the model that was just built.
            model.load_weights(pretrained)
        model = multi_gpu_model(model, get_available_gpus())

    else:
        model = build_model()
        if pretrained:
            model.load_weights(pretrained)

    return model
Esempio n. 10
0
    def _load_model(self):
        """
        Create the TF session, restore the saved model and look up its tensors.

        The restore runs under ``tf.device(gpus[0])`` when a GPU is available,
        and without a device scope otherwise.

        :return: None
        """
        def restore_graph():
            # Import the meta graph, restore the checkpoint into the session
            # and adopt the session's graph as the current graph.
            saver = tf.train.import_meta_graph(self._model_file + '.meta',
                                               clear_devices=True)
            saver.restore(self._tf_sess, self._model_file)
            self._graph = self._tf_sess.graph

        log = self._logger
        log.info('Loading model...')
        log.debug('creating TF session')
        session_config = tf.ConfigProto(allow_soft_placement=True)
        self._tf_sess = tf.Session(graph=self._graph, config=session_config)

        gpus = get_available_gpus()
        if not gpus:
            log.debug('loading graph on CPU')
            restore_graph()
        else:
            log.debug('loading graph on GPU')
            with tf.device(gpus[0]):
                restore_graph()

        log.debug('getting placeholders')
        # (attribute, tensor name): the graph input first, then the outputs.
        tensor_bindings = (
            ('_input_placeholder', 'image_tensor:0'),
            ('_output_nb_detections', 'num_detections:0'),
            ('_output_classes', 'detection_classes:0'),
            ('_output_boxes', 'detection_boxes:0'),
            ('_output_scores', 'detection_scores:0'),
        )
        for attribute, tensor_name in tensor_bindings:
            setattr(self, attribute,
                    self._graph.get_tensor_by_name(tensor_name))
        log.info('... model loaded')
Esempio n. 11
0
    def __init__(self,
                 model_dir,
                 data_directory,
                 data_format="NHWC",
                 lr=0.001,
                 n_gpus=2,
                 n_fold=5,
                 seed=42,
                 save_best=5,
                 **kwargs):
        """
        High level class to perform multi GPU training with tf.contrib.distribute.MirroredStrategy
        and tf.Estimator.
        The base models are built using tf.slim.
        All the data is processed using tf.data.Dataset and tf.image.
        The preprocessing currently runs on the CPU (optimal?)
        Additionally built ResNet with NCHW format support for (potentially) faster GPU and MKL optimised CPU operations
        Currently NCHW format support is experimental and the speed up is minor (about 10%)


        :param model_dir: model directory; its last '/'-separated component becomes ``model_name``
        :param data_directory: stored as ``self.data_dir``
        :param data_format: either "NCHW" or "NHWC"
        :param lr: stored as ``self.lr``
        :param n_gpus: the first ``n_gpus`` available GPUs are given to MirroredStrategy
        :param n_fold: number of StratifiedKFold splits
        :param seed: random state of the StratifiedKFold
        :param save_best: stored as ``self.save_best``
        :param kwargs: optional overrides: weight_decay, batch_norm_decay, batch_norm_epsilon,
            batch_norm_scale, output_stride, base_depth, input_shape, n_blocks, block_type
        :raises ValueError: if ``data_format`` is neither "NCHW" nor "NHWC"
        """

        if data_format not in ("NCHW", "NHWC"):
            raise ValueError(
                f"Unknown data format {data_format}. Has to be either NCHW or NHWC"
            )
        self.data_format = data_format

        # Optional overrides; each falls back to its module-level default.
        self.weight_decay = kwargs.get("weight_decay", WEIGHT_DECAY)
        # Fixed: this override used to read kwargs["weight_decay"], which gave
        # the wrong value, or a KeyError when only batch_norm_decay was passed.
        self.batch_norm_decay = kwargs.get("batch_norm_decay", BATCH_NORM_DECAY)
        self.batch_norm_epsilon = kwargs.get("batch_norm_epsilon", BATCH_NORM_EPSILON)
        self.batch_norm_scale = kwargs.get("batch_norm_scale", BATCH_NORM_SCALE)
        self.output_stride = kwargs.get("output_stride", OUTPUT_STRIDE)
        self.base_depth = kwargs.get("base_depth", BASE_DEPTH)
        self.input_shape = kwargs.get("input_shape", INPUT_SHAPE)
        self.n_blocks = kwargs.get("n_blocks", (3, 4, 6))
        self.block_type = kwargs.get("block_type", "bottleneck")

        # Pathing stuff
        self.model_name = model_dir.split('/')[-1]
        self.model_dir = model_dir
        self.data_dir = data_directory

        # Estimator stuff
        available_gpus = get_available_gpus()
        distribution = tf.contrib.distribute.MirroredStrategy(
            devices=available_gpus[:n_gpus])
        self.config = tf.estimator.RunConfig(
            save_checkpoints_steps=500,
            train_distribute=distribution,
            save_summary_steps=0  # running summary manually
        )

        # Additional args to fine tune training at high level
        self.n_gpus = n_gpus
        self.n_folds = n_fold
        self.seed = seed
        self.lr = lr

        self.save_best = save_best

        _prepare_directory(self.model_dir, self.n_folds)

        self.skf = StratifiedKFold(n_splits=self.n_folds,
                                   shuffle=True,
                                   random_state=self.seed)
Esempio n. 12
0
    def __init__(self, config, cc_config, model_config=None):
        """Build the CausalController, the optional image model, and a session.

        Creates the checkpoint and summary directories under
        ``config.model_dir``, builds the graph, starts a supervised session,
        optionally loads a pretrained controller, and finally samples a fixed
        set of z values into ``self.feed_fixed_z``.

        :param config: trainer configuration (model_dir, load_path, use_gpu,
            batch_size, num_devices, num_gpu, Model, build_pretrain,
            build_train are read here).
        :param cc_config: CausalController configuration (is_pretrain and
            pt_load_path are read here; model_dir is written).
        :param model_config: image model configuration; when falsy no image
            model is built.
        """
        self.config = config
        self.cc_config = cc_config
        self.model_dir = config.model_dir
        self.cc_config.model_dir = config.model_dir

        self.model_config = model_config
        if self.model_config:
            self.model_config.model_dir = config.model_dir

        self.save_model_dir = os.path.join(self.model_dir, 'checkpoints')
        if not os.path.exists(self.save_model_dir):
            os.mkdir(self.save_model_dir)

        self.summary_dir = os.path.join(self.model_dir, 'summaries')
        if not os.path.exists(self.summary_dir):
            os.mkdir(self.summary_dir)

        self.load_path = config.load_path
        self.use_gpu = config.use_gpu

        #This tensor controls batch_size for all models
        #Not expected to change during training, but during testing it can be
        #helpful to change it

        self.batch_size = tf.placeholder_with_default(self.config.batch_size,
                                                      [],
                                                      name='batch_size')

        loader_batch_size = config.num_devices * config.batch_size

        #Always need to build CC
        print('setting up CausalController')
        cc_batch_size = config.num_devices * self.batch_size  #Tensor/placeholder
        self.cc = CausalController(cc_batch_size, cc_config)
        self.step = self.cc.step

        #Data
        print('setting up data')
        self.data = DataLoader(self.cc.label_names, config)

        if self.cc_config.is_pretrain or self.config.build_pretrain:
            print('setup pretrain')
            #queue system to feed labels quickly. This does not queue images
            label_queue = self.data.get_label_queue(loader_batch_size)
            self.cc.build_pretrain(label_queue)

        #Build Model
        if self.model_config:
            #Will build both gen and discrim
            self.model = self.config.Model(self.batch_size, self.model_config)

            #Trainer step is defined as cc.step+model.step
            #e.g. 10k iter pretrain and 100k iter image model
            #will have image summaries at 100k but trainer model saved at Model-110k
            self.step += self.model.step

            # This queue holds (image,label) pairs, and is used for training conditional GANs
            data_queue = self.data.get_data_queue(loader_batch_size)

            self.real_data_by_gpu = distribute_input_data(
                data_queue, config.num_gpu)
            self.fake_data_by_gpu = distribute_input_data(
                self.cc.label_dict, config.num_gpu)

            with tf.variable_scope('tower'):
                for gpu in get_available_gpus():
                    print('using device:', gpu)

                    real_data = self.real_data_by_gpu[gpu]
                    fake_data = self.fake_data_by_gpu[gpu]
                    tower = gpu.replace('/', '').replace(':', '_')

                    with tf.device(gpu), tf.name_scope(tower):
                        #Build num_gpu copies of graph: inputs->gradient
                        #Updates self.tower_dict
                        self.model(real_data, fake_data)

                    #allow future gpu to use same variables
                    tf.get_variable_scope().reuse_variables()

            if self.model_config.is_train or self.config.build_train:
                self.model.build_train_op()
                self.model.build_summary_op()

        else:
            print('Image model not built')

        self.saver = tf.train.Saver(keep_checkpoint_every_n_hours=2)
        self.summary_writer = tf.summary.FileWriter(self.summary_dir)

        print('trainer.model_dir:', self.model_dir)
        gpu_options = tf.GPUOptions(allow_growth=True,
                                    per_process_gpu_memory_fraction=0.333)
        sess_config = tf.ConfigProto(allow_soft_placement=True,
                                     gpu_options=gpu_options)

        sv = tf.train.Supervisor(logdir=self.save_model_dir,
                                 is_chief=True,
                                 saver=self.saver,
                                 summary_op=None,
                                 summary_writer=self.summary_writer,
                                 save_model_secs=300,
                                 global_step=self.step,
                                 ready_for_local_init_op=None)
        self.sess = sv.prepare_or_wait_for_session(config=sess_config)

        if cc_config.pt_load_path:
            print('Attempting to load pretrain model:', cc_config.pt_load_path)
            self.cc.load(self.sess, cc_config.pt_load_path)

            print('Check tvd after restore')
            info = crosstab(self, report_tvd=True)
            print('tvd after load:', info['tvd'])

            #save copy of cc model in new dir
            cc_step = self.sess.run(self.cc.step)
            self.cc.saver.save(self.sess, self.cc.save_model_name, cc_step)

        if config.load_path:  #Declare loading point
            pnt_str = 'Loaded variables at ccStep:{}'
            cc_step = self.sess.run(self.cc.step)
            pnt_str = pnt_str.format(cc_step)
            print('pntstr', pnt_str)
            if self.model_config:
                pnt_str += ' imagemodelStep:{}'
                # Fixed: this used to bind the method `self.sess.run` without
                # calling it, so the message showed a bound-method repr
                # instead of the image model's step.
                model_step = self.sess.run(self.model.step)
                pnt_str = pnt_str.format(model_step)
            print(pnt_str)

        #PREPARE training:
        #TODO save as Variables so they are restored to same values when load model
        fixed_batch_size = 256  #get this many fixed z values

        self.fetch_fixed_z = {n.z: n.z for n in self.cc.nodes}
        if model_config:
            self.fetch_fixed_z[self.model.z_gen] = self.model.z_gen

        #feed_dict that ensures constant inputs
        #add feed_fixed_z[self.cc.Male.label]=1*ones() to intervene
        self.feed_fixed_z = self.sess.run(self.fetch_fixed_z,
                                          {self.batch_size: fixed_batch_size})
# Remaining command-line options. `parser` and the options read below but not
# registered here (--depth, --K, --batch_size) are defined outside this excerpt.
parser.add_argument('--print_freq', type=int, default=50)
parser.add_argument('--eval_freq', type=int, default=5)
parser.add_argument('--lr', type=float, default=1e-1)
parser.add_argument('--decay', type=float, default=5e-4)
parser.add_argument('--savedir', type=str, default=None)
parser.add_argument('--mode', type=str, default='train')
parser.add_argument('--n_gpus', type=int, default=None)
args = parser.parse_args()

os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
# Results go to --savedir when given, else to ./results/<depth>_<K>.
savedir = ('./results/%d_%d' % (args.depth, args.K)) \
        if args.savedir is None else args.savedir
if not os.path.isdir(savedir):
    os.makedirs(savedir)

# Use every detected GPU unless --n_gpus is given, then keep the first n_gpus.
available_gpus = get_available_gpus()
n_gpus = len(available_gpus) if args.n_gpus is None else args.n_gpus
available_gpus = available_gpus[:n_gpus]
print ('GPUs to be used: ' + str(available_gpus))

batch_size = args.batch_size
# NOTE(review): `/` yields a float under Python 3 - confirm whether these
# batch counts are expected to be integers where they are used.
n_train_batches = NUM_TRAIN / batch_size
n_test_batches = NUM_TEST / args.batch_size
# Build the input tensors on the CPU and split each batch along axis 0 into
# n_gpus shards.
with tf.device('/cpu:0'):
    # presumably the second argument selects training (True) vs. test (False)
    # data - verify against cifar10_input
    x, y = cifar10_input(batch_size, True)
    xs = tf.split(x, n_gpus, axis=0)
    ys = tf.split(y, n_gpus, axis=0)

    x, y = cifar10_input(batch_size, False)
    txs = tf.split(x, n_gpus, axis=0)
    tys = tf.split(y, n_gpus, axis=0)
Esempio n. 14
0
import subprocess
from multiprocessing import Process

import tensorflow as tf
from utils import get_available_gpus

# Use at most 4 GPUs, and never more than are available.
GPUS = 4
GPUS = min(GPUS, len(get_available_gpus()))
python_cmd = "python"


def run_server(idx):
    """Run helper_server.py for index ``idx`` and block until it exits.

    The GPU count and the index are passed as command-line arguments.
    """
    subprocess.call([python_cmd, "helper_server.py", str(GPUS), str(idx)])


# The guard keeps child processes from launching servers again when the
# "spawn" start method re-imports this module.
if __name__ == "__main__":
    # One process per index 1..GPUS-1; index 0 is not launched from here.
    processes = []
    for i in range(1, GPUS):
        p = Process(target=run_server, args=(i, ))
        processes.append(p)

    for p in processes:
        p.start()

    for p in processes:
        p.join()
Esempio n. 15
0
    model_checkpoint = ModelCheckpoint(model_names, monitor='val_loss', verbose=1, save_best_only=True)
    early_stop = EarlyStopping('val_loss', patience=patience)
    reduce_lr = ReduceLROnPlateau('val_loss', factor=0.1, patience=int(patience / 4), verbose=1)


    class MyCbk(keras.callbacks.Callback):
        """Callback that saves a given model at the end of every epoch.

        The model passed to the constructor is the one saved, whatever model
        the callback is attached to during training.
        """

        def __init__(self, model):
            super(MyCbk, self).__init__()
            self.model_to_save = model

        def on_epoch_end(self, epoch, logs=None):
            # File name carries the epoch number and the validation loss.
            path_template = checkpoint_models_path + 'model.%02d-%.4f.hdf5'
            target_path = path_template % (epoch, logs['val_loss'])
            self.model_to_save.save(target_path)


    num_gpu = len(get_available_gpus())
    if num_gpu >= 2:
        with tf.device("/cpu:0"):
            # Load our model, added support for Multi-GPUs
            model = build_model()
            if pretrained_path is not None:
                model.load_weights(pretrained_path)

        new_model = multi_gpu_model(model, gpus=num_gpu)
        # rewrite the callback: saving through the original model and not the multi-gpu model.
        model_checkpoint = MyCbk(model)
    else:
        new_model = build_model()
        if pretrained_path is not None:
            new_model.load_weights(pretrained_path)
Esempio n. 16
0
from keras import backend as K
import tensorflow as tf
import h5py
import cv2
import numpy as np
import matplotlib; matplotlib.use('Agg')
import matplotlib.pyplot as plt


# In[5]:


from keras.backend.tensorflow_backend import set_session
from utils import limited_gpu_memory_session, get_available_gpus
# Install the session returned by limited_gpu_memory_session as the Keras session.
# NOTE(review): 0.95 is presumably the fraction of GPU memory to allow - confirm in utils.
set_session(limited_gpu_memory_session(0.95))
print(get_available_gpus())


# In[6]:


# Machine-specific absolute path; both weight files live directly under it.
DATA_DIR = '/home/Drive2/rishabh/'
INIT_WEIGHTS = os.path.join(DATA_DIR, 'init_weights_omniglot.hdf5')
CHECKPOINTED_WEIGHTS = os.path.join(DATA_DIR, 'checkpointed_weights_omniglot.hdf5')


# In[7]:


### Load the test and train datasets
Esempio n. 17
0
import tensorflow as tf
from collections import namedtuple
from utils import get_available_gpus

# Upper bound on how many of the reported devices are used.
NUM_DEVICES = 3
ALL_DEVICES = get_available_gpus()

# When more devices are reported than allowed, keep only the last NUM_DEVICES.
if NUM_DEVICES < len(ALL_DEVICES):
    ALL_DEVICES = ALL_DEVICES[-NUM_DEVICES:]

# Path templates; each '{}' placeholder is filled in by the caller via str.format.
CHECKPOINT_DIR = './checkpoint/{}_{}'
DICTIONARY_DIR = './data/{}/dictionary.pkl'

# Training hyperparameters. The meaning of the individual keys is defined by the
# code that consumes this dict -- confirm there before changing values.
TRAIN_PARAMS = {
    # 128 per selected device, and never less than 128 when no device is listed.
    'batch_size': 128 * max(1, len(ALL_DEVICES)),
    'epochs': 1000,
    'emb_size': 50,
    'learning_rate': 0.01,
    'ng_sample': 25,
    'buffer_size': 128,
    'min_count': 5,
    'max_count': 10000,
    # NOTE(review): -1 looks like a "disabled" sentinel -- confirm with the consumer.
    'decay_steps': -1,
    'decay_rate': 0.01,
    # Presumably the gradient clipping bounds -- TODO confirm.
    'lower_gradient': -10,
    'upper_gradient': 10,
}

RUN_CONFIG = {
    'devices': ALL_DEVICES,
    'summary_steps': 50,
Esempio n. 18
0
    def create_model(self, obj_UserModel, stage_scope, batch_x=None):
        """Build one tower per available device and, when training, the update ops.

        The input batch is split along dimension 0 across the devices returned
        by ``digits.get_available_gpus()`` (falling back to ``'/cpu:0'``). For
        every device a tower is created through ``self.add_tower``; outside the
        inference stage its losses are summed, together with the regularization
        losses currently in the graph collection, into a ``loss`` tensor whose
        scalar summary is appended to ``self.summaries``. In the training stage
        the per-tower gradients are combined and ``self._train``,
        ``self._accum`` and ``self._init`` are set.

        Args:
            obj_UserModel: Forwarded to ``self.add_tower`` as ``obj_tower``.
            stage_scope: Not used by this implementation.
            batch_x: Optional input tensor. When ``None`` the batch is taken
                from ``self.dataloader``; a supplied tensor is only accepted in
                the inference stage.

        Raises:
            AssertionError: If ``batch_x`` is given outside the inference stage.
        """
        if batch_x is None:
            self.init_dataloader()
            batch_x = self.dataloader.batch_x
            if self.stage != digits.STAGE_INF:
                batch_y = self.dataloader.batch_y
        else:
            # An externally supplied input carries no labels, so it is only
            # valid for inference.
            assert self.stage == digits.STAGE_INF

        available_devices = digits.get_available_gpus()
        if not available_devices:
            available_devices.append('/cpu:0')

        # available_devices = ['/gpu:0', '/gpu:1'] # DEVELOPMENT : virtual multi-gpu

        # Split the batch over the batch dimension over the number of available gpu's
        if len(available_devices) == 1:
            batch_x_split = [batch_x]
            if self.stage != digits.STAGE_INF:  # Has no labels
                batch_y_split = [batch_y]
        else:
            with tf.name_scope('parallelize'):
                # Split them up
                batch_x_split = tf.split(batch_x,
                                         len(available_devices),
                                         0,
                                         name='split_batch')
                if self.stage != digits.STAGE_INF:  # Has no labels
                    batch_y_split = tf.split(batch_y,
                                             len(available_devices),
                                             0,
                                             name='split_batch')

        # Get a reference to the global regularization loss collection as a list
        # named r_loss_global. Mutating this list in place edits the collection.
        r_loss_global = tf.get_collection_ref(
            ops.GraphKeys.REGULARIZATION_LOSSES)

        # Note:
        # (In training stage)
        # r_loss_train_bak = [] (a backup storing every tower's regularization loss)
        # r_loss_global = reference to the global regularization loss collection
        # For each tower:
        #     empty r_loss_global
        #     Tower.inference (may add regularization losses globally)
        #     r_loss_train_bak += r_loss_global
        #     ...
        #
        # (restore all towers' reg. losses so the validation stage can use them)
        # r_loss_global[:] = r_loss_train_bak[:]
        #
        # (In validation stage)
        # r_loss_global = reference to the global regularization loss collection
        # r_loss_val_bak = list(r_loss_global)   <= copy of the list
        # For each tower:
        #     empty r_loss_global
        #     select the elements of r_loss_val_bak whose name starts with
        #         'train/tower_%d' % dev_i and store them in r_loss_global
        #
        #     Tower.inference (adds no regularization loss because reuse=True)
        #     (some operations only pick up the losses of the current tower)
        #     ...
        #

        if self.replica:
            # Snapshot the regularization losses of all towers created in the
            # training stage.
            if self.stage != digits.STAGE_TRAIN:
                r_loss_val_bak = list(r_loss_global)
            # Create a list to collect the regularization losses of every tower.
            if self.stage == digits.STAGE_TRAIN:
                r_loss_train_bak = list()

        # Run the user model through the build_model function that should be filled in
        grad_towers = []
        for dev_i, dev_name in enumerate(available_devices):
            with tf.device(dev_name):
                if self.replica:
                    r_loss_global[:] = []
                    if self.stage != digits.STAGE_TRAIN:
                        # Slice-assign so the graph collection itself receives
                        # this tower's losses. Rebinding the name would leave
                        # the collection empty and detach r_loss_global from it.
                        r_loss_global[:] = [
                            loss for loss in r_loss_val_bak
                            if loss.name.startswith('train/tower_%d' % dev_i)
                        ]

                with tf.name_scope('tower_%d' % dev_i) as scope_tower:
                    if self.stage != digits.STAGE_INF:
                        tower_model = self.add_tower(obj_tower=obj_UserModel,
                                                     x=batch_x_split[dev_i],
                                                     y=batch_y_split[dev_i])
                    else:
                        tower_model = self.add_tower(obj_tower=obj_UserModel,
                                                     x=batch_x_split[dev_i],
                                                     y=None)

                    # Replica mode gives every tower its own variable scope;
                    # otherwise all towers share 'tower_0' and reuse its
                    # variables from the second device on.
                    var_scope_name = ('tower_%d' % dev_i
                                      if self.replica else 'tower_0')
                    reuse_vars = ((False if self.replica else dev_i > 0)
                                  or self._reuse)
                    with tf.variable_scope(var_scope_name, reuse=reuse_vars):
                        tower_model.inference  # touch to initialize

                        # Reuse the variables in this scope for the next tower/device
                        tf.get_variable_scope().reuse_variables()

                        if self.stage == digits.STAGE_INF:
                            # For inferencing we will only use the inference part of the graph
                            continue

                        with tf.name_scope(digits.GraphKeys.LOSS):
                            for loss in self.get_tower_losses(
                                    tower_model, dev_i):
                                tf.add_to_collection(digits.GraphKeys.LOSSES,
                                                     loss['loss'])

                            # Assemble all made within this scope so far. The user can add custom
                            # losses to the digits.GraphKeys.LOSSES collection
                            losses = tf.get_collection(digits.GraphKeys.LOSSES,
                                                       scope=scope_tower)

                            if self.replica and self.stage == digits.STAGE_TRAIN:
                                r_loss_train_bak += r_loss_global

                            losses += ops.get_collection(
                                ops.GraphKeys.REGULARIZATION_LOSSES,
                                scope=None)
                            tower_loss = tf.add_n(losses, name='loss')

                            self.summaries.append(
                                tf.summary.scalar(tower_loss.op.name,
                                                  tower_loss))

                        if self.stage == digits.STAGE_TRAIN:
                            grad_tower_losses = []
                            for loss in self.get_tower_losses(
                                    tower_model, dev_i):
                                # use loss + regularization loss instead of loss only
                                grad_tower_loss = self.optimizer.compute_gradients(
                                    tower_loss, loss['vars'])
                                grad_tower_loss = tower_model.gradientUpdate(
                                    grad_tower_loss)
                                grad_tower_losses.append(grad_tower_loss)
                            grad_towers.append(grad_tower_losses)

        # Assemble and average the gradients from all towers
        if self.stage == digits.STAGE_TRAIN:
            if self.replica:
                # Put every tower's regularization losses back into the
                # collection so later stages can see them.
                r_loss_global[:] = r_loss_train_bak[:]

            grad_accum = []
            grad_averages = []
            n_gpus = len(available_devices)

            if n_gpus == 1:
                n_losses = len(grad_towers[0])
                for loss in xrange(n_losses):
                    if (self.replica):
                        # Replica mode expects a list of per-tower gradient
                        # lists, so wrap the single tower.
                        grad_averages.append([grad_towers[0][loss]])
                    else:
                        grad_averages.append(grad_towers[0][loss])
                    for g, _ in grad_towers[0][loss]:
                        grad_accum.append(g)
            else:
                n_losses = len(grad_towers[0])
                for loss in xrange(n_losses):
                    if not self.nccl:
                        if (self.replica):
                            grad_averages.append(
                                average_grads([
                                    grad_towers[gpu][loss]
                                    for gpu in xrange(n_gpus)
                                ]))
                        else:
                            grad_averages.append(
                                average_gradients([
                                    grad_towers[gpu][loss]
                                    for gpu in xrange(n_gpus)
                                ], 0))
                    else:
                        if (self.replica):
                            grad_averages.append(
                                allreduce_gradients_bak([
                                    grad_towers[gpu][loss]
                                    for gpu in xrange(n_gpus)
                                ]))
                        else:
                            grad_averages.append(
                                allreduce_gradients([
                                    grad_towers[gpu][loss]
                                    for gpu in xrange(n_gpus)
                                ], 0))

                    for gpu in xrange(n_gpus):
                        for g, _ in grad_towers[gpu][loss]:
                            grad_accum.append(g)

            apply_gradient_ops = []
            for grad_avg in grad_averages:
                if (self.replica):
                    # Flatten the per-tower lists into one list of
                    # (gradient, variable) pairs.
                    tmp = []
                    for grad_and_vars in grad_avg:
                        for (g, v) in grad_and_vars:
                            tmp.append((g, v))
                else:
                    tmp = grad_avg

                apply_gradient_ops.append(
                    self.optimizer.apply_gradients(
                        tmp, global_step=self.global_step))

            self._train = apply_gradient_ops
            self._accum = tf.group(*grad_accum)
            if (self.replica):
                self._init = self.get_post_init_ops()
            else:
                self._init = []
Esempio n. 19
0
    def create_model(self, obj_UserModel, stage_scope, batch_x=None):
        """Build one tower per available device and, when training, the update ops.

        The batch is split across the devices returned by
        ``digits.get_available_gpus()`` (falling back to ``'/cpu:0'``). Each
        device gets a tower created through ``self.add_tower``. Outside the
        inference stage a summed ``loss`` tensor is built per tower and its
        scalar summary appended to ``self.summaries``. In the training stage
        ``self._train`` is set to the list of apply-gradient ops.

        Args:
            obj_UserModel: Forwarded to ``self.add_tower`` as ``obj_tower``.
            stage_scope: Name scope used for the tower when exactly one device
                is available; with several devices ``'tower_<i>'`` is used.
            batch_x: Optional input tensor. When ``None`` the batch is taken
                from ``self.dataloader``; a supplied tensor is only accepted in
                the inference stage (enforced with ``assert``).
        """

        if batch_x is None:
            self.init_dataloader()
            batch_x = self.dataloader.batch_x
            if self.stage != digits.STAGE_INF:
                batch_y = self.dataloader.batch_y
        else:
            # An externally supplied input is only valid for inference.
            assert self.stage == digits.STAGE_INF
            batch_x = batch_x

        available_devices = digits.get_available_gpus()
        if not available_devices:
            available_devices.append('/cpu:0')

        # available_devices = ['/gpu:0', '/gpu:1'] # DEVELOPMENT : virtual multi-gpu

        # Split the batch over the batch dimension over the number of available gpu's
        if len(available_devices) == 1:
            batch_x_split = [batch_x]
            if self.stage != digits.STAGE_INF:  # Has no labels
                batch_y_split = [batch_y]
        else:
            with tf.name_scope('parallelize'):
                # Split them up.
                # NOTE(review): arguments are passed as (0, n, tensor), presumably the
                # pre-1.0 TensorFlow tf.split signature -- confirm against the TF version in use.
                batch_x_split = tf.split(0, len(available_devices), batch_x, name='split_batch')
                if self.stage != digits.STAGE_INF:  # Has no labels
                    batch_y_split = tf.split(0, len(available_devices), batch_y, name='split_batch')

        # Run the user model through the build_model function that should be filled in
        grad_towers = []
        for dev_i, dev_name in enumerate(available_devices):
            with tf.device(dev_name):
                current_scope = stage_scope if len(available_devices) == 1 else ('tower_%d' % dev_i)
                with tf.name_scope(current_scope) as scope_tower:

                    if self.stage != digits.STAGE_INF:
                        tower_model = self.add_tower(obj_tower=obj_UserModel,
                                                     x=batch_x_split[dev_i],
                                                     y=batch_y_split[dev_i])
                    else:
                        tower_model = self.add_tower(obj_tower=obj_UserModel,
                                                     x=batch_x_split[dev_i],
                                                     y=None)

                    # Variables are created by the first tower and reused by the others.
                    with tf.variable_scope(digits.GraphKeys.MODEL, reuse=dev_i > 0):
                        tower_model.inference  # touch to initialize

                    if self.stage == digits.STAGE_INF:
                        # For inferencing we will only use the inference part of the graph
                        continue

                    with tf.name_scope(digits.GraphKeys.LOSS):
                        for loss in self.get_tower_losses(tower_model):
                            tf.add_to_collection(digits.GraphKeys.LOSSES, loss['loss'])

                        # Assemble all made within this scope so far. The user can add custom
                        # losses to the digits.GraphKeys.LOSSES collection
                        losses = tf.get_collection(digits.GraphKeys.LOSSES, scope=scope_tower)
                        losses += ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES, scope=None)
                        tower_loss = tf.add_n(losses, name='loss')

                        self.summaries.append(tf.scalar_summary(tower_loss.op.name, tower_loss))

                    # Reuse the variables in this scope for the next tower/device
                    tf.get_variable_scope().reuse_variables()

                    if self.stage == digits.STAGE_TRAIN:
                        grad_tower_losses = []
                        for loss in self.get_tower_losses(tower_model):
                            # Gradients are taken from each individual loss['loss'];
                            # tower_loss above is only used for the summary.
                            grad_tower_loss = self.optimizer.compute_gradients(loss['loss'], loss['vars'])
                            grad_tower_loss = tower_model.gradientUpdate(grad_tower_loss)
                            grad_tower_losses.append(grad_tower_loss)
                        grad_towers.append(grad_tower_losses)

        # Assemble and average the gradients from all towers
        if self.stage == digits.STAGE_TRAIN:
            n_gpus = len(available_devices)
            if n_gpus == 1:
                # Single device: nothing to average.
                grad_averages = grad_towers[0]
            else:
                # Averaging ops are placed on the first device.
                with tf.device(available_devices[0]):
                    n_losses = len(grad_towers[0])
                    grad_averages = []
                    for loss in xrange(n_losses):
                        grad_averages.append(average_gradients([grad_towers[gpu][loss] for gpu in xrange(n_gpus)]))
            # One apply-gradients op per loss.
            apply_gradient_ops = []
            for grad_avg in grad_averages:
                apply_gradient_ops.append(self.optimizer.apply_gradients(grad_avg, global_step=self.global_step))
            self._train = apply_gradient_ops
Esempio n. 20
0
    def create_model(self, obj_UserModel, stage_scope, batch_x=None):
        """Build one tower per available device and, when training, the update ops.

        The batch is split along dimension 0 across the devices returned by
        ``utils.get_available_gpus()`` (falling back to ``'/cpu:0'``). Each
        device gets a tower created through ``self.add_tower`` inside the
        ``utils.GraphKeys.MODEL`` variable scope. Outside the inference stage a
        summed ``loss`` tensor is built per tower and its scalar summary
        appended to ``self.summaries``. In the training stage ``self._train``
        is set to the list of apply-gradient ops.

        Args:
            obj_UserModel: Forwarded to ``self.add_tower`` as ``obj_tower``.
            stage_scope: Name scope used for the tower when exactly one device
                is available; with several devices ``'tower_<i>'`` is used.
            batch_x: Optional input tensor. When ``None`` the batch is taken
                from ``self.dataloader``; a supplied tensor is only accepted in
                the inference stage (enforced with ``assert``).
        """
        logging.debug('Stage: {}'.format(stage_scope))

        # get batch data
        if batch_x is None:
            self.init_dataloader()
            batch_x = self.dataloader.batch_x
            if self.stage != utils.STAGE_INF:
                batch_y = self.dataloader.batch_y
        else:
            # An externally supplied input is only valid for inference.
            assert self.stage == utils.STAGE_INF
            batch_x = batch_x

        logging.debug('batch_x shape={}'.format(batch_x.get_shape().as_list()))
        if self.stage != utils.STAGE_INF:
            logging.debug('batch_y shape={}'.format(
                batch_y.get_shape().as_list()))

        # get available gpu list
        available_devices = utils.get_available_gpus()
        logging.debug('GPUs {}'.format(available_devices))
        if not available_devices:
            available_devices.append('/cpu:0')

        # available_devices = ['/gpu:0', '/gpu:1'] # DEVELOPMENT: virtual multi-gpu

        # Split the batch over the batch dimension over the number of available gpu's
        if len(available_devices) == 1:
            batch_x_split = [batch_x]
            if self.stage != utils.STAGE_INF:  # Has no labels
                batch_y_split = [batch_y]
        else:
            with tf.name_scope('parallelize'):
                # Split them up
                batch_x_split = tf.split(batch_x,
                                         len(available_devices),
                                         0,
                                         name='split_batch_x')
                if self.stage != utils.STAGE_INF:  # Has no labels
                    batch_y_split = tf.split(batch_y,
                                             len(available_devices),
                                             0,
                                             name='split_batch_y')

        # Run the user model through the build_model function that should be filled in
        # collect all types of losses over all gpus
        grad_towers = []
        #with tf.variable_scope(tf.get_variable_scope()):
        for dev_i, dev_name in enumerate(available_devices):
            with tf.device(dev_name):
                current_scope = stage_scope if len(
                    available_devices) == 1 else ('tower_{}'.format(dev_i))
                with tf.name_scope(current_scope) as scope_tower:

                    # Variables are created by the first tower and reused by the
                    # others, or reused from the start when self._reuse is set.
                    with tf.variable_scope(utils.GraphKeys.MODEL,
                                           reuse=dev_i > 0 or self._reuse):
                        if self.stage != utils.STAGE_INF:
                            tower_model = self.add_tower(
                                obj_tower=obj_UserModel,
                                x=batch_x_split[dev_i],
                                y=batch_y_split[dev_i])
                        else:
                            tower_model = self.add_tower(
                                obj_tower=obj_UserModel,
                                x=batch_x_split[dev_i],
                                y=None)

                        tower_model.inference  # touch to initialize

                        # Reuse the variables in this scope for the next tower/device
                        tf.get_variable_scope().reuse_variables()

                        if self.stage == utils.STAGE_INF:
                            # For inferencing we will only use the inference part of the graph
                            continue

                        with tf.name_scope(utils.GraphKeys.LOSS):
                            for loss in self.get_tower_losses(tower_model):
                                tf.add_to_collection(utils.GraphKeys.LOSSES,
                                                     loss['loss'])

                            # Assemble all made within this scope so far. The user can add custom
                            # losses to the utils.GraphKeys.LOSSES collection
                            losses = tf.get_collection(utils.GraphKeys.LOSSES,
                                                       scope=scope_tower)
                            #logging.debug('get_collection: graykeys.LOSSES : {}'.format(losses))
                            #logging.debug('get_collection: graykeys.REGULARIZATION_LOSSES : {}'.format(ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES, scope=None)))
                            losses += ops.get_collection(
                                ops.GraphKeys.REGULARIZATION_LOSSES,
                                scope=None)
                            tower_loss = tf.add_n(losses, name='loss')

                            self.summaries.append(
                                scalar_summary(tower_loss.op.name, tower_loss))

                        if self.stage == utils.STAGE_TRAIN:
                            # collect all types of losses on this gpu
                            grad_tower_losses = []
                            # for each type of loss
                            for loss in self.get_tower_losses(tower_model):
                                # compute gradients on this gpu; they are taken from the
                                # individual loss['loss'], tower_loss is only summarized
                                grad_tower_loss = self.optimizer.compute_gradients(
                                    loss['loss'], loss['vars'])
                                grad_tower_loss = tower_model.gradientUpdate(
                                    grad_tower_loss)
                                grad_tower_losses.append(grad_tower_loss)
                            grad_towers.append(grad_tower_losses)

        # Assemble and average the gradients from all towers
        if self.stage == utils.STAGE_TRAIN:
            n_gpus = len(available_devices)
            if n_gpus == 1:
                # Single device: nothing to average.
                grad_averages = grad_towers[0]
            else:
                # Averaging ops are placed on the first device.
                with tf.device(available_devices[0]):
                    n_losses = len(grad_towers[0])
                    grad_averages = []
                    # for each loss, average its gradients over all gpus
                    for loss in xrange(n_losses):
                        grad_averages.append(
                            average_gradients([
                                grad_towers[gpu][loss]
                                for gpu in xrange(n_gpus)
                            ]))

            apply_gradient_ops = []
            for grad_avg in grad_averages:
                # apply average gradients
                apply_gradient_ops.append(
                    self.optimizer.apply_gradients(
                        grad_avg, global_step=self.global_step))

            # train op, list
            self._train = apply_gradient_ops
Esempio n. 21
0
def train(log_dir, args, hp):
    """Train the model, running a validation pass after every epoch.

    Creates the output directory layout under ``log_dir``, builds the train and
    validation datasets behind one re-initializable iterator, constructs the
    train and eval models, then loops over ``args.epochs`` epochs. Progress is
    printed, checkpoints and summaries are written at the configured step
    intervals.

    Args:
        log_dir: Root directory for checkpoints, event files and the
            train/eval wav and plot folders.
        args: Options object; reads ``train_file``, ``val_file``,
            ``restore_step``, ``epochs``, ``checkpoint_interval``,
            ``summary_interval``, ``eval_interval`` and
            ``summary_val_interval``.
        hp: Hyperparameters; ``batch_size`` and ``ema_decay`` are read here and
            the object is forwarded to the dataset, model and logging helpers.
    """
    # create dir
    checkpoint_dir = os.path.join(log_dir, 'checkpoints')
    event_dir = os.path.join(log_dir, 'events')
    checkpoint_path = os.path.join(checkpoint_dir, 'model_ckpt')
    audio_dir = os.path.join(log_dir, 'train_stats', 'wavs')
    plot_dir = os.path.join(log_dir, 'train_stats', 'plots')
    eval_audio_dir = os.path.join(log_dir, 'eval_stats', 'wavs')
    eval_plot_dir = os.path.join(log_dir, 'eval_stats', 'plots')
    for directory in (log_dir, event_dir, checkpoint_dir, audio_dir, plot_dir,
                      eval_audio_dir, eval_plot_dir):
        os.makedirs(directory, exist_ok=True)

    # sess config
    config = tf.ConfigProto(
        gpu_options=tf.GPUOptions(force_gpu_compatible=True,
                                  allow_growth=True),
        allow_soft_placement=True,
        log_device_placement=False,
    )

    # how many gpus will be used
    num_gpus = len(utils.get_available_gpus(config))
    # Scale the batch by the GPU count, but never below one multiple: with no
    # GPU available the product would otherwise be a batch size of zero.
    batch_multiplier = max(1, num_gpus)

    # create dataset and iterator
    train_dataset = get_dataset(args.train_file,
                                True,
                                hp,
                                batch_size=hp.batch_size * batch_multiplier)
    val_dataset = get_dataset(args.val_file,
                              False,
                              hp,
                              batch_size=hp.batch_size * batch_multiplier)
    iterator = tf.data.Iterator.from_structure(train_dataset.output_types,
                                               train_dataset.output_shapes)
    # feeder: inputs, targets, input_lengths, local_condition, global_condition
    next_inputs = iterator.get_next()
    # To Do: multi gpu training
    feeder = get_inputs(next_inputs, 1)

    # The same iterator is re-pointed at either dataset by running these ops.
    train_init = iterator.make_initializer(train_dataset)
    val_init = iterator.make_initializer(val_dataset)

    # global step
    global_step = tf.Variable(name='global_step',
                              initial_value=-1,
                              trainable=False,
                              dtype=tf.int64)
    global_val_step = tf.Variable(name='global_val_step',
                                  initial_value=-1,
                                  trainable=False,
                                  dtype=tf.int64)
    global_val_step_op = tf.assign_add(global_val_step,
                                       1,
                                       name='global_val_step_add')
    # apply ema to variable
    ema = tf.train.ExponentialMovingAverage(decay=hp.ema_decay)
    # create model
    # use multi gpu to train
    train_model = create_train_model(feeder, ema, hp, global_step)
    eval_model = create_eval_model(feeder, hp)

    # save info
    saver = tf.train.Saver(max_to_keep=5)
    train_stats = add_stats(train_model)
    train_loss_window = ValueWindow(100)
    val_loss_window = ValueWindow(100)

    with tf.Session(config=config) as sess:
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())

        summary_writer = tf.summary.FileWriter(event_dir, sess.graph)
        try:
            # restore from checkpoint
            if args.restore_step is not None:
                restore_path = '{}-{}'.format(checkpoint_path,
                                              args.restore_step)
                # we don't load the ema to continue training, that is just for evaluating
                saver.restore(sess, restore_path)
                print('Resuming from checkpoint: {}...'.format(
                    args.restore_step))
            else:
                print('Start new training....')

            for epoch in range(args.epochs):
                # Training pass: run until the dataset is exhausted.
                sess.run(train_init)
                while True:
                    try:
                        start_time = time.time()
                        step, loss, _ = sess.run([
                            global_step, train_model.loss,
                            train_model.optimize
                        ])
                        train_loss_window.append(loss)
                        if step % 10 == 0:
                            message = 'Epoch {:4d} Train Step {:7d} [{:.3f} sec/step step_loss={:.5f} avg_loss={:.5f}]'.format(
                                epoch, step,
                                time.time() - start_time, loss,
                                train_loss_window.average)
                            print(message)

                        if step % args.checkpoint_interval == 0:
                            saver.save(sess, checkpoint_path, step)
                            save_log(sess, step, train_model, plot_dir,
                                     audio_dir, hp)

                        if step % args.summary_interval == 0:
                            print('Writing summary at step {}'.format(step))
                            summary_writer.add_summary(sess.run(train_stats),
                                                       step)

                        sys.stdout.flush()
                    except tf.errors.OutOfRangeError:
                        break

                # Validation pass: only the loss is fetched, so no optimizer
                # step is run on the validation data.
                sess.run(val_init)
                while True:
                    try:
                        start_time = time.time()
                        loss = sess.run(train_model.loss)
                        val_loss_window.append(loss)

                        step = sess.run(global_val_step_op)
                        message = 'Epoch {:4d} Val Step {:7d} [{:.3f} sec/step step_loss={:.5f} avg_loss={:.5f}]'.format(
                            epoch, step,
                            time.time() - start_time, loss,
                            val_loss_window.average)
                        print(message)

                        if step % args.eval_interval == 0:
                            eval_step(eval_model, sess, step, eval_plot_dir,
                                      eval_audio_dir)

                        if step % args.summary_val_interval == 0:
                            add_test_stats(summary_writer, step, loss)

                        sys.stdout.flush()
                    except tf.errors.OutOfRangeError:
                        break
        finally:
            # Flush pending events and release the event file even when
            # training is interrupted by an exception.
            summary_writer.close()
Esempio n. 22
0
def get_model_fn(features, labels, mode, params):
    """Build the EstimatorSpec for the bilevel class-reweighting model.

    A base classifier is trained on a per-class weighted loss. The class
    weights themselves are updated by a hook from the gradient of a target
    loss, which is computed on the target batch after
    `base_model.perturb_model_weights` has been called.

    Args:
        features: a list of tensors; index 0 is the training batch and, in
            TRAIN mode, index 1 is the target (validation) batch.
        labels: a list of tensors, indexed the same way as `features`.
        mode: ModeKeys.TRAIN or EVAL.
        params: Hyperparameters suitable for tuning.

    Returns:
        A EstimatorSpec object.
    """
    training = mode == tf.estimator.ModeKeys.TRAIN
    l2_coeff = params.weight_decay
    outer_lr = 0.1  # fixed step for the class-weight update (not params.learning_rate)

    # Outside of training there is only one batch, so it doubles as the
    # target batch.
    target_idx = 1 if training else 0
    train_features, train_labels = features[0], labels[0]
    val_features, val_labels = features[target_idx], labels[target_idx]

    # channels first (NCHW) is normally optimal on GPU and channels last (NHWC)
    # on CPU. The exception is Intel MKL on CPU which is optimal with
    # channels_last.
    # NOTE(review): data_format is resolved here but not passed to anything
    # below in this function -- confirm whether the models should receive it.
    gpu_count = len(utils.get_available_gpus())
    data_format = params.data_format
    if not data_format:
        data_format = 'channels_last' if gpu_count == 0 else 'channels_first'

    train_ops = []

    # ---- Inner (base) model -------------------------------------------------
    with tf.compat.v1.variable_scope('base_model') as base_scope:
        if params.dataset == 'mnist':
            base_model = model.BilevelLenet(num_class=params.num_class)
        else:
            base_model = model.BilevelResNet(
                resnet_size=params.num_layers,
                num_classes=params.num_class,
                resnet_version=params.version)
        base_logits = base_model(train_features, training)

        # Update ops registered under this scope plus the ones the model
        # reports for this particular input.
        update_ops = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.UPDATE_OPS, base_scope.name)
        update_ops.extend(base_model.get_updates_for(train_features))

        # Captured before the class-weight variable is created, so that
        # variable is not part of the base model's parameter list.
        base_vars = tf.compat.v1.trainable_variables(scope=base_scope.name)

        # Class weights start out uniform: a (num_class, 1) column of
        # 1 / num_class.
        uniform_init = np.full((params.num_class, 1),
                               1.0 / params.num_class,
                               dtype=np.float32)
        class_weights = tf.compat.v1.get_variable('class_weight',
                                                  initializer=uniform_init)

        # Per-example weight = the class weight selected by the label.
        label_selector = tf.one_hot(train_labels,
                                    len(uniform_init),
                                    on_value=1,
                                    off_value=0)
        weight = tf.matmul(tf.cast(label_selector, tf.float32), class_weights)

        # Weighted data term plus L2 regularisation over the base variables.
        per_example_loss, base_model_preds = _loss_fn(
            base_logits,
            tf.one_hot(train_labels, params.num_class, on_value=1,
                       off_value=0))
        weighted_term = tf.reduce_mean(tf.squeeze(weight) * per_example_loss)
        base_l2_term = l2_coeff * tf.add_n(
            [tf.nn.l2_loss(var) for var in base_vars])
        base_model_loss_reduced = weighted_term + base_l2_term

    # ---- Outer (bilevel) model ----------------------------------------------
    with tf.compat.v1.variable_scope(
            'bilevel_model', reuse=tf.compat.v1.AUTO_REUSE) as target_scope:
        base_model.perturb_model_weights(base_model_loss_reduced,
                                         params.learning_rate,
                                         base_scope.name)
        target_logits = base_model(val_features, False)
        target_vars = tf.compat.v1.trainable_variables(
            scope=target_scope.name)
        target_loss, target_preds = _loss_fn(
            target_logits,
            tf.one_hot(val_labels, params.num_class, on_value=1, off_value=0))
        target_loss = tf.reduce_mean(target_loss) + l2_coeff * tf.add_n(
            [tf.nn.l2_loss(var) for var in target_vars])

    # One gradient step on the class weights w.r.t. the target loss, clipped
    # to [0, 100] and renormalised so the weights sum to (almost exactly) one.
    class_weight_grads = tf.gradients(target_loss, class_weights)
    stepped_weights = class_weights - outer_lr * class_weight_grads[0]
    clipped_weights = tf.clip_by_value(stepped_weights,
                                       clip_value_min=0.0,
                                       clip_value_max=100.0)
    update_class_weights = clipped_weights / (
        tf.reduce_sum(clipped_weights) + 2e-12)

    # Update the weight every n steps.
    weight_update_hook = utils.WeightUpdateHook1(
        class_weights,
        update_class_weights,
        every_n_steps=10,
        log_every_n_step=params.log_freq)

    # Gradients of the weighted base loss w.r.t. the base model variables.
    base_grads = tf.gradients(base_model_loss_reduced, base_vars)
    base_gradvars = zip(base_grads, base_vars)

    # Piecewise-constant schedule: the rate drops tenfold at epochs 91, 136
    # and 182 (expressed in global steps).
    lr_boundaries = [
        params.num_batches_per_epoch * epoch
        for epoch in np.array([91, 136, 182], dtype=np.int64)
    ]
    lr_values = [params.learning_rate * scale
                 for scale in [1, 0.1, 0.01, 0.001]]

    global_step = tf.compat.v1.train.get_global_step()
    learning_rate = tf.compat.v1.train.piecewise_constant(
        global_step, lr_boundaries, lr_values)

    # Define optimizer
    optimizer = tf.compat.v1.train.MomentumOptimizer(
        learning_rate=learning_rate, momentum=params.momentum)
    train_ops.append(
        optimizer.apply_gradients(base_gradvars, global_step=global_step))

    # Calculate metrics
    target_accuracy = tf.compat.v1.metrics.accuracy(val_labels,
                                                    target_preds['classes'])
    accuracy = tf.compat.v1.metrics.accuracy(train_labels,
                                             base_model_preds['classes'])
    metrics = {'obj/accuracy': accuracy}

    # The following metrics are for the binary classification scenario.
    # They should be adopted for multiclass classification tasks.
    if params.num_class == 2:
        # "Minor" metrics treat label 0 as the positive class by inverting
        # both the labels and the predictions.
        is_positive_label = tf.cast(train_labels, tf.bool)
        inverse_labels = tf.cast(tf.math.logical_not(is_positive_label),
                                 tf.float32)
        is_positive_pred = tf.cast(base_model_preds['classes'], tf.bool)
        inverse_preds = tf.cast(tf.math.logical_not(is_positive_pred),
                                tf.float32)
        recall_minor = tf.compat.v1.metrics.recall(inverse_labels,
                                                   inverse_preds)
        recall_major = tf.compat.v1.metrics.recall(
            train_labels, base_model_preds['classes'])
        precision_minor = tf.compat.v1.metrics.precision(
            inverse_labels, inverse_preds)
        metrics.update({
            'metrics/recall_minor': recall_minor,
            'metrics/recall_major': recall_major,
            'metrics/precision_minor': precision_minor
        })

    examples_sec_hook = utils.ExamplesPerSecondHook(
        params.train_batch_size, every_n_steps=params.log_freq)

    logging_hook = tf.estimator.LoggingTensorHook(
        tensors={
            'Target loss': target_loss,
            'Main loss': base_model_loss_reduced,
            'Target accuracy': target_accuracy[1],
            'Main accuracy': accuracy[1],
            'learning_rates': learning_rate,
            'step': global_step
        },
        every_n_iter=params.log_freq)

    # The optimizer step and the collected update ops run as one train op.
    train_ops.extend(update_ops)
    grouped_train_op = tf.group(*train_ops)

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=target_preds,
        loss=base_model_loss_reduced,
        train_op=grouped_train_op,
        training_hooks=[weight_update_hook, logging_hook, examples_sec_hook],
        eval_metric_ops=metrics)
Esempio n. 23
0
                'eurosat': ['resnet56']
                }


    attack_names = ['fgsm', 'pgd', 'deepfool', 'bim', 'apgd','cw']

    ### add combinations into queues
    manager = multiprocessing.Manager()
    q = manager.Queue()

    for attack_name in attack_names:
        for dataset in datasets:
            for model_name in model_dict[dataset]:
                q.put((attack_name, dataset, model_name))




    p_list = []

    for i in range(len(get_available_gpus())):
        gpu_id = i
        p = multiprocessing.Process(target=train, args=(str(gpu_id), ))
        p_list.append(p)
        p.start()
    
    for i in p_list:
        i.join()

    print("All processed finished.")
Esempio n. 24
0
                                  patience=int(patience / 4),
                                  verbose=1)

    class MyCbk(keras.callbacks.Callback):
        """Checkpoint callback that saves an explicitly supplied model.

        The model to persist is passed in at construction time instead of
        relying on the model the callback is attached to, so a different
        model object than the one being fitted can be written to disk.
        """

        def __init__(self, model):
            super(MyCbk, self).__init__()
            self.model_to_save = model

        def on_epoch_end(self, epoch, logs=None):
            """Save the model when this epoch's `val_acc` beats the best so far.

            The reference accuracy comes from `get_highest_acc()`; the file is
            written under `checkpoint_models_path` with the epoch number and
            validation accuracy embedded in its name.
            """
            name_pattern = checkpoint_models_path + 'model.%02d-%.4f.hdf5'
            best_so_far = get_highest_acc()
            if not float(logs['val_acc']) > best_so_far:
                return
            self.model_to_save.save(name_pattern % (epoch, logs['val_acc']))

    # Load our model, added support for Multi-GPUs
    num_gpu = len(get_available_gpus())
    if num_gpu >= 2:
        with tf.device("/cpu:0"):
            model = build_model()
            if pretrained_path is not None:
                model.load_weights(pretrained_path)

        new_model = multi_gpu_model(model, gpus=num_gpu)
        # rewrite the callback: saving through the original model and not the multi-gpu model.
        model_checkpoint = MyCbk(model)
    else:
        new_model = build_model()
        if pretrained_path is not None:
            new_model.load_weights(pretrained_path)

    adam = keras.optimizers.Adam(lr=1e-4,
Esempio n. 25
0
def train(data_dict, emebedding_path, language):
    """Train a 1-D CNN on tokenised documents and plot the outcome.

    The documents are tokenised with `kt_tokenizer`, a frozen embedding layer
    is initialised from pre-trained word vectors, and a small Conv1D network
    with a single sigmoid output is fitted on the first 80% of the data. The
    best checkpoints (by validation loss) are written under ``models/``.
    Finally the loss curves and the predictions on the held-out 20% are
    plotted.

    Args:
        data_dict: iterable of records; each record must provide the keys
            'content' (document text) and 'steam_weight' (label).
        emebedding_path: currently ignored -- the path is rebuilt from
            EMBEDDING_DIR and `embedding_model_path(language)` (see the TODO
            below). The parameter is kept for interface compatibility.
        language: language identifier forwarded to the tokenizer and the
            embedding lookup; also used in checkpoint names and plot titles.

    Returns:
        None. Results are reported through stdout, checkpoint files and
        matplotlib figures.
    """
    # pre-process train data
    # data_dict = to_normal(data_dict)

    MAX_NB_WORDS = 100000
    max_seq_len = 1000

    # load train data
    raw_docs_train = [data['content'] for data in data_dict]
    labels = [data['steam_weight'] for data in data_dict]

    word_seq, word_index = kt_tokenizer(raw_docs_train, language, MAX_NB_WORDS,
                                        max_seq_len)

    print('{0} reviews in {1}'.format(len(word_seq), language))

    print('loading word embeddings...')
    emebedding_path = EMBEDDING_DIR + embedding_model_path(language)  # TODO
    embeddings_index = {}
    # The context manager guarantees the file is closed even if a line fails
    # to parse.
    with codecs.open(emebedding_path, encoding='utf-8') as f:
        for line in f:
            values = line.rstrip().rsplit(' ')
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print('found %s word vectors' % len(embeddings_index))

    y_all = np.array(labels)

    # training params
    batch_size = 256
    num_epochs = 100
    # multi_gpu_model and the log message below need a GPU *count*. Accept
    # either a count or a collection of devices from the helper.
    gpu_devices = get_available_gpus()
    if isinstance(gpu_devices, int):
        num_gpus = gpu_devices
    else:
        num_gpus = len(gpu_devices)

    # model parameters
    num_filters = 32
    embed_dim = 300
    weight_decay = 1e-4
    learning_rate = 0.001

    # output parameters
    # NOTE(review): unused -- the network below has one sigmoid output.
    num_classes = 4

    # split data
    split_persentage = 0.8
    split_index = int(len(word_seq) * split_persentage)
    word_seq_train = word_seq[:split_index]
    word_seq_test = word_seq[split_index:]
    y_train = y_all[:split_index]
    y_test = y_all[split_index:]

    # embedding matrix
    print('preparing embedding matrix...')
    words_not_found = []
    # NOTE(review): if kt_tokenizer emits 1-based indices up to
    # len(word_index), index `nb_words` itself has no row here -- confirm.
    nb_words = min(MAX_NB_WORDS, len(word_index))
    embedding_matrix = np.zeros((nb_words, embed_dim))
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        # Only accept vectors of the expected width; anything else (e.g. a
        # header line parsed as a word) would otherwise be broadcast into the
        # row or raise.
        if (embedding_vector is not None
                and len(embedding_vector) == embed_dim):
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
        else:
            words_not_found.append(word)
    print('number of null word embeddings: %d' %
          np.sum(np.sum(embedding_matrix, axis=1) == 0))
    # np.random.choice raises on an empty population, so only sample when
    # there is something to sample from.
    if words_not_found:
        sample_not_found = np.random.choice(words_not_found, 20)
    else:
        sample_not_found = []
    print("sample words not found: ", sample_not_found)

    # CNN architecture
    print("Defining CNN ...")
    model = Sequential()
    model.add(
        Embedding(nb_words,
                  embed_dim,
                  weights=[embedding_matrix],
                  input_length=max_seq_len,
                  trainable=False))
    model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
    model.add(MaxPooling1D(2))
    model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
    model.add(GlobalMaxPooling1D())
    model.add(
        Dense(16,
              activation='relu',
              kernel_regularizer=regularizers.l2(weight_decay)))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    adam = optimizers.Adam(lr=learning_rate,
                           beta_1=0.9,
                           beta_2=0.999,
                           epsilon=1e-08,
                           decay=0.0)
    # Best effort: fall back to the plain model when replication is not
    # possible. `Exception` (not a bare except) keeps Ctrl-C and SystemExit
    # working.
    try:
        model = multi_gpu_model(model, gpus=num_gpus)
        print("Training using {0} GPUs..".format(num_gpus))
    except Exception:
        print("Training using single GPU or CPU..")
    model.compile(loss='binary_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])
    print(model.summary())

    # save model
    filepath = 'models/' + language + '.' + 'weights.ep{epoch:03d}.loss{loss:.3f}.val_loss{val_loss:.3f}.h5'
    checkpoint = ModelCheckpoint(filepath,
                                 monitor='val_loss',
                                 verbose=0,
                                 save_best_only=True,
                                 save_weights_only=False,
                                 mode='auto',
                                 period=1)
    callbacks_list = [checkpoint]

    # model training
    hist = model.fit(word_seq_train,
                     y_train,
                     batch_size=batch_size,
                     epochs=num_epochs,
                     callbacks=callbacks_list,
                     validation_split=0.1,
                     shuffle=True,
                     verbose=2)

    # plot loss
    train_history = hist
    loss = train_history.history['loss']
    val_loss = train_history.history['val_loss']
    plt.title(language + ' ' + 'model')
    plt.plot(loss)
    plt.plot(val_loss)
    plt.legend(['loss', 'val_loss'])
    plt.show()

    # plot y_predict
    plt.title(language + ' ' + 'predict')
    plt.plot(y_test)
    plt.plot(model.predict(word_seq_test))
    plt.legend(['y_text', 'y_predict'])
    plt.show()