Example #1
def train(dataset, args):
    # use mask to split train/validation/test
    test_loader = loader = DataLoader(dataset,
                                      batch_size=args.batch_size,
                                      shuffle=True)

    # build model
    model = models.GNNStack(dataset.num_node_features, args.hidden_dim,
                            dataset.num_classes, args)
    scheduler, opt = utils.build_optimizer(args, model.parameters())

    # train
    for epoch in range(args.epochs):
        total_loss = 0
        model.train()
        for batch in loader:
            opt.zero_grad()
            pred = model(batch)
            label = batch.y
            pred = pred[batch.train_mask]
            label = label[batch.train_mask]
            loss = model.loss(pred, label)
            loss.backward()
            opt.step()
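            # scale the batch loss by the number of graphs in the batch so the
            # per-epoch average computed below is a per-example average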
            total_loss += loss.item() * batch.num_graphs
        total_loss /= len(loader.dataset)
        print(total_loss)

        if epoch % 10 == 0:
            test_acc = test(loader, model)
            print(test_acc, '  test')
Example #2
def train(args=None, param_path=None, **kw):
    if args is None:
        args = kw.get('args')

    if param_path is None:
        if args.param_path is not None:
            param_path = args.param_path
        else:
            param_path = './params/'
        #sys.path.append(param_path)

    model_dict, optimizer_dict, trainer_dict, data_loader_dict = get_param_dicts(
        args)

    trainer = build_trainer(trainer_dict)
    data_loader = build_data_loader(data_loader_dict)
    trainer.bind_data_loader(data_loader)
    # model can be RSLP, RMLP, RCNN ...
    model = build_model(model_dict)
    # optimizer can be BP, TP or CHL optimizer.
    optimizer = build_optimizer(optimizer_dict)
    optimizer.bind_model(model)
    optimizer.bind_trainer(trainer)
    trainer.bind_model(model)
    trainer.bind_optimizer(optimizer)

    # the model needs some data from data_loader to get response properties.
    trainer.train()
    model.analyze(data_loader=data_loader)
Example #3
def train(dataset, task, args):
    if task == 'graph':
        # graph classification: separate dataloader for test set
        data_size = len(dataset)
        loader = DataLoader(dataset[:int(data_size * 0.8)],
                            batch_size=args.batch_size,
                            shuffle=True)
        test_loader = DataLoader(dataset[int(data_size * 0.8):],
                                 batch_size=args.batch_size,
                                 shuffle=True)
    elif task == 'node':
        # use mask to split train/validation/test
        test_loader = loader = DataLoader(dataset,
                                          batch_size=args.batch_size,
                                          shuffle=True)
    else:
        raise RuntimeError('Unknown task')

    # build model
    model = models.GNNStack(dataset.num_node_features,
                            args.hidden_dim,
                            dataset.num_classes,
                            args,
                            task=task)
    print(model)
    scheduler, opt = utils.build_optimizer(args, model.parameters())

    # train
    best_val_acc = 0
    test_acc = 0

    for epoch in range(args.epochs):
        total_loss = 0
        model.train()
        for batch in loader:
            opt.zero_grad()
            pred = model(batch)
            label = batch.y
            if task == 'node':
                pred = pred[batch.train_mask]
                label = label[batch.train_mask]
            loss = model.loss(pred, label)
            loss.backward()
            opt.step()
            total_loss += loss.item() * batch.num_graphs
        total_loss /= len(loader.dataset)
        print("Loss in Epoch {0}: {1}".format(epoch, total_loss))

        if epoch % 10 == 0:
            val_acc = test(loader, model, is_validation=True)
            tmp_test_acc = test(loader, model)
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                test_acc = tmp_test_acc
            print("Current Best Val Acc {0}, with Test Acc {1}".format(
                best_val_acc, test_acc))

    print('Final Val Acc {0}, Test Acc {1}'.format(val_acc, test_acc))
Example #4
    def __init__(self, net_type, net_argv, init_path, init_argv, dim_argv,
                 batch_size, opt_argv):
        if net_type == "nn":
            self.graph = tf.Graph()
            nb_dim = dim_argv[0]
            depth, h_dims, act_func = net_argv
            with self.graph.as_default():
                var_init = []
                if not init_path:
                    _j = 0
                    for _i in range(depth - 1):
                        var_init.extend([
                            ("W{}".format(_i), [h_dims[_i], h_dims[_i + 1]],
                             init_argv[_j][0], init_argv[_j][1:]),
                            ("b{}".format(_i), [1, h_dims[_i + 1]],
                             init_argv[_j + 1][0], init_argv[_j + 1][1:])
                        ])
                        _j += 2
                var_map = init_var_map(init_path, var_init)
                self.W = [0] * (depth - 1)
                self.b = [0] * (depth - 1)
                for _i in range(depth - 1):
                    self.W[_i] = tf.Variable(var_map["W{}".format(_i)])
                    self.b[_i] = tf.Variable(var_map["b{}".format(_i)])

                self.x_vec = tf.placeholder(tf.float32, shape=[1, nb_dim])
                self.batch_x_vecs = tf.placeholder(tf.float32,
                                                   shape=[batch_size, nb_dim])
                self.batch_value_labels = tf.placeholder(tf.float32,
                                                         shape=[batch_size, 1])

                self.value_prediction = self.forward(net_type, depth, act_func,
                                                     self.x_vec,
                                                     [self.W, self.b])
                self.batch_value_predictions = self.forward(
                    net_type, depth, act_func, self.batch_x_vecs,
                    [self.W, self.b])

                square_loss_value = tf.square(self.batch_value_labels -
                                              self.batch_value_predictions)
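                # the last entry of opt_argv selects how the squared error is
                # reduced (sum or mean) before it is handed to the optimizer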
                if opt_argv[-1] == "sum":
                    self.loss_value = tf.reduce_sum(square_loss_value)
                elif opt_argv[-1] == "mean":
                    self.loss_value = tf.reduce_mean(square_loss_value)

                self.opt_value = build_optimizer(opt_argv, self.loss_value)

                #self.init = tf.initialize_all_variables()
                self.init = tf.global_variables_initializer()
        #self.log = "net_type={}\tnet_argv={}\tinit_path={}\tinit_argv={}\tdim_argv={}\tbatch_size={}\topt_argv={}" \
        #    .format(net_type, net_argv, init_path, init_argv, dim_argv, batch_size, opt_argv)
        self.log = "net_type={}\tnet_argv={}\tinit_path={}\tinit_argv={}\tdim_argv={}\tbatch_size={}\topt_argv={}" \
                .format(net_type, net_argv, init_path, init_argv, dim_argv, batch_size, opt_argv)
Example #5
def train(dataset, task, args):
    # use mask to split train/validation/test
    test_loader = loader = DataLoader(dataset,
                                      batch_size=args.batch_size,
                                      shuffle=True)

    # build model
    if args.model_type != 'APPNP':
        model = models.GNNStack(dataset.num_node_features,
                                args.hidden_dim,
                                dataset.num_classes,
                                args,
                                task=task)
    else:
        alpha = 0.1  # APPNP teleport probability; change here if needed
        niter = 10  # number of PageRank power-iteration steps
        appnp_prop = models.PPRPowerIteration(dataset.data.edge_index, alpha,
                                              niter, args.dropout)
        model = models.APPNP(dataset.num_node_features,
                             args.hidden_dim,
                             dataset.num_classes,
                             appnp_prop,
                             args,
                             task=task)
    scheduler, opt = utils.build_optimizer(args, model.parameters())

    accuracy = []
    # train
    for epoch in range(args.epochs):
        total_loss = 0
        model.train()
        for batch in loader:
            opt.zero_grad()
            pred = model(batch)
            label = batch.y
            pred = pred[batch.train_mask]
            label = label[batch.train_mask]
            loss = model.loss(pred, label)
            loss.backward()
            opt.step()
            total_loss += loss.item() * batch.num_graphs
        total_loss /= len(loader.dataset)
        print('Epoch: ', epoch, 'Training loss: ', total_loss)

        if epoch % 100 == 0:
            test_acc = test(loader, model)
            print('Test acc: ', test_acc)
            accuracy.append([epoch, test_acc])
    test_acc = test(loader, model)
    accuracy.append([args.epochs, test_acc])
    plot_accuracy(np.array(accuracy), args)
    print('Final test acc: ', test_acc)
Example #6
def train(dataset, task, args):
    if task == 'graph':
        # graph classification: separate dataloader for test set
        data_size = len(dataset)
        loader = DataLoader(dataset[:int(data_size * 0.8)],
                            batch_size=args.batch_size,
                            shuffle=True)
        test_loader = DataLoader(dataset[int(data_size * 0.8):],
                                 batch_size=args.batch_size,
                                 shuffle=True)
    elif task == 'node':
        # use mask to split train/validation/test
        test_loader = loader = DataLoader(dataset,
                                          batch_size=args.batch_size,
                                          shuffle=True)
    else:
        raise RuntimeError('Unknown task')

    # build model
    model = models.GNNStack(dataset.num_node_features,
                            args.hidden_dim,
                            dataset.num_classes,
                            args,
                            task=task)
    scheduler, opt = utils.build_optimizer(args, model.parameters())
    loss_t = []
    acc = []
    # train
    for epoch in range(args.epochs):
        total_loss = 0
        model.train()
        for batch in loader:
            opt.zero_grad()
            pred = model(batch)
            label = batch.y
            if task == 'node':
                pred = pred[batch.train_mask]
                label = label[batch.train_mask]
            loss = model.loss(pred, label)
            loss.backward()
            opt.step()
            total_loss += loss.item() * batch.num_graphs
        total_loss /= len(loader.dataset)
        loss_t.append(total_loss)
        print(total_loss)

        if epoch % 10 == 0:
            test_acc = test(loader, model)
            acc.append(test_acc)
            print(test_acc, '  test')
    print(loss_t)
    print(acc)
Example #7
def main():
    """Main workflow"""
    args = utils.build_args(argparse.ArgumentParser())

    utils.init_logger(args.model_file)

    assert torch.cuda.is_available()
    torch.cuda.set_device(args.gpuid)

    utils.init_random(args.seed)

    utils.set_params(args)
    logger.info("Config:\n%s", pformat(vars(args)))

    fields = utils.build_fields()
    logger.info("Fields: %s", fields.keys())

    logger.info("Load %s", args.train_file)
    train_data = LMDataset(fields, args.train_file, args.sent_length_trunc)
    logger.info("Training sentences: %d", len(train_data))
    logger.info("Load %s", args.valid_file)
    val_data = LMDataset(fields, args.valid_file, args.sent_length_trunc)
    logger.info("Validation sentences: %d", len(val_data))

    fields["sent"].build_vocab(train_data)

    train_iter = utils.build_dataset_iter(train_data, args)
    val_iter = utils.build_dataset_iter(val_data, args, train=False)

    if args.resume and os.path.isfile(args.checkpoint_file):
        logger.info("Resume training")
        logger.info("Load checkpoint %s", args.checkpoint_file)
        checkpoint = torch.load(args.checkpoint_file,
                                map_location=lambda storage, loc: storage)
        es_stats = checkpoint["es_stats"]
        args = utils.set_args(args, checkpoint)
    else:
        checkpoint = None
        es_stats = ESStatistics(args)

    model = utils.build_model(fields, args, checkpoint)
    logger.info("Model:\n%s", model)

    optimizer = utils.build_optimizer(model, args, checkpoint)

    try_train_val(fields, model, optimizer, train_iter, val_iter, es_stats,
                  args)
Example #8
def train(dataset, task, args):
    f1 = open(task + "_" + args.model_type + '.txt', 'w')
    if task == 'graph':
        # graph classification: separate dataloader for test set
        data_size = len(dataset)
        print("==> There are", data_size, "graphs in the dataset.")
        loader = DataLoader(
                dataset[:int(data_size * 0.8)], batch_size=args.batch_size, shuffle=True)
        test_loader = DataLoader(
                dataset[int(data_size * 0.8):], batch_size=args.batch_size, shuffle=True)
    elif task == 'node':
        print("==> There are", dataset.data.edge_index.shape[1], "edges, and", dataset.data.y.shape[0], "nodes in the dataset.")
        # use mask to split train/validation/test
        test_loader = loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)
    else:
        raise RuntimeError('Unknown task')

    # build model
    model = models.GNNStack(dataset.num_node_features, args.hidden_dim, dataset.num_classes,
                            args, task=task)
    scheduler, opt = utils.build_optimizer(args, model.parameters())

    # train
    for epoch in range(args.epochs):
        total_loss = 0
        model.train()
        for batch in loader:
            opt.zero_grad()
            pred = model(batch)
            label = batch.y
            if task == 'node':
                pred = pred[batch.train_mask]
                label = label[batch.train_mask]
            loss = model.loss(pred, label)
            loss.backward()
            opt.step()
            total_loss += loss.item() * batch.num_graphs
        total_loss /= len(loader.dataset)
        #print(total_loss)

        if epoch % 10 == 0:
            test_acc = test(loader, model)
            print("Epoch {}. Loss: {:.4f}. Test accuracy: {:.4f}".format(
                epoch, total_loss, test_acc))
            f1.write("{} {:.4f} {:.4f}\n".format(
                epoch, total_loss, test_acc))
    f1.close()
Example #9
def main():
    # parse args
    args = parse_args()

    # build data_loader
    file_path = args.file_path
    data_loader = build_train_loader(file_path)

    device = torch.device("cpu")
    model = build_model().to(device)

    optimizer = build_optimizer(model, lr=args.lr)
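    # convert the epoch-based milestones into iteration counts; the scheduler is
    # presumably stepped once per batch inside train_one_epoch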
    lr_milestones = [len(data_loader) * m for m in args.lr_milestones]
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=lr_milestones, gamma=args.lr_gamma)

    def save_model_checkpoint():
        if args.output_dir:
            checkpoint = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'epoch': epoch,
                'args': args
            }
            torch.save(
                checkpoint,
                os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))
            torch.save(checkpoint,
                       os.path.join(args.output_dir, 'checkpoint.pth'))

    print("Start training")
    start_time = time.time()
    for epoch in range(args.epochs):
        train_one_epoch(model,
                        optimizer,
                        lr_scheduler,
                        data_loader,
                        epoch,
                        args.print_freq,
                        checkpoint_fn=save_model_checkpoint)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
Example #10
    def __init__(self, config, device, resume=False):
        self.config = config
        self.cfg_stg = config['strategy']
        self.device = device

        self.model = utils.build_model(config['model'])
        self.model.to(device)

        self.logger = utils.create_logger(self.cfg_stg['save_path'])
        self.tb_logger = SummaryWriter(
            join(self.cfg_stg['save_path'], 'events'))

        self.start_epoch = 1
        if resume:
            self.load_model()
        self.optimizer = utils.build_optimizer(config['strategy'], self.model,
                                               self.start_epoch)
Example #11
    def sim(options):
        mt_model, text_processor = SenSim.load(options.model_path, tok_dir=options.tokenizer_path)

        print("Model initialization done!")
        optimizer = build_optimizer(mt_model, options.learning_rate, warump_steps=options.warmup)

        trainer = SenSimEval(model=mt_model, mask_prob=options.mask_prob, optimizer=optimizer, clip=options.clip,
                             fp16=options.fp16)

        pin_memory = torch.cuda.is_available()
        mt_dev_data = dataset.MTDataset(batch_pickle_dir=options.mt_dev_path,
                                        max_batch_capacity=options.total_capacity,
                                        max_batch=int(options.batch / (options.beam_width * 2)),
                                        pad_idx=mt_model.text_processor.pad_token_id(), keep_pad_idx=False)
        dl = data_utils.DataLoader(mt_dev_data, batch_size=1, shuffle=False, pin_memory=pin_memory)

        trainer.eval(mt_dev_iter=dl, saving_path=options.output)
Example #12
def train_model(model, loader, args):

    # build model
    scheduler, opt = utils.build_optimizer(args, model.parameters())

    # train
    for epoch in range(args.epochs):
        total_loss = 0
        model.train()
        for batch in loader:
            opt.zero_grad()
            pred = model(batch)
            label = batch.y
            pred = pred[batch.train_mask]
            label = label[batch.train_mask]
            loss = model.loss(pred, label)
            loss.backward()
            opt.step()
            total_loss += loss.item() * batch.num_graphs
        total_loss /= len(loader.dataset)
    # print(total_loss)
    return model
Example #13
def final_model_fn(features, labels, mode, params):
    """The model_fn for ConvNet to be used with TPUEstimator.
    Args:
      features: `Tensor` of batched images.
      labels: `Tensor` of labels for the data samples
      mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`
      params: `dict` of parameters passed to the model from the TPUEstimator,
          `params['batch_size']` is always provided and should be used as the
          effective batch size.
    Returns:
      A `TPUEstimatorSpec` for the model
    """
    if isinstance(features, dict):
        features = features['feature']

    # In most cases, the default data format NCHW instead of NHWC should be
    # used for a significant performance boost on GPU/TPU. NHWC should be used
    # only if the network needs to be run on CPU since the pooling operations
    # are only supported on NHWC.
    if FLAGS.data_format == 'channels_first':
        if not FLAGS.transpose_input:
            # channels_first only for GPU
            raise ValueError('The option transpose_input is set to False')
        features = tf.transpose(features, [0, 3, 1, 2])

    if FLAGS.transpose_input and mode != tf.estimator.ModeKeys.PREDICT:
        features = tf.transpose(features, [3, 0, 1, 2])  # HWCN to NHWC

    # Normalize the image to zero mean and unit variance.
    features -= tf.constant(MEAN_RGB, shape=[1, 1, 3], dtype=features.dtype)
    features /= tf.constant(STDDEV_RGB, shape=[1, 1, 3], dtype=features.dtype)

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    has_moving_average_decay = (FLAGS.moving_average_decay > 0)
    # This is essential, if using a keras-derived model.
    K.set_learning_phase(is_training)
    tf.logging.info('Using open-source implementation for MnasNet definition.')

    # Override params when necessary
    override_params = utils.get_override_params_dict(FLAGS)

    logits, _ = models.build_model(features,
                                   model_name=FLAGS.model_name,
                                   training=is_training,
                                   override_params=override_params)

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'classes': tf.argmax(logits, axis=1),
            'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
        }
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'classify': tf.estimator.export.PredictOutput(predictions)
            })

    # If necessary, in the model_fn, use params['batch_size'] instead the batch
    # size flags (--train_batch_size or --eval_batch_size).
    batch_size = params['batch_size']  # pylint: disable=unused-variable

    # Calculate loss, which includes softmax cross entropy and L2 regularization.
    one_hot_labels = tf.one_hot(labels, FLAGS.num_label_classes)
    cross_entropy = tf.losses.softmax_cross_entropy(
        logits=logits,
        onehot_labels=one_hot_labels,
        label_smoothing=FLAGS.label_smoothing)

    # Add weight decay to the loss for non-batch-normalization variables.
    loss = cross_entropy + FLAGS.weight_decay * tf.add_n([
        tf.nn.l2_loss(v) for v in tf.trainable_variables()
        if 'batch_normalization' not in v.name
    ])

    global_step = tf.train.get_global_step()
    if has_moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(
            decay=FLAGS.moving_average_decay, num_updates=global_step)
        ema_vars = tf.trainable_variables() + tf.get_collection('moving_vars')
        for v in tf.global_variables():
            # We maintain mva for batch norm moving mean and variance as well.
            if 'moving_mean' in v.name or 'moving_variance' in v.name:
                ema_vars.append(v)
        ema_vars = list(set(ema_vars))

    host_call = None
    restore_vars_dict = None
    if is_training:
        # Compute the current epoch and associated learning rate from global_step.
        current_epoch = (tf.cast(global_step, tf.float32) /
                         params['steps_per_epoch'])

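        # linear scaling rule: scale the base learning rate in proportion to the
        # global batch size relative to 256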
        scaled_lr = FLAGS.base_learning_rate * (FLAGS.train_batch_size / 256.0)
        learning_rate = utils.build_learning_rate(scaled_lr, global_step,
                                                  params['steps_per_epoch'])
        optimizer = utils.build_optimizer(learning_rate)
        if FLAGS.use_tpu:
            # When using TPU, wrap the optimizer with CrossShardOptimizer which
            # handles synchronization details between different TPU cores. To the
            # user, this should look like regular synchronous training.
            optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

        # Batch normalization requires UPDATE_OPS to be added as a dependency to
        # the train operation.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step)

        if has_moving_average_decay:
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

        if not FLAGS.skip_host_call:
            # To log the loss, current learning rate, and epoch for Tensorboard, the
            # summary op needs to be run on the host CPU via host_call. host_call
            # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
            # dimension. These Tensors are implicitly concatenated to
            # [params['batch_size']].
            gs_t = tf.reshape(global_step, [1])
            loss_t = tf.reshape(loss, [1])
            lr_t = tf.reshape(learning_rate, [1])
            ce_t = tf.reshape(current_epoch, [1])

            host_call = (train_host_call_fn, [gs_t, loss_t, lr_t, ce_t])

    else:
        train_op = None
        if has_moving_average_decay:
            # Load moving average variables for eval.
            restore_vars_dict = ema.variables_to_restore(ema_vars)

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics = (eval_metric_fn, [labels, logits])

    num_params = np.sum([np.prod(v.shape) for v in tf.trainable_variables()])
    tf.logging.info('number of trainable parameters: {}'.format(num_params))

    def _scaffold_fn():
        saver = tf.train.Saver(restore_vars_dict)
        return tf.train.Scaffold(saver=saver)

    return tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        host_call=host_call,
        eval_metrics=eval_metrics,
        scaffold_fn=_scaffold_fn if has_moving_average_decay else None)
Example #14
    def train(options):
        if not os.path.exists(options.model_path):
            os.makedirs(options.model_path)

        text_processor = TextProcessor(options.tokenizer_path)

        lm_class = ReformerLM if options.reformer else LM
        if options.pretrained_path is None:
            lm = lm_class(text_processor=text_processor,
                          size=options.model_size)
        else:
            lm = lm_class.load(options.pretrained_path)

        if options.reformer:
            lm.config.hidden_dropout_prob = options.dropout
            lm.config.local_attention_probs_dropout_prob = options.dropout
            lm.config.lsh_attention_probs_dropout_prob = options.dropout
        else:
            LMTrainer.config_dropout(lm, options.dropout)

        train_data = dataset.TextDataset(save_cache_dir=options.train_path,
                                         max_cache_size=options.cache_size)
        dev_data = dataset.TextDataset(save_cache_dir=options.dev_path,
                                       max_cache_size=options.cache_size,
                                       load_all=True)

        if options.continue_train:
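            # resume training: restore the optimizer state pickled alongside the
            # pretrained model (saved under "<pretrained_path>/optim")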
            with open(os.path.join(options.pretrained_path, "optim"),
                      "rb") as fp:
                optimizer = pickle.load(fp)
        else:
            optimizer = build_optimizer(lm, options.learning_rate,
                                        options.warmup)

        trainer = LMTrainer(model=lm,
                            mask_prob=options.mask_prob,
                            optimizer=optimizer,
                            clip=options.clip)

        collator = dataset.TextCollator(pad_idx=text_processor.pad_token_id())
        train_sampler, dev_sampler = None, None

        pin_memory = torch.cuda.is_available()
        loader = data_utils.DataLoader(train_data,
                                       batch_size=options.batch,
                                       shuffle=False,
                                       pin_memory=pin_memory,
                                       collate_fn=collator,
                                       sampler=train_sampler)
        dev_loader = data_utils.DataLoader(dev_data,
                                           batch_size=options.batch,
                                           shuffle=False,
                                           pin_memory=pin_memory,
                                           collate_fn=collator,
                                           sampler=dev_sampler)

        step, train_epoch = 0, 1
        while step <= options.step:
            print("train epoch", train_epoch)
            step = trainer.train_epoch(data_iter=loader,
                                       dev_data_iter=dev_loader,
                                       saving_path=options.model_path,
                                       step=step)
Example #15
def model_fn(features, labels, mode, params):
    """The model_fn to be used with TPUEstimator.

  Args:
    features: `Tensor` of batched images.
    labels: `Tensor` of labels for the data samples
    mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`
    params: `dict` of parameters passed to the model from the TPUEstimator,
        `params['batch_size']` is always provided and should be used as the
        effective batch size.

  Returns:
    A `TPUEstimatorSpec` for the model
  """
    if isinstance(features, dict):
        features = features['feature']

    stats_shape = [1, 1, 3]

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    has_moving_average_decay = (FLAGS.moving_average_decay > 0)
    # This is essential, if using a keras-derived model.
    tf.keras.backend.set_learning_phase(is_training)
    tf.logging.info('Using open-source implementation.')
    override_params = {}
    if FLAGS.batch_norm_momentum is not None:
        override_params['batch_norm_momentum'] = FLAGS.batch_norm_momentum
    if FLAGS.batch_norm_epsilon is not None:
        override_params['batch_norm_epsilon'] = FLAGS.batch_norm_epsilon
    if FLAGS.dropout_rate is not None:
        override_params['dropout_rate'] = FLAGS.dropout_rate
    if FLAGS.drop_connect_rate is not None:
        override_params['drop_connect_rate'] = FLAGS.drop_connect_rate
    if FLAGS.num_label_classes:
        override_params['num_classes'] = FLAGS.num_label_classes
    if FLAGS.depth_coefficient:
        override_params['depth_coefficient'] = FLAGS.depth_coefficient
    if FLAGS.width_coefficient:
        override_params['width_coefficient'] = FLAGS.width_coefficient

    def normalize_features(features, mean_rgb, stddev_rgb):
        """Normalize the image given the means and stddevs."""
        features -= tf.constant(mean_rgb,
                                shape=stats_shape,
                                dtype=features.dtype)
        features /= tf.constant(stddev_rgb,
                                shape=stats_shape,
                                dtype=features.dtype)
        return features

    def build_model():
        """Build model using the model_name given through the command line."""
        model_builder = None
        if FLAGS.model_name.startswith('efficientnet'):
            model_builder = efficientnet_builder
        else:
            raise ValueError('Model must be an efficientnet-b* variant')

        normalized_features = normalize_features(features,
                                                 model_builder.MEAN_RGB,
                                                 model_builder.STDDEV_RGB)
        logits, _ = model_builder.build_model(normalized_features,
                                              model_name=FLAGS.model_name,
                                              training=is_training,
                                              override_params=override_params,
                                              model_dir=FLAGS.model_dir)
        return logits

    logits = build_model()

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'classes': tf.argmax(logits, axis=1),
            'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
        }
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'classify': tf.estimator.export.PredictOutput(predictions)
            })

    # Calculate loss, which includes softmax cross entropy and L2 regularization.
    one_hot_labels = tf.one_hot(labels, FLAGS.num_label_classes)
    cross_entropy = tf.losses.softmax_cross_entropy(
        logits=logits,
        onehot_labels=one_hot_labels,
        label_smoothing=FLAGS.label_smoothing)

    # Add weight decay to the loss for non-batch-normalization variables.
    loss = cross_entropy + FLAGS.weight_decay * tf.add_n([
        tf.nn.l2_loss(v) for v in tf.trainable_variables()
        if 'batch_normalization' not in v.name
    ])

    global_step = tf.train.get_global_step()
    if has_moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(
            decay=FLAGS.moving_average_decay, num_updates=global_step)
        ema_vars = utils.get_ema_vars()

    train_op = None
    restore_vars_dict = None
    training_hooks = []
    if is_training:
        # Compute the current epoch and associated learning rate from global_step.
        current_epoch = (tf.cast(global_step, tf.float32) /
                         params['steps_per_epoch'])

        scaled_lr = FLAGS.base_learning_rate * (FLAGS.train_batch_size / 256.0)
        learning_rate = utils.build_learning_rate(scaled_lr, global_step,
                                                  params['steps_per_epoch'])
        optimizer = utils.build_optimizer(learning_rate, optimizer_name='adam')

        # Batch normalization requires UPDATE_OPS to be added as a dependency to
        # the train operation.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step)

        if has_moving_average_decay:
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

        predictions = tf.argmax(logits, axis=1)
        top1_accuracy = tf.metrics.accuracy(labels, predictions)
        logging_hook = tf.train.LoggingTensorHook(
            {
                "loss": loss,
                "accuracy": top1_accuracy[1],
                "step": global_step
            },
            every_n_iter=1)
        training_hooks.append(logging_hook)

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:
        predictions = tf.argmax(logits, axis=1)
        top1_accuracy = tf.metrics.accuracy(labels, predictions)
        eval_metrics = {'val_accuracy': top1_accuracy}

    num_params = np.sum([np.prod(v.shape) for v in tf.trainable_variables()])
    tf.logging.info('number of trainable parameters: {}'.format(num_params))

    scaffold = None
    if has_moving_average_decay and not is_training:
        # Only apply scaffold for eval jobs: restore the EMA shadow variables.
        restore_vars_dict = ema.variables_to_restore(ema_vars)
        saver = tf.train.Saver(restore_vars_dict)
        scaffold = tf.train.Scaffold(saver=saver)

    return tf.estimator.EstimatorSpec(mode=mode,
                                      loss=loss,
                                      train_op=train_op,
                                      training_hooks=training_hooks,
                                      eval_metric_ops=eval_metrics,
                                      scaffold=scaffold)
Example #16
def train(args, model_id, tb):
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    train_data = MedicalEasyEnsembleDataloader(args.train_data, args.class_id,
                                               args.batch_size, True,
                                               args.num_workers)
    val_data = MedicalEasyEnsembleDataloader(args.val_data, args.class_id,
                                             args.batch_size, False,
                                             args.num_workers)
    if os.path.exists(args.w2v_file):
        embedding = utils.load_embedding(args.w2v_file,
                                         vocab_size=args.vocab_size,
                                         embedding_size=args.embedding_size)
    else:
        embedding = None
    if args.model_type == 'lstm':
        model = models.LSTMModel(args, embedding)
    elif args.model_type == 'conv':
        model = models.ConvModel(args, embedding)
    elif args.model_type == 'char':
        model = models.CharCNNModel(args, embedding)
    elif args.model_type == 'base':
        model = models.BaseModel(args, embedding)
    else:
        raise NotImplementedError
    if os.path.isfile(
            os.path.join(args.checkpoint_path, str(args.class_id),
                         "%s_%s" % (args.model_type, args.type_suffix),
                         "model_%d.pth" % model_id)):
        print("Load %d class %s type %dth model from previous step" %
              (args.class_id, args.model_type, model_id))
        model.load_state_dict(
            torch.load(
                os.path.join(args.checkpoint_path, str(args.class_id),
                             "%s_%s" % (args.model_type, args.type_suffix),
                             "model_%d.pth" % model_id)))
    iteration = 0
    model = model.cuda(args.device)
    model.train()
    optimizer = utils.build_optimizer(args, model)
    loss_func = MultiBceLoss()
    cur_worse = 1000
    bad_times = 0
    for epoch in range(args.epochs):
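        # step-wise exponential LR decay: after start_epoch, multiply the base
        # learning rate by decay_rate every decay_every epochs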
        if epoch >= args.start_epoch:
            factor = (epoch - args.start_epoch) // args.decay_every
            decay_factor = args.decay_rate**factor
            current_lr = args.lr * decay_factor
            utils.set_lr(optimizer, current_lr)
        # if epoch != 0 and epoch % args.sample_every == 0:
        #     train_data.re_sample()
        for i, data in enumerate(train_data):
            tmp = [
                _.cuda(args.device) if isinstance(_, torch.Tensor) else _
                for _ in data
            ]
            report_ids, sentence_ids, sentence_lengths, output_vec = tmp
            optimizer.zero_grad()
            loss = loss_func(model(sentence_ids, sentence_lengths), output_vec)
            loss.backward()
            train_loss = loss.item()
            optimizer.step()
            iteration += 1
            if iteration % args.print_every == 0:
                print("iter %d epoch %d loss: %.3f" %
                      (iteration, epoch, train_loss))

            if iteration % args.save_every == 0:
                torch.save(
                    model.state_dict(),
                    os.path.join(args.checkpoint_path, str(args.class_id),
                                 "%s_%s" % (args.model_type, args.type_suffix),
                                 "model_%d.pth" % model_id))
                with open(os.path.join(args.checkpoint_path,
                                       str(args.class_id), "config.json"),
                          'w',
                          encoding='utf-8') as config_f:
                    json.dump(vars(args), config_f, indent=2)
                with open(os.path.join(
                        args.checkpoint_path, str(args.class_id),
                        "%s_%s" % (args.model_type, args.type_suffix),
                        "config.json"),
                          'w',
                          encoding='utf-8') as config_f:
                    json.dump(vars(args), config_f, indent=2)
            if iteration % args.val_every == 0:
                val_loss = eval_model(model, loss_func, val_data, epoch)
                tb.add_scalar("model_%d val_loss" % model_id, val_loss,
                              iteration)
                if val_loss > cur_worse:
                    print("Bad Time Appear")
                    cur_worse = val_loss
                    bad_times += 1
                else:
                    cur_worse = val_loss
                    bad_times = 0
                if bad_times > args.patient:
                    print('Early Stop !!!!')
                    return
            if iteration % args.loss_log_every == 0:
                tb.add_scalar("model_%d train_loss" % model_id, loss.item(),
                              iteration)

    print("The train finished")
def train(dataset, task, args):
    global device

    if task == 'graph':
        # graph classification: separate dataloader for test set
        # shuffle dataset before splitting
        data_size = len(dataset)
        idxs = np.arange(data_size).astype(int)
        np.random.shuffle(idxs)
        idxs = list(idxs)
        dataset = dataset[idxs]

        loader = DataLoader(dataset[:int(data_size * 0.8)],
                            batch_size=args.batch_size,
                            shuffle=True)
        test_loader = DataLoader(dataset[int(data_size * 0.8):],
                                 batch_size=args.batch_size,
                                 shuffle=True)
    elif task == 'node':
        # use mask to split train/validation/test
        test_loader = loader = DataLoader(dataset,
                                          batch_size=args.batch_size,
                                          shuffle=True)
    else:
        raise RuntimeError('Unknown task')

    # build model
    model = models.GNNStack(dataset.num_node_features,
                            args.hidden_dim,
                            dataset.num_classes,
                            args,
                            task=task)
    model = model.to(device)
    print(model)
    scheduler, opt = utils.build_optimizer(args, model.parameters())

    # train
    test_accs = []
    best_acc = 0
    timestr = time.strftime("%Y%m%d-%H%M%S")
    for epoch in range(args.epochs):
        total_loss = 0
        model.train()
        for batch in loader:
            batch = batch.to(device)
            opt.zero_grad()
            pred = model(batch)
            label = batch.y
            if task == 'node':
                pred = pred[batch.train_mask]
                label = label[batch.train_mask]
            loss = model.loss(pred, label)
            loss.backward()
            opt.step()
            total_loss += loss.item() * batch.num_graphs
        total_loss /= len(loader.dataset)
        print(total_loss)

        if epoch % 10 == 0:
            if task == 'graph':
                test_acc = test(test_loader, model)
            else:
                test_acc = test(loader, model, is_validation=True)
            test_accs.append(test_acc)
            print(test_acc, '  test')
            # save best model
            if test_acc > best_acc:
                best_acc = test_acc
                torch.save(model.state_dict(),
                           str(args.model_type) + timestr + '.pt')
            # plot accuracies
            x = range(0, epoch + 1, 10)
            plt.plot(x, test_accs)
            plt.savefig(str(args.model_type) + timestr + '.png')

    print(f'best achieved accuracy: {best_acc}')
    if model.task == 'node':
        best_model = models.GNNStack(dataset.num_node_features,
                                     args.hidden_dim,
                                     dataset.num_classes,
                                     args,
                                     task=task)
        best_model.load_state_dict(
            torch.load(str(args.model_type) + timestr + '.pt'))
        best_model = best_model.to(device)
        test_acc = test(loader, best_model, is_validation=False)
        print(f'test accuracy: {test_acc}')
Example #18
def train(model, A, X, L, args, normalize_adjacency=False):
    num_nodes = A.shape[0]
    num_train = int(num_nodes * args.train_ratio)
    idx = [i for i in range(num_nodes)]

    np.random.shuffle(idx)
    train_idx = idx[:num_train]
    test_idx = idx[num_train:]

    if normalize_adjacency:
        A_ = normalize_A(A)
    else:
        A_ = A

    # add batch dim
    A_ = np.expand_dims(A_, axis=0)
    X_ = np.expand_dims(X, axis=0)
    L_ = np.expand_dims(L, axis=0)

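    # whole-graph training: a single batch holds the full adjacency and feature
    # tensors, and the train/test split is realized by indexing nodes with
    # train_idx / test_idx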
    labels_train = torch.tensor(L_[:, train_idx], dtype=torch.long)
    adj = torch.tensor(A_, dtype=torch.float)
    x = torch.tensor(X_, requires_grad=True, dtype=torch.float)
    scheduler, optimizer = utils.build_optimizer(
        args, model.parameters(), weight_decay=args.weight_decay)
    model.train()

    ypred = None
    for epoch in range(args.num_epochs):
        begin_time = time.time()
        model.zero_grad()
        ypred, adj_att = model(x, adj)
        ypred_train = ypred[:, train_idx, :]
        loss = model.loss(ypred_train, labels_train)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()
        elapsed = time.time() - begin_time
        result_train, result_test = evaluate_node(ypred.cpu(), L_, train_idx,
                                                  test_idx)
        if epoch % 10 == 0:
            print(
                "epoch: ",
                epoch,
                "; loss: ",
                loss.item(),
                "; train_acc: ",
                result_train["acc"],
                "; test_acc: ",
                result_test["acc"],
                "; train_prec: ",
                result_train["prec"],
                "; test_prec: ",
                result_test["prec"],
                "; epoch time: ",
                "{0:0.2f}".format(elapsed),
            )

        if scheduler is not None:
            scheduler.step()

    print(result_train["conf_mat"])
    print(result_test["conf_mat"])

    model.eval()
    ypred, _ = model(x, adj)

    save_data = {
        "adj": A_,
        "feat": X_,
        "label": L_,
        "pred": ypred.cpu().detach().numpy(),
        "train_idx": train_idx,
    }

    utils.save_checkpoint(model,
                          optimizer,
                          args,
                          num_epochs=-1,
                          save_data=save_data)
Example #19
def model_fn(features, mode, params):
    '''The model_fn to be used with TPUEstimator.

  Args:
    features: `dict` of batched inputs; `features['image']` holds the images
        and, outside PREDICT mode, `features['label']` holds the labels.
    mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`
    params: `dict` of parameters passed to the model from the TPUEstimator,
        `params['batch_size']` is always provided and should be used as the
        effective batch size.

  Returns:
    A `TPUEstimatorSpec` for the model
  '''
    def preprocess_image(image):
        # In most cases, the default data format NCHW instead of NHWC should be
        # used for a significant performance boost on GPU. NHWC should be used
        # only if the network needs to be run on CPU since the pooling operations
        # are only supported on NHWC. TPU uses XLA compiler to figure out best layout.
        if FLAGS.data_format == 'channels_first':
            assert not FLAGS.transpose_input  # channels_first only for GPU
            image = tf.transpose(image, [0, 3, 1, 2])

        if FLAGS.transpose_input and mode == tf.estimator.ModeKeys.TRAIN:
            image = tf.transpose(image, [3, 0, 1, 2])  # HWCN to NHWC
        return image

    def normalize_image(image):
        # Normalize the image to zero mean and unit variance.
        if FLAGS.data_format == 'channels_first':
            stats_shape = [3, 1, 1]
        else:
            stats_shape = [1, 1, 3]
        mean, std = task_info.get_mean_std(FLAGS.task_name)
        image -= tf.constant(mean, shape=stats_shape, dtype=image.dtype)
        image /= tf.constant(std, shape=stats_shape, dtype=image.dtype)
        return image

    image = features['image']
    image = preprocess_image(image)

    image_shape = image.get_shape().as_list()
    tf.logging.info('image shape: {}'.format(image_shape))
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    if mode != tf.estimator.ModeKeys.PREDICT:
        labels = features['label']
    else:
        labels = None

    # If necessary, in the model_fn, use params['batch_size'] instead the batch
    # size flags (--train_batch_size or --eval_batch_size).
    batch_size = params['batch_size']  # pylint: disable=unused-variable

    if FLAGS.unlabel_ratio and is_training:
        unl_bsz = features['unl_probs'].shape[0]
    else:
        unl_bsz = 0

    lab_bsz = image.shape[0] - unl_bsz
    assert lab_bsz == batch_size

    metric_dict = {}
    global_step = tf.train.get_global_step()

    has_moving_average_decay = (FLAGS.moving_average_decay > 0)
    # This is essential, if using a keras-derived model.
    tf.keras.backend.set_learning_phase(is_training)
    tf.logging.info('Using open-source implementation.')
    override_params = {}
    if FLAGS.dropout_rate is not None:
        override_params['dropout_rate'] = FLAGS.dropout_rate
    if FLAGS.stochastic_depth_rate is not None:
        override_params['stochastic_depth_rate'] = FLAGS.stochastic_depth_rate
    if FLAGS.data_format:
        override_params['data_format'] = FLAGS.data_format
    if FLAGS.num_label_classes:
        override_params['num_classes'] = FLAGS.num_label_classes
    if FLAGS.depth_coefficient:
        override_params['depth_coefficient'] = FLAGS.depth_coefficient
    if FLAGS.width_coefficient:
        override_params['width_coefficient'] = FLAGS.width_coefficient

    def build_model(scope=None,
                    reuse=tf.AUTO_REUSE,
                    model_name=None,
                    model_is_training=None,
                    input_image=None,
                    use_adv_bn=False,
                    is_teacher=False):
        model_name = model_name or FLAGS.model_name
        if model_is_training is None:
            model_is_training = is_training
        if input_image is None:
            input_image = image
        input_image = normalize_image(input_image)

        scope_model_name = model_name

        if scope:
            scope = scope + '/'
        else:
            scope = ''
        with tf.variable_scope(scope + scope_model_name, reuse=reuse):
            if model_name.startswith('efficientnet'):
                logits, _ = efficientnet_builder.build_model(
                    input_image,
                    model_name=model_name,
                    training=model_is_training,
                    override_params=override_params,
                    model_dir=FLAGS.model_dir,
                    use_adv_bn=use_adv_bn,
                    is_teacher=is_teacher)
            else:
                assert False, 'model {} not implemented'.format(model_name)
        return logits

    if params['use_bfloat16']:
        with tf.tpu.bfloat16_scope():
            logits = tf.cast(build_model(), tf.float32)
    else:
        logits = build_model()

    if FLAGS.teacher_model_name:
        teacher_image = preprocess_image(features['teacher_image'])
        if params['use_bfloat16']:
            with tf.tpu.bfloat16_scope():
                teacher_logits = tf.cast(
                    build_model(scope='teacher_model',
                                model_name=FLAGS.teacher_model_name,
                                model_is_training=False,
                                input_image=teacher_image,
                                is_teacher=True), tf.float32)
        else:
            teacher_logits = build_model(scope='teacher_model',
                                         model_name=FLAGS.teacher_model_name,
                                         model_is_training=False,
                                         input_image=teacher_image,
                                         is_teacher=True)
        teacher_logits = tf.stop_gradient(teacher_logits)
        if FLAGS.teacher_softmax_temp != -1:
            teacher_prob = tf.nn.softmax(teacher_logits /
                                         FLAGS.teacher_softmax_temp)
        else:
            teacher_prob = None
            teacher_one_hot_pred = tf.argmax(teacher_logits,
                                             axis=1,
                                             output_type=labels.dtype)

    if mode == tf.estimator.ModeKeys.PREDICT:
        if has_moving_average_decay:
            ema = tf.train.ExponentialMovingAverage(
                decay=FLAGS.moving_average_decay)
            ema_vars = utils.get_all_variable()
            restore_vars_dict = ema.variables_to_restore(ema_vars)
            tf.logging.info(
                'restored variables:\n%s',
                json.dumps(sorted(restore_vars_dict.keys()), indent=4))

        predictions = {
            'classes': tf.argmax(logits, axis=1),
            'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
        }
        return tf.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            predictions=predictions,
            scaffold_fn=functools.partial(_scaffold_fn,
                                          restore_vars_dict=restore_vars_dict)
            if has_moving_average_decay else None)

    if has_moving_average_decay:
        ema_step = global_step
        ema = tf.train.ExponentialMovingAverage(
            decay=FLAGS.moving_average_decay, num_updates=ema_step)
        ema_vars = utils.get_all_variable()

    lab_labels = labels[:lab_bsz]
    lab_logits = logits[:lab_bsz]
    lab_pred = tf.argmax(lab_logits, axis=-1, output_type=labels.dtype)
    lab_prob = tf.nn.softmax(lab_logits)
    lab_acc = tf.to_float(tf.equal(lab_pred, lab_labels))
    metric_dict['lab/acc'] = tf.reduce_mean(lab_acc)
    metric_dict['lab/pred_prob'] = tf.reduce_mean(
        tf.reduce_max(lab_prob, axis=-1))
    one_hot_labels = tf.one_hot(lab_labels, FLAGS.num_label_classes)

    if FLAGS.unlabel_ratio:
        unl_labels = labels[lab_bsz:]
        unl_logits = logits[lab_bsz:]
        unl_pred = tf.argmax(unl_logits, axis=-1, output_type=labels.dtype)
        unl_prob = tf.nn.softmax(unl_logits)
        unl_acc = tf.to_float(tf.equal(unl_pred, unl_labels))
        metric_dict['unl/acc_to_dump'] = tf.reduce_mean(unl_acc)
        metric_dict['unl/pred_prob'] = tf.reduce_mean(
            tf.reduce_max(unl_prob, axis=-1))

    # compute lab_loss
    one_hot_labels = tf.one_hot(lab_labels, FLAGS.num_label_classes)
    lab_loss = tf.losses.softmax_cross_entropy(
        logits=lab_logits,
        onehot_labels=one_hot_labels,
        label_smoothing=FLAGS.label_smoothing,
        reduction=tf.losses.Reduction.NONE)
    if FLAGS.label_data_sample_prob != 1:
        # mask out part of the labeled data
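        # tf.floor(p + U[0, 1)) is 1 with probability p and 0 otherwise, i.e. a
        # Bernoulli(p) mask over the per-example losses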
        random_mask = tf.floor(
            FLAGS.label_data_sample_prob +
            tf.random_uniform(tf.shape(lab_loss), dtype=lab_loss.dtype))
        lab_loss = tf.reduce_mean(lab_loss * random_mask)
    else:
        lab_loss = tf.reduce_mean(lab_loss)
    metric_dict['lab/loss'] = lab_loss

    if FLAGS.unlabel_ratio:
        if FLAGS.teacher_softmax_temp == -1:  # Hard labels
            # Get one-hot labels
            if FLAGS.teacher_model_name:
                ext_teacher_pred = teacher_one_hot_pred[lab_bsz:]
                one_hot_labels = tf.one_hot(ext_teacher_pred,
                                            FLAGS.num_label_classes)
            else:
                one_hot_labels = tf.one_hot(unl_labels,
                                            FLAGS.num_label_classes)
            # Compute cross entropy
            unl_loss = tf.losses.softmax_cross_entropy(
                logits=unl_logits,
                onehot_labels=one_hot_labels,
                label_smoothing=FLAGS.label_smoothing)
        else:  # Soft labels
            # Get teacher prob
            if FLAGS.teacher_model_name:
                unl_teacher_prob = teacher_prob[lab_bsz:]
            else:
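                # sharpen the stored teacher distribution with temperature T:
                # raise each probability to 1/T and renormalize over classes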
                scaled_prob = tf.pow(features['unl_probs'],
                                     1 / FLAGS.teacher_softmax_temp)
                unl_teacher_prob = scaled_prob / tf.reduce_sum(
                    scaled_prob, axis=-1, keepdims=True)
            metric_dict['unl/target_prob'] = tf.reduce_mean(
                tf.reduce_max(unl_teacher_prob, axis=-1))
            unl_loss = cross_entropy(unl_teacher_prob,
                                     unl_logits,
                                     return_mean=True)

        metric_dict['ext/loss'] = unl_loss
    else:
        unl_loss = 0

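    # combine the two losses, weighting the unlabeled term by its batch size
    # relative to the (sub-sampled) labeled batch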
    real_lab_bsz = tf.to_float(lab_bsz) * FLAGS.label_data_sample_prob
    real_unl_bsz = batch_size * FLAGS.label_data_sample_prob * FLAGS.unlabel_ratio
    data_loss = lab_loss * real_lab_bsz + unl_loss * real_unl_bsz
    data_loss = data_loss / real_lab_bsz

    # Add weight decay to the loss for non-batch-normalization variables.
    loss = data_loss + FLAGS.weight_decay * tf.add_n([
        tf.nn.l2_loss(v) for v in tf.trainable_variables()
        if 'batch_normalization' not in v.name
    ])
    metric_dict['train/data_loss'] = data_loss
    metric_dict['train/loss'] = loss

    host_call = None
    restore_vars_dict = None

    if is_training:
        # Compute the current epoch and associated learning rate from global_step.
        current_epoch = (tf.cast(global_step, tf.float32) /
                         params['steps_per_epoch'])
        real_train_batch_size = FLAGS.train_batch_size
        real_train_batch_size *= FLAGS.label_data_sample_prob
        scaled_lr = FLAGS.base_learning_rate * (real_train_batch_size / 256.0)
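        # Linear LR scaling rule: the base learning rate is scaled by the
        # effective labeled batch size relative to a reference batch of 256.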
        if FLAGS.final_base_lr:
            # total number of training epochs
            total_epochs = FLAGS.train_steps * FLAGS.train_batch_size * 1. / FLAGS.num_train_images - 5
            decay_times = math.log(FLAGS.final_base_lr /
                                   FLAGS.base_learning_rate) / math.log(0.97)
            decay_epochs = total_epochs / decay_times
            tf.logging.info(
                'setting decay_epochs to {:.2f}'.format(decay_epochs) +
                '\n' * 3)
        else:
            decay_epochs = 2.4 * FLAGS.train_ratio
        learning_rate = utils.build_learning_rate(
            scaled_lr,
            global_step,
            params['steps_per_epoch'],
            decay_epochs=decay_epochs,
            start_from_step=FLAGS.train_steps - FLAGS.train_last_step_num,
            warmup_epochs=5,
        )
        metric_dict['train/lr'] = learning_rate
        metric_dict['train/epoch'] = current_epoch
        optimizer = utils.build_optimizer(learning_rate)
        if FLAGS.use_tpu:
            # When using TPU, wrap the optimizer with CrossShardOptimizer which
            # handles synchronization details between different TPU cores. To the
            # user, this should look like regular synchronous training.
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)

        # Batch normalization requires UPDATE_OPS to be added as a dependency to
        # the train operation.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        tvars = tf.trainable_variables()
        g_vars = []
        tvars = sorted(tvars, key=lambda var: var.name)
        for var in tvars:
            if 'teacher_model' not in var.name:
                g_vars += [var]
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step, var_list=g_vars)

        if has_moving_average_decay:
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

        if not FLAGS.skip_host_call:
            host_call = utils.construct_scalar_host_call(metric_dict)
        scaffold_fn = None
        if FLAGS.teacher_model_name or FLAGS.init_model:
            scaffold_fn = utils.init_from_ckpt(scaffold_fn)
    else:
        train_op = None
        if has_moving_average_decay:
            # Load moving average variables for eval.
            restore_vars_dict = ema.variables_to_restore(ema_vars)

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:
        scaffold_fn = functools.partial(_scaffold_fn,
                                        restore_vars_dict=restore_vars_dict
                                        ) if has_moving_average_decay else None

        def metric_fn(labels, logits):
            '''Evaluation metric function. Evaluates accuracy.

      This function is executed on the CPU and should not directly reference
      any Tensors in the rest of the `model_fn`. To pass Tensors from the model
      to the `metric_fn`, provide as part of the `eval_metrics`. See
      https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
      for more information.

      Arguments should match the list of `Tensor` objects passed as the second
      element in the tuple passed to `eval_metrics`.

      Args:
        labels: `Tensor` with shape `[batch]`.
        logits: `Tensor` with shape `[batch, num_classes]`.

      Returns:
        A dict of the metrics to return from evaluation.
      '''

            predictions = tf.argmax(logits, axis=1)
            top_1_accuracy = tf.metrics.accuracy(labels, predictions)
            in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            top_5_accuracy = tf.metrics.mean(in_top_5)

            result_dict = {
                'top_1_accuracy': top_1_accuracy,
                'top_5_accuracy': top_5_accuracy,
            }

            return result_dict

        eval_metrics = (metric_fn, [labels, logits])

    num_params = np.sum([np.prod(v.shape) for v in tf.trainable_variables()])
    tf.logging.info('number of trainable parameters: {}'.format(num_params))

    return tf.estimator.tpu.TPUEstimatorSpec(mode=mode,
                                             loss=loss,
                                             train_op=train_op,
                                             host_call=host_call,
                                             eval_metrics=eval_metrics,
                                             scaffold_fn=scaffold_fn)
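
For context, here is a minimal sketch of how a model_fn like the one above is typically handed to a TPUEstimator. The function name model_fn, the input_fn, and any FLAGS not referenced above (e.g. FLAGS.iterations_per_loop, FLAGS.eval_batch_size) are illustrative assumptions, not part of this example:

# Hypothetical wiring; a TPU cluster resolver / master address would also be
# supplied in a real run.
run_config = tf.estimator.tpu.RunConfig(
    model_dir=FLAGS.model_dir,
    tpu_config=tf.estimator.tpu.TPUConfig(
        iterations_per_loop=FLAGS.iterations_per_loop))
estimator = tf.estimator.tpu.TPUEstimator(
    model_fn=model_fn,  # the model_fn defined above
    config=run_config,
    use_tpu=FLAGS.use_tpu,
    train_batch_size=FLAGS.train_batch_size,
    eval_batch_size=FLAGS.eval_batch_size,
    params={'steps_per_epoch': FLAGS.num_train_images // FLAGS.train_batch_size})
estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_steps)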
Beispiel #20
0
def mnasnet_model_fn(features, labels, mode, params):
    """The model_fn for MnasNet to be used with TPUEstimator.

  Args:
    features: `Tensor` of batched images.
    labels: `Tensor` of labels for the data samples
    mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`
    params: `dict` of parameters passed to the model from the TPUEstimator,
      `params['batch_size']` is always provided and should be used as the
      effective batch size.

  Returns:
    A `TPUEstimatorSpec` for the model
  """
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    # This is essential, if using a keras-derived model.
    K.set_learning_phase(is_training)

    if isinstance(features, dict):
        features = features['feature']

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Adds an identity node to help TFLite export.
        features = tf.identity(features, 'float_image_input')

    # In most cases, the default data format NCHW instead of NHWC should be
    # used for a significant performance boost on GPU. NHWC should be used
    # only if the network needs to be run on CPU since the pooling operations
    # are only supported on NHWC. TPUs use the XLA compiler to figure out the best layout.
    if params['data_format'] == 'channels_first':
        assert not params['transpose_input']  # channels_first only for GPU
        features = tf.transpose(features, [0, 3, 1, 2])
        stats_shape = [3, 1, 1]
    else:
        stats_shape = [1, 1, 3]

    if params['transpose_input'] and mode != tf.estimator.ModeKeys.PREDICT:
        features = tf.transpose(features, [3, 0, 1, 2])  # HWCN to NHWC

    # Normalize the image to zero mean and unit variance.
    features -= tf.constant(imagenet_input.MEAN_RGB,
                            shape=stats_shape,
                            dtype=features.dtype)
    features /= tf.constant(imagenet_input.STDDEV_RGB,
                            shape=stats_shape,
                            dtype=features.dtype)

    has_moving_average_decay = (params['moving_average_decay'] > 0)

    tf.logging.info('Using open-source implementation for MnasNet definition.')
    override_params = {}
    if params['batch_norm_momentum']:
        override_params['batch_norm_momentum'] = params['batch_norm_momentum']
    if params['batch_norm_epsilon']:
        override_params['batch_norm_epsilon'] = params['batch_norm_epsilon']
    if params['dropout_rate']:
        override_params['dropout_rate'] = params['dropout_rate']
    if params['data_format']:
        override_params['data_format'] = params['data_format']
    if params['num_label_classes']:
        override_params['num_classes'] = params['num_label_classes']
    if params['depth_multiplier']:
        override_params['depth_multiplier'] = params['depth_multiplier']
    if params['depth_divisor']:
        override_params['depth_divisor'] = params['depth_divisor']
    if params['min_depth']:
        override_params['min_depth'] = params['min_depth']
    override_params['use_keras'] = params['use_keras']

    if params['precision'] == 'bfloat16':
        with tf.contrib.tpu.bfloat16_scope():
            logits, _ = mnasnet_models.build_mnasnet_model(
                features,
                model_name=params['model_name'],
                training=is_training,
                override_params=override_params)
        logits = tf.cast(logits, tf.float32)
    else:  # params['precision'] == 'float32'
        logits, _ = mnasnet_models.build_mnasnet_model(
            features,
            model_name=params['model_name'],
            training=is_training,
            override_params=override_params)

    if params['quantized_training']:
        if is_training:
            tf.logging.info('Adding fake quantization ops for training.')
            tf.contrib.quantize.create_training_graph(
                quant_delay=int(params['steps_per_epoch'] *
                                FLAGS.quantization_delay_epochs))
        else:
            tf.logging.info('Adding fake quantization ops for evaluation.')
            tf.contrib.quantize.create_eval_graph()

    if mode == tf.estimator.ModeKeys.PREDICT:
        scaffold_fn = None
        if FLAGS.export_moving_average:
            # If the model is trained with moving average decay, to match evaluation
            # metrics, we need to export the model using moving average variables.
            restore_checkpoint = tf.train.latest_checkpoint(FLAGS.model_dir)
            variables_to_restore = get_pretrained_variables_to_restore(
                restore_checkpoint, load_moving_average=True)
            tf.logging.info('Restoring from the latest checkpoint: %s',
                            restore_checkpoint)
            tf.logging.info(str(variables_to_restore))

            def restore_scaffold():
                saver = tf.train.Saver(variables_to_restore)
                return tf.train.Scaffold(saver=saver)

            scaffold_fn = restore_scaffold

        predictions = {
            'classes': tf.argmax(logits, axis=1),
            'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
        }
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'classify': tf.estimator.export.PredictOutput(predictions)
            },
            scaffold_fn=scaffold_fn)

    # If necessary, in the model_fn, use params['batch_size'] instead of the
    # batch-size flags (--train_batch_size or --eval_batch_size).
    batch_size = params['batch_size']  # pylint: disable=unused-variable

    # Calculate loss, which includes softmax cross entropy and L2 regularization.
    one_hot_labels = tf.one_hot(labels, params['num_label_classes'])
    cross_entropy = tf.losses.softmax_cross_entropy(
        logits=logits,
        onehot_labels=one_hot_labels,
        label_smoothing=params['label_smoothing'])

    # Add weight decay to the loss for non-batch-normalization variables.
    loss = cross_entropy + params['weight_decay'] * tf.add_n([
        tf.nn.l2_loss(v) for v in tf.trainable_variables()
        if 'batch_normalization' not in v.name
    ])

    global_step = tf.train.get_global_step()
    if has_moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(
            decay=params['moving_average_decay'], num_updates=global_step)
        ema_vars = utils.get_ema_vars()

    host_call = None
    if is_training:
        # Compute the current epoch and associated learning rate from global_step.
        current_epoch = (tf.cast(global_step, tf.float32) /
                         params['steps_per_epoch'])

        scaled_lr = params['base_learning_rate'] * (params['train_batch_size'] / 256.0)  # pylint: disable=line-too-long
        learning_rate = utils.build_learning_rate(scaled_lr, global_step,
                                                  params['steps_per_epoch'])
        optimizer = utils.build_optimizer(learning_rate)
        if params['use_tpu']:
            # When using TPU, wrap the optimizer with CrossShardOptimizer which
            # handles synchronization details between different TPU cores. To the
            # user, this should look like regular synchronous training.
            optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

        # Batch normalization requires UPDATE_OPS to be added as a dependency to
        # the train operation.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step)

        if has_moving_average_decay:
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

        if not params['skip_host_call']:

            def host_call_fn(gs, loss, lr, ce):
                """Training host call.

        Creates scalar summaries for training metrics.

        This function is executed on the CPU and should not directly reference
        any Tensors in the rest of the `model_fn`. To pass Tensors from the
        model to the `metric_fn`, provide as part of the `host_call`. See
        https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
        for more information.

        Arguments should match the list of `Tensor` objects passed as the second
        element in the tuple passed to `host_call`.

        Args:
          gs: `Tensor` with shape `[batch]` for the global_step.
          loss: `Tensor` with shape `[batch]` for the training loss.
          lr: `Tensor` with shape `[batch]` for the learning_rate.
          ce: `Tensor` with shape `[batch]` for the current_epoch.

        Returns:
          List of summary ops to run on the CPU host.
        """
                gs = gs[0]
                # Host call fns are executed params['iterations_per_loop'] times after
                # one TPU loop is finished; setting max_queue to the same number of
                # iterations makes the summary writer flush the data to storage only
                # once per loop.
                with tf.contrib.summary.create_file_writer(
                        FLAGS.model_dir,
                        max_queue=params['iterations_per_loop']).as_default():
                    with tf.contrib.summary.always_record_summaries():
                        tf.contrib.summary.scalar('loss', loss[0], step=gs)
                        tf.contrib.summary.scalar('learning_rate',
                                                  lr[0],
                                                  step=gs)
                        tf.contrib.summary.scalar('current_epoch',
                                                  ce[0],
                                                  step=gs)

                        return tf.contrib.summary.all_summary_ops()

            # To log the loss, current learning rate, and epoch for Tensorboard, the
            # summary op needs to be run on the host CPU via host_call. host_call
            # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
            # dimension. These Tensors are implicitly concatenated to
            # [params['batch_size']].
            gs_t = tf.reshape(global_step, [1])
            loss_t = tf.reshape(loss, [1])
            lr_t = tf.reshape(learning_rate, [1])
            ce_t = tf.reshape(current_epoch, [1])

            host_call = (host_call_fn, [gs_t, loss_t, lr_t, ce_t])

    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(labels, logits):
            """Evaluation metric function.

      Evaluates accuracy.

      This function is executed on the CPU and should not directly reference
      any Tensors in the rest of the `model_fn`. To pass Tensors from the model
      to the `metric_fn`, provide as part of the `eval_metrics`. See
      https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
      for more information.

      Arguments should match the list of `Tensor` objects passed as the second
      element in the tuple passed to `eval_metrics`.

      Args:
        labels: `Tensor` with shape `[batch]`.
        logits: `Tensor` with shape `[batch, num_classes]`.

      Returns:
        A dict of the metrics to return from evaluation.
      """
            predictions = tf.argmax(logits, axis=1)
            top_1_accuracy = tf.metrics.accuracy(labels, predictions)
            in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            top_5_accuracy = tf.metrics.mean(in_top_5)

            return {
                'top_1_accuracy': top_1_accuracy,
                'top_5_accuracy': top_5_accuracy,
            }

        eval_metrics = (metric_fn, [labels, logits])

    num_params = np.sum([np.prod(v.shape) for v in tf.trainable_variables()])
    tf.logging.info('number of trainable parameters: {}'.format(num_params))

    # Prepares scaffold_fn if needed.
    scaffold_fn = None
    if is_training and FLAGS.init_checkpoint:
        variables_to_restore = get_pretrained_variables_to_restore(
            FLAGS.init_checkpoint, has_moving_average_decay)
        tf.logging.info('Initializing from pretrained checkpoint: %s',
                        FLAGS.init_checkpoint)
        if FLAGS.use_tpu:

            def init_scaffold():
                tf.train.init_from_checkpoint(FLAGS.init_checkpoint,
                                              variables_to_restore)
                return tf.train.Scaffold()

            scaffold_fn = init_scaffold
        else:
            tf.train.init_from_checkpoint(FLAGS.init_checkpoint,
                                          variables_to_restore)

    restore_vars_dict = None
    if not is_training and has_moving_average_decay:
        # Load moving average variables for eval.
        restore_vars_dict = ema.variables_to_restore(ema_vars)

        def eval_scaffold():
            saver = tf.train.Saver(restore_vars_dict)
            return tf.train.Scaffold(saver=saver)

        scaffold_fn = eval_scaffold

    return tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                           loss=loss,
                                           train_op=train_op,
                                           host_call=host_call,
                                           eval_metrics=eval_metrics,
                                           scaffold_fn=scaffold_fn)
Beispiel #21
0
                                  dropout=0.2,
                                  tok2id=tok2id)
if CUDA:
    model = model.cuda()

model_parameters = filter(lambda p: p.requires_grad, model.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
print('NUM PARAMS: ', params)

# # # # # # # # ## # # # ## # # OPTIMIZER, LOSS # # # # # # # # ## # # # ## # #

num_train_steps = (num_train_examples * 40)
if ARGS.pretrain_data:
    num_train_steps += (num_pretrain_examples * ARGS.pretrain_epochs)

optimizer = utils.build_optimizer(model, num_train_steps)

loss_fn, cross_entropy_loss = utils.build_loss_fn(vocab_size=len(tok2id))

writer = SummaryWriter(ARGS.working_dir)

# # # # # # # # # # # PRETRAINING (optional) # # # # # # # # # # # # # # # #
if ARGS.pretrain_data:
    print('PRETRAINING...')
    for epoch in range(ARGS.pretrain_epochs):
        model.train()
        losses = utils.train_for_epoch(
            model,
            pretrain_dataloader,
            tok2id,
            optimizer,
Beispiel #22
0
def train(args, features, weights, edges, num_features):
    """
    args - args from command line
    features - a filepath with each line being a space-delimited string of [node ID, [features], label name]
    weights - array of len(classes) indicating weight of each class when computing loss.
        Higher weight should be assigned to less common classes.
        0 means to ignore a class.
    edges - a filepath of the (directed) edges file (each line being "n1 n2" representing n1 -> n2)
    num_features - number of computed features.
    """
    # For reproducibility
    torch.manual_seed(1)
    np.random.seed(1)
    random.seed(1)

    # Load the data
    x, y, feat_data, labels, adj_list = load_dataset(args, features, edges,
                                                     num_features)
    print("Loaded dataset")

    # Define embeddings for each node to be used in aggregation (FEATURES DON'T CHANGE)
    features = nn.Embedding(NUM_NODES, num_features)
    features.weight = nn.Parameter(torch.FloatTensor(feat_data),
                                   requires_grad=False)

    # build model
    model = models.createGNN(args, features, adj_list, num_features, weights)

    # Train loop
    print("Starting training")
    f1_test = []
    accuracy_test = []
    auc_test = []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
    for train_index, test_index in skf.split(x, y):
        train, test = x[train_index], x[test_index]
        total_loss = 0

        # TODO should this be outside the loop?
        _, opt = utils.build_optimizer(args, model.parameters())

        for batch in range(1000):
            model.train()
            batch_nodes = train[:args.batch_size]
            train = np.roll(
                train, args.batch_size)  # Prepare train set for next batch.
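            # np.roll cycles the node ordering so each iteration's [:batch_size]
            # slice is a different block of training nodes, wrapping around the
            # training split.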

            opt.zero_grad()
            loss = model.loss(
                batch_nodes,
                Variable(torch.LongTensor(labels[np.array(batch_nodes)])))
            loss.backward()
            opt.step()
            total_loss += loss.data.item()

            if batch % 50 == 0:
                model.eval()
                val_output = model(test)
                labels_pred_validation = val_output.data.numpy().argmax(axis=1)
                labels_true_validation = labels[test].flatten()
                if args.dataset == "hate":
                    y_true = [
                        1 if v == 2 else 0 for v in labels_true_validation
                    ]  # label 2 is hate
                    y_pred = [
                        1 if v == 2 else 0 for v in labels_pred_validation
                    ]
                else:
                    y_true = [
                        1 if v == 1 else 0 for v in labels_true_validation
                    ]  # label 1 is suspended
                    y_pred = [
                        1 if v == 1 else 0 for v in labels_pred_validation
                    ]

                fscore = f1_score(y_true,
                                  y_pred,
                                  labels=None,
                                  pos_label=1,
                                  average='binary',
                                  sample_weight=None)
                recall = recall_score(y_true,
                                      y_pred,
                                      labels=None,
                                      pos_label=1,
                                      average='binary',
                                      sample_weight=None)
                print(confusion_matrix(y_true, y_pred))
                print('F1: {}, Recall: {}'.format(fscore, recall))

                # If we ever reach really good scores...
                if fscore > 0.70 and args.dataset == "hate":
                    break
                if fscore > 0.60 and recall > 0.8 and args.dataset != "hate":
                    break

        # TODO decompose test code?
        # For each split, evaluate AUC, accuracy, and F1 on test split.
        model.eval()
        val_output = model(test)
        if args.dataset == "hate":
            # Prediction score is the difference between hateful and non-hateful scores.
            labels_pred_score = val_output.data.numpy()[:, 2].flatten(
            ) - val_output.data.numpy()[:, 0].flatten()
        else:
            # Prediction score is the difference between suspended and active scores.
            labels_pred_score = val_output.data.numpy()[:, 1].flatten(
            ) - val_output.data.numpy()[:, 0].flatten()

        labels_true_test = labels[test].flatten()
        if args.dataset == "hate":
            y_true = [1 if v == 2 else 0 for v in labels_true_test]
        else:
            y_true = [1 if v == 1 else 0 for v in labels_true_test]

        fpr, tpr, _ = roc_curve(y_true, labels_pred_score)

        # TODO why is it different inside the training loop?
        # Prediction is the larger of the two hateful/non-hateful or suspended/active scores.
        labels_pred_test = labels_pred_score > 0
        y_pred = [1 if v else 0 for v in labels_pred_test]

        auc_test.append(auc(fpr, tpr))
        accuracy_test.append(accuracy_score(y_true, y_pred))
        f1_test.append(f1_score(y_true, y_pred))

    # Print out final accuracy, F1, AUC results.
    accuracy_test = np.array(accuracy_test)
    f1_test = np.array(f1_test)
    auc_test = np.array(auc_test)

    print("Accuracy   %0.4f +-  %0.4f" %
          (accuracy_test.mean(), accuracy_test.std()))
    print("F1    %0.4f +-  %0.4f" % (f1_test.mean(), f1_test.std()))
    print("AUC    %0.4f +-  %0.4f" % (auc_test.mean(), auc_test.std()))
Beispiel #23
0
def train(dataset, task, args):
    test_epoch, test_acc_per_epoch = [], []

    if task == 'graph':
        # graph classification: separate dataloader for test set
        data_size = len(dataset)
        dataset = dataset.shuffle()  # shuffle() returns a shuffled copy; it does not shuffle in place
        loader = DataLoader(dataset[:int(data_size * 0.8)],
                            batch_size=args.batch_size,
                            shuffle=True)
        test_loader = DataLoader(dataset[int(data_size * 0.8):],
                                 batch_size=args.batch_size,
                                 shuffle=True)
    elif task == 'node':
        # use mask to split train/validation/test
        test_loader = loader = DataLoader(dataset,
                                          batch_size=args.batch_size,
                                          shuffle=True)
    else:
        raise RuntimeError('Unknown task')

    # build model
    model = models.GNNStack(dataset.num_node_features,
                            args.hidden_dim,
                            dataset.num_classes,
                            args,
                            task=task)
    scheduler, opt = utils.build_optimizer(args, model.parameters())

    # train
    for epoch in range(args.epochs):
        total_loss = 0
        total_acc = 0
        model.train()
        for batch in loader:
            opt.zero_grad()
            pred = model(batch)
            label = batch.y
            if task == 'node':
                pred = pred[batch.train_mask]
                label = label[batch.train_mask]
            loss = model.loss(pred, label)
            loss.backward()
            opt.step()
            total_loss += loss.item() * batch.num_graphs
            total_acc += pred.max(dim=1)[1].eq(label).float().sum().item()
        total_loss /= len(loader.dataset)
        total_acc /= len(loader.dataset)
        # print(total_loss)

        if epoch % 1 == 0:
            test_acc = test(loader, model)
            print(
                f'epoch {epoch}: train loss - {total_loss:.4f}, train acc - {total_acc:.2%}, test acc - {test_acc:.2%}'
            )
            test_epoch.append(epoch)
            test_acc_per_epoch.append(test_acc)
    f, ax = plt.subplots(1, 1)
    ax.plot(np.array(test_epoch), np.array(test_acc_per_epoch))
    ax.set_title(f'{dataset.name} - {args.model_type}')
    ax.set_xlabel('epochs')
    ax.set_ylabel('accuracy')
    f.savefig(f'{dataset.name}_{args.model_type}.png',
              bbox_inches='tight',
              dpi=400)
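
The test() helper called in the epoch loop above is defined elsewhere in that codebase. Purely for illustration, here is a minimal sketch of what such an accuracy routine could look like in this setup; the model.task attribute, the is_validation flag, and the mask names are assumptions:

import torch

def test(loader, model, is_validation=False):
    # Hypothetical helper: accuracy over the loader, mirroring the masking
    # logic of the training loop above.
    model.eval()
    correct = 0
    total = 0
    for data in loader:
        with torch.no_grad():
            pred = model(data).max(dim=1)[1]
            label = data.y
        if model.task == 'node':
            mask = data.val_mask if is_validation else data.test_mask
            pred = pred[mask]
            label = label[mask]
            total += int(mask.sum().item())
        else:
            total += data.num_graphs
        correct += int(pred.eq(label).sum().item())
    return correct / total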
Beispiel #24
0
    def train(options):
        lex_dict = None
        if options.dict_path is not None:
            lex_dict = get_lex_dict(options.dict_path)
        if not os.path.exists(options.model_path):
            os.makedirs(options.model_path)

        text_processor = TextProcessor(options.tokenizer_path)
        assert text_processor.pad_token_id() == 0

        image_captioner = Seq2Seq.load(ImageCaptioning,
                                       options.pretrained_path,
                                       tok_dir=options.tokenizer_path)
        txt2ImageModel = Caption2Image(
            text_processor=text_processor,
            enc_layer=options.encoder_layer,
            embed_dim=options.embed_dim,
            intermediate_dim=options.intermediate_layer_dim)

        print("Model initialization done!")

        # We assume that the collator function returns a list with the size of number of gpus (in case of cpus,
        collator = dataset.ImageTextCollator()
        num_batches = max(1, torch.cuda.device_count())

        optimizer = build_optimizer(txt2ImageModel,
                                    options.learning_rate,
                                    warump_steps=options.warmup)

        trainer = Caption2ImageTrainer(
            model=txt2ImageModel,
            caption_model=image_captioner,
            mask_prob=options.mask_prob,
            optimizer=optimizer,
            clip=options.clip,
            beam_width=options.beam_width,
            max_len_a=options.max_len_a,
            max_len_b=options.max_len_b,
            len_penalty_ratio=options.len_penalty_ratio,
            fp16=options.fp16,
            mm_mode=options.mm_mode)

        pin_memory = torch.cuda.is_available()
        img_train_loader = ImageMTTrainer.get_img_loader(
            collator,
            dataset.ImageCaptionDatasetwNegSamples,
            options.train_path,
            txt2ImageModel,
            num_batches,
            options,
            pin_memory,
            lex_dict=lex_dict)

        img_dev_loader = ImageMTTrainer.get_img_loader(
            collator,
            dataset.ImageCaptionDatasetwNegSamples,
            options.dev_path,
            txt2ImageModel,
            num_batches,
            options,
            pin_memory,
            lex_dict=lex_dict,
            shuffle=False,
            denom=2)

        step, train_epoch = 0, 1
        while options.step > 0 and step < options.step:
            print("train epoch", train_epoch)
            step = trainer.train_epoch(img_data_iter=img_train_loader,
                                       img_dev_data_iter=img_dev_loader,
                                       max_step=options.step,
                                       lex_dict=lex_dict,
                                       saving_path=options.model_path,
                                       step=step)
            train_epoch += 1
Beispiel #25
0
def model_fn(features, labels, mode, params):
    """The model_fn to be used with TPUEstimator.

  Args:
    features: A dict of `Tensor` of batched images and other features.
    labels: a Tensor or a dict of Tensor representing the batched labels.
    mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`
    params: `dict` of parameters passed to the model from the TPUEstimator,
      `params['batch_size']` is always provided and should be used as the
      effective batch size.

  Returns:
    A `TPUEstimatorSpec` for the model
  """
    logging.info('params=%s', params)
    images = features['image'] if isinstance(features, dict) else features
    labels = labels['label'] if isinstance(labels, dict) else labels
    config = params['config']
    image_size = params['image_size']
    utils.scalar('model/resolution', image_size)

    if config.model.data_format == 'channels_first':
        images = tf.transpose(images, [0, 3, 1, 2])

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    has_moving_average_decay = (config.train.ema_decay > 0)
    if FLAGS.use_tpu and not config.model.bn_type:
        config.model.bn_type = 'tpu_bn'
    # This is essential, if using a keras-derived model.
    tf.keras.backend.set_learning_phase(is_training)

    def build_model(in_images):
        """Build model using the model_name given through the command line."""
        config.model.num_classes = config.data.num_classes
        model = effnetv2_model.EffNetV2Model(config.model.model_name,
                                             config.model)
        logits = model(in_images, training=is_training)[0]
        return logits

    pre_num_params, pre_num_flops = utils.num_params_flops(
        readable_format=True)

    if config.runtime.mixed_precision:
        precision = 'mixed_bfloat16' if FLAGS.use_tpu else 'mixed_float16'
        logits = utils.build_model_with_precision(precision, build_model,
                                                  images, is_training)
        logits = tf.cast(logits, tf.float32)
    else:
        logits = build_model(images)

    num_params, num_flops = utils.num_params_flops(readable_format=True)
    num_params = num_params - pre_num_params
    num_flops = (num_flops - pre_num_flops) / params['batch_size']
    logging.info('backbone params/flops = %.4f M / %.4f B', num_params,
                 num_flops)
    utils.scalar('model/params', num_params)
    utils.scalar('model/flops', num_flops)

    # Calculate loss, which includes softmax cross entropy and L2 regularization.
    if config.train.loss_type == 'sigmoid':
        cross_entropy = tf.losses.sigmoid_cross_entropy(
            multi_class_labels=tf.cast(labels, dtype=logits.dtype),
            logits=logits,
            label_smoothing=config.train.label_smoothing)
    elif config.train.loss_type == 'custom':
        xent = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast(
            labels, dtype=logits.dtype),
                                                       logits=logits)
        cross_entropy = tf.reduce_mean(tf.reduce_sum(xent, axis=-1))
    else:
        if config.data.multiclass:
            logging.info('use multi-class loss: %s', config.data.multiclass)
            labels /= tf.reshape(tf.reduce_sum(labels, axis=1), (-1, 1))
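            # The division above normalizes multi-hot labels so each row sums
            # to 1, i.e. a target distribution for softmax cross-entropy.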
        cross_entropy = tf.losses.softmax_cross_entropy(
            onehot_labels=labels,
            logits=logits,
            label_smoothing=config.train.label_smoothing)

    train_steps = max(config.train.min_steps,
                      config.train.epochs * params['steps_per_epoch'])
    global_step = tf.train.get_global_step()
    weight_decay_inc = config.train.weight_decay_inc * (
        tf.cast(global_step, tf.float32) / tf.cast(train_steps, tf.float32))
    weight_decay = (1 + weight_decay_inc) * config.train.weight_decay
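    # weight_decay ramps linearly from config.train.weight_decay at step 0 to
    # (1 + weight_decay_inc) * weight_decay at the final training step.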
    utils.scalar('train/weight_decay', weight_decay)
    # Add weight decay to the loss for non-batch-normalization variables.
    matcher = re.compile(config.train.weight_decay_exclude)
    l2loss = weight_decay * tf.add_n([
        tf.nn.l2_loss(v)
        for v in tf.trainable_variables() if not matcher.match(v.name)
    ])
    loss = cross_entropy + l2loss
    utils.scalar('loss/l2reg', l2loss)
    utils.scalar('loss/xent', cross_entropy)

    if has_moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(decay=config.train.ema_decay,
                                                num_updates=global_step)
        ema_vars = utils.get_ema_vars()

    host_call = None
    restore_vars_dict = None
    if is_training:
        # Compute the current epoch and associated learning rate from global_step.
        current_epoch = (tf.cast(global_step, tf.float32) /
                         params['steps_per_epoch'])
        utils.scalar('train/epoch', current_epoch)

        scaled_lr = config.train.lr_base * (config.train.batch_size / 256.0)
        scaled_lr_min = config.train.lr_min * (config.train.batch_size / 256.0)
        learning_rate = utils.WarmupLearningRateSchedule(
            scaled_lr,
            steps_per_epoch=params['steps_per_epoch'],
            decay_epochs=config.train.lr_decay_epoch,
            warmup_epochs=config.train.lr_warmup_epoch,
            decay_factor=config.train.lr_decay_factor,
            lr_decay_type=config.train.lr_sched,
            total_steps=train_steps,
            minimal_lr=scaled_lr_min)(global_step)
        utils.scalar('train/lr', learning_rate)
        optimizer = utils.build_optimizer(
            learning_rate, optimizer_name=config.train.optimizer)
        if FLAGS.use_tpu:
            # When using TPU, wrap the optimizer with CrossShardOptimizer which
            # handles synchronization details between different TPU cores. To the
            # user, this should look like regular synchronous training.
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)

        # filter trainable variables if needed.
        var_list = tf.trainable_variables()
        if config.train.varsexp:
            vars2 = [
                v for v in var_list if re.match(config.train.varsexp, v.name)
            ]
            if len(vars2) == len(var_list):
                logging.warning('%s has no match.', config.train.freeze)
            logging.info('Filter variables: orig=%d, final=%d, delta=%d',
                         len(var_list), len(vars2),
                         len(var_list) - len(vars2))
            var_list = vars2

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        if config.train.gclip and is_training:
            logging.info('clip gradients norm by %f', config.train.gclip)
            grads_and_vars = optimizer.compute_gradients(loss, var_list)
            with tf.name_scope('gclip'):
                grads = [gv[0] for gv in grads_and_vars]
                tvars = [gv[1] for gv in grads_and_vars]
                utils.scalar('train/gnorm', tf.linalg.global_norm(grads))
                utils.scalar('train/gnormmax',
                             tf.math.reduce_max([tf.norm(g) for g in grads]))
                # First clip each variable's norm, then clip global norm.
                clip_norm = abs(config.train.gclip)
                clipped_grads = [
                    tf.clip_by_norm(g, clip_norm) if g is not None else None
                    for g in grads
                ]
                clipped_grads, _ = tf.clip_by_global_norm(
                    clipped_grads, clip_norm)
                grads_and_vars = list(zip(clipped_grads, tvars))

            with tf.control_dependencies(update_ops):
                train_op = optimizer.apply_gradients(grads_and_vars,
                                                     global_step)
        else:
            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(loss,
                                              global_step,
                                              var_list=var_list)

        if has_moving_average_decay:
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

        if not config.runtime.skip_host_call:
            host_call = utils.get_tpu_host_call(
                global_step, FLAGS.model_dir,
                config.runtime.iterations_per_loop)
    else:
        train_op = None
        if has_moving_average_decay:
            # Load moving average variables for eval.
            restore_vars_dict = ema.variables_to_restore(ema_vars)

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(labels, logits):
            """Evaluation metric function.

      Evaluates accuracy.

      This function is executed on the CPU and should not directly reference
      any Tensors in the rest of the `model_fn`. To pass Tensors from the model
      to the `metric_fn`, provide as part of the `eval_metrics`. See
      https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
      for more information.

      Arguments should match the list of `Tensor` objects passed as the second
      element in the tuple passed to `eval_metrics`.

      Args:
        labels: `Tensor` with shape `[batch, num_classes]`.
        logits: `Tensor` with shape `[batch, num_classes]`.

      Returns:
        A dict of the metrics to return from evaluation.
      """
            metrics = {}
            if config.data.multiclass:
                metrics['eval/global_ap'] = tf.metrics.auc(
                    labels,
                    tf.nn.sigmoid(logits),
                    curve='PR',
                    num_thresholds=200,
                    summation_method='careful_interpolation',
                    name='global_ap')

                # Convert labels to set: be careful, tf.metrics.xx_at_k are horrible.
                labels = tf.cast(labels, dtype=tf.int64)
                label_to_repeat = tf.expand_dims(tf.argmax(labels, axis=-1),
                                                 axis=-1)
                all_labels_set = tf.range(0, labels.shape[-1], dtype=tf.int64)
                all_labels_set = tf.expand_dims(all_labels_set, axis=0)
                labels_set = labels * all_labels_set + (
                    1 - labels) * label_to_repeat
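                # labels_set keeps the class index wherever the multi-hot label
                # is 1 and fills the remaining positions with a repeated
                # positive index, the set-of-true-labels format expected by
                # tf.metrics.precision_at_k / recall_at_k.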

                metrics['eval/precision@1'] = tf.metrics.precision_at_k(
                    labels_set, logits, k=1)
                metrics['eval/recall@1'] = tf.metrics.recall_at_k(labels_set,
                                                                  logits,
                                                                  k=1)
                metrics['eval/precision@5'] = tf.metrics.precision_at_k(
                    labels_set, logits, k=5)
                metrics['eval/recall@5'] = tf.metrics.recall_at_k(labels_set,
                                                                  logits,
                                                                  k=5)

            # always add accuracy.
            labels = tf.argmax(labels, axis=1)
            predictions = tf.argmax(logits, axis=1)
            metrics['eval/acc_top1'] = tf.metrics.accuracy(labels, predictions)
            in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            metrics['eval/acc_top5'] = tf.metrics.mean(in_top_5)
            metrics['model/resolution'] = tf.metrics.mean(image_size)
            metrics['model/flops'] = tf.metrics.mean(num_flops)
            metrics['model/params'] = tf.metrics.mean(num_params)
            return metrics

        eval_metrics = (metric_fn, [labels, logits])

    if has_moving_average_decay and not is_training:

        def scaffold_fn():  # read ema for eval jobs.
            saver = tf.train.Saver(restore_vars_dict)
            return tf.train.Scaffold(saver=saver)
    elif config.train.ft_init_ckpt and is_training:

        def scaffold_fn():
            logging.info('restore variables from %s',
                         config.train.ft_init_ckpt)
            var_map = utils.get_ckpt_var_map(
                ckpt_path=config.train.ft_init_ckpt,
                skip_mismatch=True,
                init_ema=config.train.ft_init_ema)
            tf.train.init_from_checkpoint(config.train.ft_init_ckpt, var_map)
            return tf.train.Scaffold()
    else:
        scaffold_fn = None

    return tf.estimator.tpu.TPUEstimatorSpec(mode=mode,
                                             loss=loss,
                                             train_op=train_op,
                                             host_call=host_call,
                                             eval_metrics=eval_metrics,
                                             scaffold_fn=scaffold_fn)
Beispiel #26
0
def ensmable_train(args,
                   logger,
                   fold_i,
                   train_dataloader,
                   val_dataloader,
                   test_dataloader,
                   fold_path,
                   scaler=None,
                   epoch_steps=None):
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    if args.gpu is not None and args.gpuUSE:
        torch.cuda.set_device(args.gpu)
        print(f'USE GPU ID={args.gpu}')

    debug(pformat(vars(args)))

    loss_func = get_loss_func(args)
    metric_func = get_metric_func(metric=args.metric)
    sum_predicts = []

    for model_idx in range(args.ensemble_size):

        save_dir = os.path.join(fold_path, f'model_{model_idx}')
        makedirs(save_dir)
        writer = SummaryWriter(log_dir=save_dir)

        if args.checkpoint_paths is not None:
            debug(
                f'Loading model {model_idx} from {args.checkpoint_paths[model_idx]}'
            )
            model = load_checkpoint(args.checkpoint_paths[model_idx],
                                    current_args=args,
                                    logger=logger)
        else:
            debug(f'Building model {model_idx}')
            model = build_model(args)

        debug(model)
        debug(
            f'model:{model_idx}>>>>Number of parameters = {param_count(model):,}'
        )
        if args.gpuUSE:
            debug('Moving model to cuda')
            model = model.cuda()
        else:
            print('Not using GPU')

        for name, param in model.named_parameters():
            if param.requires_grad:
                print(name, param.data,
                      f'param.data is GPU{param.data.is_cuda}')

        save_checkpoint(os.path.join(save_dir, f'model{model_idx}.pt'), model,
                        scaler, args)

        optimizer = build_optimizer(model, args)

        scheduler = build_lr_scheduler(optimizer, args, epoch_steps)

        print(
            f'args.minimize_score={args.minimize_score},args.metric={args.metric}'
        )
        best_score = float('inf') if args.minimize_score else -float('inf')
        best_epoch, n_iter = 0, 0
        hold_loss, hold_avgVal = [], []
        for epoch in trange(1, args.epochs + 1):
            steps_eachEpoch, args.train_data_size, lastAvageloss, epoch_loss = train_batch(
                args,
                fold_i,
                model,
                train_dataloader,
                loss_func=loss_func,
                optimizer=optimizer,
                scheduler=scheduler,
                logger=logger,
                writer=writer)
            hold_loss.append(epoch_loss)
            if isinstance(scheduler, ExponentialLR):
                scheduler.step()
            _, _, val_scores = evaluate_batch(args,
                                              model=model,
                                              data=val_dataloader,
                                              num_tasks=args.num_tasks,
                                              metric_func=metric_func,
                                              dataset_type=args.dataset_type,
                                              scaler=scaler,
                                              Foldth=args.Foldth,
                                              predsLog=None,
                                              logger=logger)

            avg_val_score = np.nanmean(val_scores)
            print(f'val_scores___{val_scores}')
            hold_avgVal.append(avg_val_score)

            debug(f'Validation {args.metric} = {avg_val_score:.6f}')
            writer.add_scalar(f'validation_{args.metric}_epoch', avg_val_score,
                              epoch)
            writer.add_scalar(f'train_loss_epoch', lastAvageloss, epoch)
            if args.show_individual_scores:

                for task_name, val_score in zip(args.task_names, val_scores):
                    debug(
                        f'Validation {task_name} {args.metric} = {val_score:.6f}'
                    )
                    writer.add_scalar(
                        f'validation_{task_name}_{args.metric}_epoch',
                        val_score, epoch)

            if args.minimize_score and avg_val_score < best_score or \
                    not args.minimize_score and avg_val_score > best_score:
                print(
                    f'debug args.minimize_score:{args.minimize_score} and {avg_val_score} < {best_score} '
                )
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(
                    os.path.join(save_dir, f'{args.data_filename}_model.pt'),
                    model, scaler, args)

                info(
                    f'Model {model_idx} parameters updated in model.pt as best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}'
                )

        train_valdation_curve(args,
                              fold_i,
                              model_idx,
                              cur_name='train-validation curve')

        info(
            f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}'
        )
        print(f'load model with args.cuda={args.cuda}')
        model = load_checkpoint(os.path.join(save_dir,
                                             f'{args.data_filename}_model.pt'),
                                cuda=args.cuda,
                                logger=logger)

        test_targets, test_preds, test_scores = evaluate_batch(
            args,
            model=model,
            data=test_dataloader,
            num_tasks=args.num_tasks,
            metric_func=metric_func,
            dataset_type=args.dataset_type,
            scaler=scaler,
            logger=logger,
            Foldth=args.Foldth,
            predsLog=args.save_dir)
        if len(test_preds) != 0:
            sum_predicts.append(np.stack(test_preds, axis=0))
            print(f'sum_predicts ={sum_predicts}')

        avg_test_score = np.nanmean(test_scores)
        info(
            f'Model {model_idx} test >>>  {args.metric} = {avg_test_score:.6f}'
        )
        writer.add_scalar(f'test_{args.metric}_modelID', avg_test_score,
                          model_idx)
        if args.show_individual_scores:

            for task_name, test_score in zip(args.task_names, test_scores):
                info(
                    f'Model {model_idx} test {task_name} {args.metric} = {test_score:.6f}'
                )
                writer.add_scalar(f'test_{task_name}_{args.metric}_ModelID',
                                  test_score, model_idx)
    sum_predict = np.zeros(sum_predicts[0].shape)
    print(len(sum_predicts))
    for mode_pred in sum_predicts:
        sum_predict = sum_predict + mode_pred

    avg_test_preds = (sum_predict / args.ensemble_size).tolist()
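    # avg_test_preds is the unweighted mean of the per-model test predictions
    # summed above.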
    if args.dataset_type == 'classification':
        all_classificationScores = evaluate_predictionsWithAllmetric(
            preds=avg_test_preds,
            targets=test_targets,
            num_tasks=args.num_tasks,
            metric_func={
                'auc': roc_auc_score,
                'acc': acc_score,
                'precision': prec_score,
                'recall': rec_score,
                'prec_auc': prec_rec_auc
            },
            dataset_type=args.dataset_type,
            logger=logger)
        print(f'use the metric {args.metric} for Hyperparameter Optimization')
        ensemble_scores = all_classificationScores[args.metric]
        all_metricsScore = all_classificationScores
    if args.dataset_type == 'regression':
        all_regressionScores = evaluate_predictionsWithAllmetric(
            preds=avg_test_preds,
            targets=test_targets,
            num_tasks=args.num_tasks,
            metric_func={
                'rmse': rmse,
                'mse': mean_squared_error,
                'mae': mean_absolute_error,
                'r2': r2_score,
                'PC': Pearson_cor
            },
            dataset_type=args.dataset_type,
            logger=logger)
        print(f'use the metric {args.metric} for Hyperparameter Optimization')
        ensemble_scores = all_regressionScores[args.metric]
        all_metricsScore = all_regressionScores

    avg_ensemble_test_score = np.nanmean(ensemble_scores)
    print(
        f'ensemble_scores={ensemble_scores} and test {args.metric} = {avg_ensemble_test_score:.6f}'
    )
    writer.add_scalar(f'ensemble_test_{args.metric}_fold',
                      avg_ensemble_test_score, fold_i)

    if args.show_individual_scores:
        for task_name, ensemble_score in zip(args.task_names, ensemble_scores):
            info(
                f'Ensemble test {task_name} {args.metric} = {ensemble_score:.6f}'
            )
    return ensemble_scores, all_metricsScore
Beispiel #27
0
def train_loop(
    run_id,
    dataset_dir,
    ckpt_run_dir,
    output_dir,
    validation_only=False,
    use_cuda=False,
    light_target=False,
):
    """Train loop"""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    train_epochs = 8
    train_min_len, train_max_len = 0, 75
    val_min_len, val_max_len = 0, 150
    math_mode = "fp16"  # One of `fp16`, `fp32`
    lang = ("en", "de")

    # Training
    train_global_batch_size = 2048  # Global batch size
    max_bs = 128  # Max batch size for used hardware
    update_freq = int(max(1, train_global_batch_size // (max_bs * world_size)))
    train_batch_size = int(train_global_batch_size // (world_size * update_freq))
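    # Gradient accumulation: each device processes train_batch_size examples
    # per iteration and accumulates gradients over update_freq iterations, so
    # the effective global batch size recovers train_global_batch_size.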
    val_batch_size = 64

    # Model attributes
    model_args = {
        "hidden_size": 1024,
        "num_layers": 4,
        "dropout": 0.2,
        "share_embedding": True,
        "fusion": True,
    }

    # Criterion
    criterion_args = {"smoothing": 0.1, "fast_xentropy": True}

    # Loss scaling
    loss_scaling = {"init_scale": 1024, "upscale_interval": 128}

    # Optimizer
    optimizer_args = {
        "lr": 2e-3,
        "grad_clip": 5.0,
    }

    # Scheduler
    scheduler_args = {
        "warmup_steps": 200,
        "remain_steps": 0.4,
        "decay_interval": 0.05,
        "decay_steps": 4,
        "decay_factor": 0.5,
    }

    # Translator
    translator_args = {
        "beam_size": 5,
        "len_norm_factor": 0.6,
        "cov_penalty_factor": 0.1,
        "len_norm_const": 5.0,
        "max_seq_len": 150,
    }

    # Build train/val datasets
    train_set = WMT16Dataset(
        dataset_dir,
        math_precision=math_mode,
        lang=lang,
        train=True,
        download=True,
        preprocessed=True,
        min_len=train_min_len,
        max_len=train_max_len,
    )
    train_set.prepare()
    val_set = WMT16Dataset(
        dataset_dir,
        math_precision=math_mode,
        lang=lang,
        validation=True,
        download=False,
        min_len=val_min_len,
        max_len=val_max_len,
        sort=True,
    )

    tokenizer = train_set.tokenizer

    # Build model
    model = GNMT(vocab_size=train_set.vocab_size, **model_args)

    # Build loss function
    criterion = LabelSmoothing(padding_idx=wmt16_config.PAD, **criterion_args)

    # Bilingual Evaluation Understudy Score
    metrics = [BLEUScore()]

    # Partition data
    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    collate_fn = build_collate_fn(sort=True)
    train_loader = DataLoader(
        train_set,
        batch_size=train_batch_size,
        collate_fn=collate_fn,
        num_workers=2,
        pin_memory=True,
        drop_last=False,
        shuffle=True,
    )

    val_loader = DataLoader(
        val_set,
        batch_size=val_batch_size,
        collate_fn=collate_fn,
        num_workers=2,
        pin_memory=True,
        drop_last=False,
    )

    validate_every = update_freq * round(
        len(train_loader) * 0.30 / update_freq
    )  # Validate every 30%
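    # Rounding to a multiple of update_freq keeps validation aligned with
    # optimizer-step (accumulation-window) boundaries.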

    # Build optimizer & scheduler
    total_train_iters = (len(train_loader) // update_freq) * train_epochs

    print("Number of batches per epoch {}".format(len(train_loader)))
    print("Train iterations per epoch {}".format(total_train_iters / train_epochs))

    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    use_horovod = math_mode == "fp16" and dist.get_backend() == dist.Backend.MPI

    if use_horovod:
        hvd.init()
        logger.info("Using horovod rank={}".format(hvd.rank()))
        tensor = torch.tensor([1])
        res = hvd.allreduce(tensor, op=hvd.Sum)
        assert res[0] == world_size

    fp_optimizer, optimizer, model = build_optimizer(
        model=model,
        math=math_mode,
        loss_scaling=loss_scaling,
        use_cuda=use_cuda,
        use_horovod=use_horovod,
        **optimizer_args
    )

    # Create a learning rate scheduler for an optimizer
    scheduler = ExponentialWarmupMultiStepLR(
        optimizer, total_train_iters, **scheduler_args
    )

    # Translator
    translator = Translator(model=model, trg_tokenizer=tokenizer, **translator_args)

    checkpointer = Checkpointer(
        ckpt_run_dir=ckpt_run_dir, rank=rank, freq=CheckpointFreq.BEST
    )

    if not validation_only:

        if light_target:
            goal = task4_time_to_bleu_goal(20)
        else:
            goal = task4_time_to_bleu_goal(24)

        num_batches_per_device_train = len(train_loader)
        tracker = Tracker(metrics, run_id, rank, goal=goal)

        dist.barrier()
        tracker.start()

        for epoch in range(0, train_epochs):
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            model.train()
            tracker.train()
            for batch_idx, (data, target) in enumerate(train_loader):
                tracker.batch_start()
                data, target = prepare_batch(data, target, use_cuda=use_cuda)
                tracker.record_batch_load()

                is_last = batch_idx == len(train_loader) - 1  # enumerate() is zero-based
                update = (batch_idx % update_freq) == update_freq - 1
                init = (batch_idx % update_freq) == 0
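                # Gradient accumulation: gradients are zeroed on the first batch of
                # every `update_freq` window and the optimizer steps on the last one
                # (or on the final batch of the epoch).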

                # Clear gradients in the optimizer.
                if init:
                    fp_optimizer.zero_grad()
                    tracker.record_batch_init()

                # Compute the output
                output = compute_model_output(model, data, target)
                tracker.record_batch_fwd_pass()

                # Compute the loss
                loss, loss_per_token = compute_loss(
                    data, target, output, criterion, update_freq
                )
                tracker.record_batch_comp_loss()
                # Backprop
                fp_optimizer.backward_loss(loss)
                tracker.record_batch_backprop()

                # Opt step
                if update or is_last:
                    # For this task, simply sum all gradients
                    updated = fp_optimizer.step(tracker=tracker, denom=1)

                    # Learning rate scheduler
                    if updated:
                        scheduler.step()

                tracker.batch_end()

                record_train_batch_stats(
                    batch_idx=batch_idx,
                    loss=loss_per_token,
                    output=target[0],  # Use target just for the size
                    metric_results={},
                    tracker=tracker,
                    num_batches_per_device_train=num_batches_per_device_train,
                )

                # Validation during training
                if (batch_idx + 1) % validate_every == 0:
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()

                    metrics_values, loss = validation_round(
                        val_loader,
                        metrics,
                        model,
                        criterion,
                        update_freq,
                        translator,
                        tracker=tracker,
                        use_cuda=use_cuda,
                    )

                    record_validation_stats(metrics_values, loss, tracker, rank)
                    if tracker.goal_reached:
                        break

                    model.train()
                    tracker.train()

            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            metrics_values, loss = validation_round(
                val_loader,
                metrics,
                model,
                criterion,
                update_freq,
                translator,
                use_cuda=use_cuda,
            )

            is_best = record_validation_stats(metrics_values, loss, tracker, rank)

            checkpointer.save(
                tracker,
                model,
                fp_optimizer.optimizer,
                scheduler,
                tracker.current_epoch,
                is_best,
            )

            tracker.epoch_end()

            if tracker.goal_reached:
                print("Goal Reached!")
                dist.barrier()
                time.sleep(10)
                return
    else:
        cecf = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir,
            rank=rank,
            world_size=world_size,
            checkpointer=checkpointer,
            model=model,
            epochs=train_epochs,
            loss_function=criterion,
            metrics=metrics,
            use_cuda=use_cuda,
            dtype="fp32",
            max_batch_per_epoch=None,
        )

        train_stats = cecf.evaluate_by_epochs(train_loader)
        with open(os.path.join(output_dir, "train_stats.json"), "w") as f:
            json.dump(train_stats, f)

        val_stats = cecf.evaluate_by_epochs(val_loader)
        with open(os.path.join(output_dir, "val_stats.json"), "w") as f:
            json.dump(val_stats, f)
Beispiel #28
0
    def model_fn(features, labels, mode, params=None):
        """The model_fn to be used with TPUEstimator.

        Args:
            features: `Tensor` of batched images.
            labels: `Tensor` of one hot labels for the data samples
            mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`

        Returns:
        A `TPUEstimatorSpec` for the model
        """
        if isinstance(features, dict):
            features = features["feature"]

        # In most cases, the default data format NCHW instead of NHWC should be
        # used for a significant performance boost on GPU. NHWC should be used
        # only if the network needs to be run on CPU since the pooling operations
        # are only supported on NHWC. TPU uses XLA compiler to figure out best layout.
        if context.get_hparam("data_format") == "channels_first":
            assert not context.get_hparam("transpose_input")  # channels_first only for GPU
            features = tf.transpose(features, [0, 3, 1, 2])
            stats_shape = [3, 1, 1]
        else:
            stats_shape = [1, 1, 3]

        #if context.get_hparam("transpose_input") and mode != tf.estimator.ModeKeys.PREDICT:
        #    features = tf.transpose(features, [3, 0, 1, 2])  # HWCN to NHWC

        is_training = mode == tf.estimator.ModeKeys.TRAIN
        has_moving_average_decay = context.get_hparam("moving_average_decay") > 0
        # This is essential, if using a keras-derived model.
        tf.keras.backend.set_learning_phase(is_training)
        logging.info("Using open-source implementation.")
        override_params = {}
        #if context.get_hparam("batch_norm_momentum") is not None:
        #    override_params["batch_norm_momentum"] = context.get_hparam("batch_norm_momentum")
        #if context.get_hparam("batch_norm_epsilon") is not None:
        #    override_params["batch_norm_epsilon"] = context.get_hparam("batch_norm_epsilon")
       # if context.get_hparam("dropout_rate") is not None:
       #     override_params["dropout_rate"] = context.get_hparam("dropout_rate")
       # if context.get_hparam("survival_prob") is not None:
       #     override_params["survival_prob"] = context.get_hparam("survival_prob")
       # if context.get_hparam("data_format"):
       #     override_params["data_format"] = context.get_hparam("data_format")
       # if context.get_hparam("num_label_classes"):
       #     override_params["num_classes"] = context.get_hparam("num_label_classes")
       # if context.get_hparam("depth_coefficient"):
       #     override_params["depth_coefficient"] = context.get_hparam("depth_coefficient")
       # if context.get_hparam("width_coefficient"):
       #     override_params["width_coefficient"] = context.get_hparam("width_coefficient")

        def normalize_features(features, mean_rgb, stddev_rgb):
            """Normalize the image given the means and stddevs."""
            features -= tf.constant(mean_rgb, shape=stats_shape, dtype=features.dtype)
            features /= tf.constant(stddev_rgb, shape=stats_shape, dtype=features.dtype)
            return features

        def build_model():
            """Build model using the model_name given through the command line."""
            model_builder = model_builder_factory.get_model_builder(
                context.get_hparam("model_name"),
            )
            normalized_features = normalize_features(
                features, model_builder.MEAN_RGB, model_builder.STDDEV_RGB
            )
            logits, _ = model_builder.build_model(
                normalized_features,
                model_name=context.get_hparam("model_name"),
                training=is_training,
                override_params=override_params,
                #model_dir=context.get_hparam("model_dir"),
            )
            return logits

        logits = build_model()

        # Calculate loss, which includes softmax cross entropy and L2 regularization.
        cross_entropy = tf.losses.softmax_cross_entropy(
            logits=logits, onehot_labels=labels, label_smoothing=context.get_hparam("label_smoothing")
        )

        # Add weight decay to the loss for non-batch-normalization variables.
        loss = cross_entropy + context.get_hparam("weight_decay") * tf.add_n(
            [
                tf.nn.l2_loss(v)
                for v in tf.trainable_variables()
                if "batch_normalization" not in v.name
            ]
        )

        global_step = tf.train.get_global_step()
        if has_moving_average_decay:
            ema = tf.train.ExponentialMovingAverage(
                decay=context.get_hparam("moving_average_decay"), num_updates=global_step
            )
            ema_vars = utils.get_ema_vars()

        restore_vars_dict = None
        train_op = None
        if is_training:
            # Compute the current epoch and associated learning rate from global_step.
            current_epoch = tf.cast(global_step, tf.float32) / context.get_hparam("steps_per_epoch")

            scaled_lr = context.get_hparam("base_learning_rate") * (context.get_hparam("train_batch_size") / 256.0)
            logging.info("base_learning_rate = %f", context.get_hparam("base_learning_rate"))
            learning_rate = utils.build_learning_rate(
                scaled_lr, global_step, context.get_hparam("steps_per_epoch"),
            )
            optimizer = utils.build_optimizer(context, learning_rate)

            # Batch normalization requires UPDATE_OPS to be added as a dependency to
            # the train operation.
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(loss, global_step)

            if has_moving_average_decay:
                with tf.control_dependencies([train_op]):
                    train_op = ema.apply(ema_vars)

        if has_moving_average_decay:
            # Load moving average variables for eval.
            restore_vars_dict = ema.variables_to_restore(ema_vars)

        eval_metrics = None
        if mode == tf.estimator.ModeKeys.EVAL:

            def metric_fn(labels, logits):
                """Evaluation metric function. Evaluates accuracy.

                This function is executed on the CPU and should not directly reference
                any Tensors in the rest of the `model_fn`. To pass Tensors from the model
                to the `metric_fn`, provide as part of the `eval_metrics`. See
                https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
                for more information.

                Arguments should match the list of `Tensor` objects passed as the second
                element in the tuple passed to `eval_metrics`.

                Args:
                    labels: `Tensor` with shape `[batch, num_classes]`.
                    logits: `Tensor` with shape `[batch, num_classes]`.

                Returns:
                    A dict of the metrics to return from evaluation.
                """
                labels = tf.argmax(labels, axis=1)
                predictions = tf.argmax(logits, axis=1)
                top_1_accuracy = tf.metrics.accuracy(labels, predictions)
                in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
                top_5_accuracy = tf.metrics.mean(in_top_5)

                return {
                    "top_1_accuracy": top_1_accuracy,
                    "top_5_accuracy": top_5_accuracy,
                }

            eval_metrics = metric_fn(labels, logits)

        num_params = np.sum([np.prod(v.shape) for v in tf.trainable_variables()])
        logging.info("number of trainable parameters: %d", num_params)


        return tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, train_op=train_op, eval_metric_ops=eval_metrics,
        )
Beispiel #29
0
def model_fn(features, labels, mode, params):
    """The model_fn to be used with TPUEstimator.

  Args:
    features: `Tensor` of batched images.
    labels: `Tensor` of one hot labels for the data samples
    mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`
    params: `dict` of parameters passed to the model from the TPUEstimator,
        `params['batch_size']` is always provided and should be used as the
        effective batch size.

  Returns:
    A `TPUEstimatorSpec` for the model
  """
    if isinstance(features, dict):
        features = features['feature']

    # In most cases, the default data format NCHW instead of NHWC should be
    # used for a significant performance boost on GPU. NHWC should be used
    # only if the network needs to be run on CPU since the pooling operations
    # are only supported on NHWC. TPU uses XLA compiler to figure out best layout.
    if FLAGS.data_format == 'channels_first':
        assert not FLAGS.transpose_input  # channels_first only for GPU
        features = tf.transpose(features, [0, 3, 1, 2])
        stats_shape = [3, 1, 1]
    else:
        stats_shape = [1, 1, 3]

    if FLAGS.transpose_input and mode != tf.estimator.ModeKeys.PREDICT:
        features = tf.transpose(features, [3, 0, 1, 2])  # HWCN to NHWC

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    has_moving_average_decay = (FLAGS.moving_average_decay > 0)
    # This is essential, if using a keras-derived model.
    tf.keras.backend.set_learning_phase(is_training)
    logging.info('Using open-source implementation.')
    override_params = {}
    if FLAGS.batch_norm_momentum is not None:
        override_params['batch_norm_momentum'] = FLAGS.batch_norm_momentum
    if FLAGS.batch_norm_epsilon is not None:
        override_params['batch_norm_epsilon'] = FLAGS.batch_norm_epsilon
    if FLAGS.dropout_rate is not None:
        override_params['dropout_rate'] = FLAGS.dropout_rate
    if FLAGS.survival_prob is not None:
        override_params['survival_prob'] = FLAGS.survival_prob
    if FLAGS.data_format:
        override_params['data_format'] = FLAGS.data_format
    if FLAGS.num_label_classes:
        override_params['num_classes'] = FLAGS.num_label_classes
    if FLAGS.depth_coefficient:
        override_params['depth_coefficient'] = FLAGS.depth_coefficient
    if FLAGS.width_coefficient:
        override_params['width_coefficient'] = FLAGS.width_coefficient

    def normalize_features(features, mean_rgb, stddev_rgb):
        """Normalize the image given the means and stddevs."""
        features -= tf.constant(mean_rgb,
                                shape=stats_shape,
                                dtype=features.dtype)
        features /= tf.constant(stddev_rgb,
                                shape=stats_shape,
                                dtype=features.dtype)
        return features

    def build_model():
        """Build model using the model_name given through the command line."""
        model_builder = model_builder_factory.get_model_builder(
            FLAGS.model_name)
        normalized_features = normalize_features(features,
                                                 model_builder.MEAN_RGB,
                                                 model_builder.STDDEV_RGB)
        logits, _ = model_builder.build_model(normalized_features,
                                              model_name=FLAGS.model_name,
                                              training=is_training,
                                              override_params=override_params,
                                              model_dir=FLAGS.model_dir)
        return logits

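    # With bfloat16 enabled, the forward pass runs under a bfloat16 scope and the
    # logits are cast back to float32 so that the loss is computed in full precision.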
    if params['use_bfloat16']:
        with tf.tpu.bfloat16_scope():
            logits = tf.cast(build_model(), tf.float32)
    else:
        logits = build_model()

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'classes': tf.argmax(logits, axis=1),
            'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
        }
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'classify': tf.estimator.export.PredictOutput(predictions)
            })

    # If necessary, in the model_fn, use params['batch_size'] instead the batch
    # size flags (--train_batch_size or --eval_batch_size).
    batch_size = params['batch_size']  # pylint: disable=unused-variable

    # Calculate loss, which includes softmax cross entropy and L2 regularization.
    cross_entropy = tf.losses.softmax_cross_entropy(
        logits=logits,
        onehot_labels=labels,
        label_smoothing=FLAGS.label_smoothing)

    # Add weight decay to the loss for non-batch-normalization variables.
    loss = cross_entropy + FLAGS.weight_decay * tf.add_n([
        tf.nn.l2_loss(v) for v in tf.trainable_variables()
        if 'batch_normalization' not in v.name
    ])

    global_step = tf.train.get_global_step()
    if has_moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(
            decay=FLAGS.moving_average_decay, num_updates=global_step)
        ema_vars = utils.get_ema_vars()

    host_call = None
    restore_vars_dict = None
    if is_training:
        # Compute the current epoch and associated learning rate from global_step.
        current_epoch = (tf.cast(global_step, tf.float32) /
                         params['steps_per_epoch'])

        scaled_lr = FLAGS.base_learning_rate * (FLAGS.train_batch_size / 256.0)
        logging.info('base_learning_rate = %f', FLAGS.base_learning_rate)
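        # Linear scaling rule: with a hypothetical base_learning_rate of 0.016 and
        # train_batch_size of 2048, scaled_lr = 0.016 * (2048 / 256) = 0.128.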
        learning_rate = utils.build_learning_rate(
            scaled_lr,
            global_step,
            params['steps_per_epoch'],
            decay_epochs=FLAGS.lr_decay_epoch)
        optimizer = utils.build_optimizer(learning_rate)
        if FLAGS.use_tpu:
            # When using TPU, wrap the optimizer with CrossShardOptimizer which
            # handles synchronization details between different TPU cores. To the
            # user, this should look like regular synchronous training.
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)

        # Batch normalization requires UPDATE_OPS to be added as a dependency to
        # the train operation.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step)

        if has_moving_average_decay:
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

        if not FLAGS.skip_host_call:

            def host_call_fn(gs, lr, ce):
                """Training host call. Creates scalar summaries for training metrics.

        This function is executed on the CPU and should not directly reference
        any Tensors in the rest of the `model_fn`. To pass Tensors from the
        model to the `metric_fn`, provide as part of the `host_call`. See
        https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
        for more information.

        Arguments should match the list of `Tensor` objects passed as the second
        element in the tuple passed to `host_call`.

        Args:
          gs: `Tensor` with shape `[batch]` for the global_step.
          lr: `Tensor` with shape `[batch]` for the learning_rate.
          ce: `Tensor` with shape `[batch]` for the current_epoch.

        Returns:
          List of summary ops to run on the CPU host.
        """
                gs = gs[0]
                # Host call fns are executed FLAGS.iterations_per_loop times after one
                # TPU loop is finished, setting max_queue value to the same as number of
                # iterations will make the summary writer only flush the data to storage
                # once per loop.
                with tf2.summary.create_file_writer(
                        FLAGS.model_dir,
                        max_queue=FLAGS.iterations_per_loop).as_default():
                    with tf2.summary.record_if(True):
                        tf2.summary.scalar('learning_rate', lr[0], step=gs)
                        tf2.summary.scalar('current_epoch', ce[0], step=gs)

                        return tf.summary.all_v2_summary_ops()

            # To log the loss, current learning rate, and epoch for Tensorboard, the
            # summary op needs to be run on the host CPU via host_call. host_call
            # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
            # dimension. These Tensors are implicitly concatenated to
            # [params['batch_size']].
            gs_t = tf.reshape(global_step, [1])
            lr_t = tf.reshape(learning_rate, [1])
            ce_t = tf.reshape(current_epoch, [1])

            host_call = (host_call_fn, [gs_t, lr_t, ce_t])

    else:
        train_op = None
        if has_moving_average_decay:
            # Load moving average variables for eval.
            restore_vars_dict = ema.variables_to_restore(ema_vars)

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(labels, logits):
            """Evaluation metric function. Evaluates accuracy.

      This function is executed on the CPU and should not directly reference
      any Tensors in the rest of the `model_fn`. To pass Tensors from the model
      to the `metric_fn`, provide as part of the `eval_metrics`. See
      https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
      for more information.

      Arguments should match the list of `Tensor` objects passed as the second
      element in the tuple passed to `eval_metrics`.

      Args:
        labels: `Tensor` with shape `[batch, num_classes]`.
        logits: `Tensor` with shape `[batch, num_classes]`.

      Returns:
        A dict of the metrics to return from evaluation.
      """
            labels = tf.argmax(labels, axis=1)
            predictions = tf.argmax(logits, axis=1)
            top_1_accuracy = tf.metrics.accuracy(labels, predictions)
            in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            top_5_accuracy = tf.metrics.mean(in_top_5)

            return {
                'top_1_accuracy': top_1_accuracy,
                'top_5_accuracy': top_5_accuracy,
            }

        eval_metrics = (metric_fn, [labels, logits])

    num_params = np.sum([np.prod(v.shape) for v in tf.trainable_variables()])
    logging.info('number of trainable parameters: %d', num_params)

    def _scaffold_fn():
        saver = tf.train.Saver(restore_vars_dict)
        return tf.train.Scaffold(saver=saver)

    if has_moving_average_decay and not is_training:
        # Only apply scaffold for eval jobs.
        scaffold_fn = _scaffold_fn
    else:
        scaffold_fn = None

    return tf.estimator.tpu.TPUEstimatorSpec(mode=mode,
                                             loss=loss,
                                             train_op=train_op,
                                             host_call=host_call,
                                             eval_metrics=eval_metrics,
                                             scaffold_fn=scaffold_fn)
Beispiel #30
0
model = tagging_model.BertForMultitask.from_pretrained(
    ARGS.bert_model,
    cls_num_labels=ARGS.num_categories,
    tok_num_labels=ARGS.num_tok_labels,
    cache_dir=ARGS.working_dir + '/cache',
    tok2id=tok2id)
if CUDA:
    model = model.cuda()
    print("cuda available")

print('PREPPING RUN...')

# # # # # # # # ## # # # ## # # OPTIMIZER, LOSS # # # # # # # # ## # # # ## # #

optimizer = tagging_utils.build_optimizer(
    model, int((num_train_examples * ARGS.epochs) / ARGS.train_batch_size),
    ARGS.learning_rate)
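# The second argument is the total number of training steps; e.g. a hypothetical
# 30,000 examples, 3 epochs, and batch size 32 give int(90000 / 32) = 2812 steps.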

loss_fn = tagging_utils.build_loss_fn()

# # # # # # # # ## # # # ## # # TRAIN # # # # # # # # ## # # # ## # #

writer = SummaryWriter(ARGS.working_dir)

print('INITIAL EVAL...')
model.eval()
results = tagging_utils.run_inference(model, eval_dataloader, loss_fn,
                                      tokenizer)
writer.add_scalar('eval/tok_loss', np.mean(results['tok_loss']), 0)
writer.add_scalar('eval/tok_acc', np.mean(results['labeling_hits']), 0)