def main(hparams):
    """Build, train and evaluate the transformer described by ``hparams``.

    Args:
        hparams: Hyper-parameter object. This function reads ``max_length``
            and ``epochs`` from it and forwards the whole object to
            ``get_dataset``, ``transformer``, ``CustomSchedule`` and
            ``evaluate``.
    """
    dataset, tokenizer = get_dataset(hparams)

    model = transformer(hparams)

    optimizer = tf.keras.optimizers.Adam(CustomSchedule(hparams),
                                         beta_1=0.9,
                                         beta_2=0.98,
                                         epsilon=1e-9)

    def loss_function(y_true, y_pred):
        """Sparse cross-entropy on logits with label-0 positions zeroed."""
        y_true = tf.reshape(y_true, shape=(-1, hparams.max_length - 1))
        loss = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction='none')(y_true, y_pred)

        # Positions whose label id is 0 contribute nothing to the loss
        # (presumably 0 is the padding id -- confirm against the tokenizer).
        mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
        loss = tf.multiply(loss, mask)

        # The mean is taken over every position, masked ones included.
        return tf.reduce_mean(loss)

    def accuracy(y_true, y_pred):
        """Per-position match between the labels and the arg-max prediction."""
        y_true = tf.reshape(y_true, shape=(-1, hparams.max_length - 1))
        # Use the stateless function: instantiating a
        # `SparseCategoricalAccuracy` metric object here would create new
        # metric state on every call instead of letting Keras aggregate the
        # values across batches.
        return tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)

    model.compile(optimizer, loss=loss_function, metrics=[accuracy])

    model.fit(dataset, epochs=hparams.epochs)

    evaluate(hparams, model, tokenizer)
Example #2
0
  def model_fn(inp, tgt, mems, is_training):
    """Build the transformer graph for one batch and return its loss.

    Args:
      inp: Input tensor; its two axes are swapped before use.
      tgt: Target tensor; transposed the same way as ``inp``.
      mems: Memory state, forwarded unchanged to ``model.transformer``.
      is_training: Forwarded to ``model.transformer``; also selects the
        shape of the return value.

    Returns:
      ``(loss, new_mems, all_vars)`` when ``is_training`` is truthy, where
      ``all_vars`` is ``tf.trainable_variables()``; ``(loss, new_mems)``
      otherwise.

    Raises:
      ValueError: If ``FLAGS.init`` is neither "uniform" nor "normal".
    """
    inp = tf.transpose(inp, [1, 0])
    tgt = tf.transpose(tgt, [1, 0])

    if FLAGS.init == "uniform":
      initializer = tf.initializers.random_uniform(
          minval=-FLAGS.init_range,
          maxval=FLAGS.init_range,
          seed=None)
    elif FLAGS.init == "normal":
      initializer = tf.initializers.random_normal(
          stddev=FLAGS.init_std,
          seed=None)
    else:
      # Fail with a clear message instead of an UnboundLocalError later on.
      raise ValueError(
          'Unsupported FLAGS.init: {!r} (expected "uniform" or '
          '"normal")'.format(FLAGS.init))

    # Bound for every init mode. It used to be set only in the "normal"
    # branch, so FLAGS.init == "uniform" crashed with an unbound local when
    # the value was passed to model.transformer below.
    proj_initializer = tf.initializers.random_normal(
        stddev=FLAGS.proj_init_std,
        seed=None)

    # One flag per cluster (len(cutoffs) + 1 entries); optionally every entry
    # except the first is set.
    tie_projs = [False] * (len(cutoffs) + 1)
    if FLAGS.proj_share_all_but_first:
      tie_projs[1:] = [True] * len(cutoffs)

    loss, new_mems = model.transformer(
        dec_inp=inp,
        target=tgt,
        mems=mems,
        n_token=n_token,
        n_layer=FLAGS.n_layer,
        d_model=FLAGS.d_model,
        d_embed=FLAGS.d_embed,
        n_head=FLAGS.n_head,
        d_head=FLAGS.d_head,
        d_inner=FLAGS.d_inner,
        dropout=FLAGS.dropout,
        dropatt=FLAGS.dropatt,
        initializer=initializer,
        proj_initializer=proj_initializer,
        is_training=is_training,
        mem_len=FLAGS.mem_len,
        cutoffs=cutoffs,
        div_val=FLAGS.div_val,
        tie_projs=tie_projs,
        input_perms=None,
        target_perms=None,
        head_target=None,
        same_length=FLAGS.same_length,
        clamp_len=FLAGS.clamp_len,
        untie_r=FLAGS.untie_r,
        proj_same_dim=FLAGS.proj_same_dim)

    # number of parameters
    num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
    tf.logging.info('#params: {}'.format(num_params))

    if is_training:
      all_vars = tf.trainable_variables()

      return loss, new_mems, all_vars
    else:
      return loss, new_mems
Example #3
0
def train(inputs, outputs, pre_train=False):
    """Train a fresh transformer and save it, or load the saved one.

    Args:
        inputs: Forwarded to ``get_dataset``.
        outputs: Forwarded to ``get_dataset``.
        pre_train: When falsy (the default) a new model is built, compiled,
            fitted for ``config.EPOCHS`` epochs and written to
            ``config.MODEL_PATH``. When truthy the model is only loaded from
            ``config.MODEL_PATH``.
    """
    tf.keras.backend.clear_session()
    dataset, vocab_size, _ = get_dataset(inputs, outputs)

    if not pre_train:
        model = transformer(vocab_size=vocab_size,
                            num_layers=config.NUM_LAYERS,
                            units=config.UNITS,
                            d_model=config.D_MODEL,
                            num_heads=config.NUM_HEADS,
                            dropout=config.DROPOUT)

        # NOTE(review): `model` is the object returned by `transformer`, so
        # CustomSchedule / loss_function / accuracy are looked up on it --
        # confirm that this is the intended source of those names.
        schedule = model.CustomSchedule(config.D_MODEL)
        adam = tf.keras.optimizers.Adam(schedule,
                                        beta_1=0.9,
                                        beta_2=0.98,
                                        epsilon=1e-9)

        model.compile(optimizer=adam,
                      loss=model.loss_function,
                      metrics=[model.accuracy])
        model.fit(dataset, epochs=config.EPOCHS)
        model.save(config.MODEL_PATH)
    else:
        # NOTE(review): the loaded model is neither trained further nor
        # returned here -- verify that this is intended.
        model = tf.keras.models.load_model(config.MODEL_PATH)
    def model_fn(inp, tgt, mems, is_training):
        """Run the transformer on one batch after transposing its inputs.

        Args:
            inp: Input tensor; its two axes are swapped before use.
            tgt: Target tensor; transposed the same way as ``inp``.
            mems: Memory state, forwarded unchanged to ``model.transformer``.
            is_training: Forwarded to ``model.transformer``; also selects the
                third element of the return value.

        Returns:
            ``(loss, new_mems, grads_and_vars)`` when ``is_training`` is
            truthy, where ``grads_and_vars`` pairs each trainable variable
            with the gradient of ``loss`` with respect to it; otherwise
            ``(loss, new_mems, outputs)``.
        """
        inp, tgt = tf.transpose(inp, [1, 0]), tf.transpose(tgt, [1, 0])

        uniform_init = tf.initializers.random_uniform(
            minval=-FLAGS.init_range, maxval=FLAGS.init_range, seed=None)
        normal_proj_init = tf.initializers.random_normal(
            stddev=FLAGS.proj_init_std, seed=None)

        # One flag per cluster; the first is never set, the rest follow the
        # proj_share_all_but_first flag.
        share_tail = bool(FLAGS.proj_share_all_but_first)
        tie_projs = [share_tail and idx > 0
                     for idx in range(len(cutoffs) + 1)]

        loss, new_mems, outputs = model.transformer(
            dec_inp=inp,
            target=tgt,
            mems=mems,
            n_token=n_token,
            n_layer=FLAGS.n_layer,
            d_model=FLAGS.d_model,
            d_embed=FLAGS.d_embed,
            n_head=FLAGS.n_head,
            d_head=FLAGS.d_head,
            d_inner=FLAGS.d_inner,
            dropout=FLAGS.dropout,
            dropatt=FLAGS.dropatt,
            initializer=uniform_init,
            proj_initializer=normal_proj_init,
            is_training=is_training,
            mem_len=FLAGS.mem_len,
            cutoffs=cutoffs,
            div_val=FLAGS.div_val,
            tie_projs=tie_projs,
            input_perms=None,
            target_perms=None,
            head_target=None,
            same_length=FLAGS.same_length,
            clamp_len=FLAGS.clamp_len,
            use_tpu=False,
            untie_r=FLAGS.untie_r,
            proj_same_dim=FLAGS.proj_same_dim,
            return_outputs=True)

        # Evaluation path: hand back the model outputs, no gradients needed.
        if not is_training:
            return loss, new_mems, outputs

        trainable = tf.trainable_variables()
        gradients = tf.gradients(loss, trainable)
        return loss, new_mems, list(zip(gradients, trainable))
Example #5
0
def main(params):
    """Load data, train the transformer, persist the artefacts and evaluate.

    Files written to the working directory: ``model_untrained.h5``,
    ``model_trained.h5``, ``model_weights.h5``, the tokenizer (prefix
    ``tokenizer``) and, through ``save_pickle``, ``meta`` and ``history``.

    Args:
        params: Run configuration. Attributes read here: ``max_samples``,
            ``max_length``, ``batch_size``, ``validation_split``,
            ``d_model``, ``num_layers``, ``num_heads``, ``dff``, ``rate``
            and ``epochs``.
    """

    print("\n ...loading dataset\n")
    train_ds, val_ds, tokenizer, meta = get_dataset(
        params.max_samples,
        params.max_length,
        params.batch_size,
        validation_split=params.validation_split)

    print("\n ...creating model\n")
    model = transformer(params.d_model, meta['vocab_size'],
                        params.num_layers, params.num_heads, params.dff,
                        params.rate)

    # Snapshot of the architecture before compilation and training.
    model.save('model_untrained.h5')

    schedule = CustomSchedule(params.d_model)
    optimizer = tf.keras.optimizers.Adam(schedule,
                                         beta_1=0.9,
                                         beta_2=0.98,
                                         epsilon=1e-9)

    def loss_function(y_true, y_pred):
        """Sparse cross-entropy on logits with label-0 positions zeroed."""
        labels = tf.reshape(y_true, shape=(-1, params.max_length - 1))
        cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction='none')
        per_position = cross_entropy(labels, y_pred)

        # Positions labelled 0 are multiplied by zero; the mean still runs
        # over every position.
        keep = tf.cast(tf.not_equal(labels, 0), tf.float32)
        return tf.reduce_mean(tf.multiply(per_position, keep))

    def accuracy(y_true, y_pred):
        """Per-position sparse categorical accuracy on reshaped labels."""
        labels = tf.reshape(y_true, shape=(-1, params.max_length - 1))
        return tf.keras.metrics.sparse_categorical_accuracy(labels, y_pred)

    print("\n ...training model\n")
    model.compile(optimizer, loss=loss_function, metrics=[accuracy])
    history = model.fit(train_ds,
                        epochs=params.epochs,
                        validation_data=val_ds)

    print("\nSaving model weights, tokenizer and meta data\n")
    model.save('model_trained.h5')
    tokenizer.save_to_file('tokenizer')
    model.save_weights('model_weights.h5')

    # Metadata and the per-epoch training history are pickled.
    save_pickle(meta, 'meta')
    save_pickle(history.history, 'history')

    evaluate(model, tokenizer, meta)
def main(args):
    """Train the paired 'model/AtoB' and 'model/BtoA' transformer networks.

    Builds the input pipeline, the two networks (each applied a second time
    with ``reuse=True`` to form the cycle), four losses, an Adam train op and
    the summaries, then runs ``args.num_batches`` training steps in a
    ``tf.Session`` while periodically writing summaries and checkpoints to
    ``args.output_dir``.

    Args:
        args: Run configuration. Attributes read here: ``dataset``, ``split``,
            ``image_size``, ``data_dir``, ``preprocessing_a``,
            ``preprocessing_b``, ``num_channels``, ``batch_size``,
            ``shuffle``, ``learning_rate``, ``beta1``, ``beta2``,
            ``epsilon``, ``output_dir``, ``num_batches``,
            ``log_every_n_seconds`` and ``save_every_n_seconds``.
    """

    # get datasets
    dataset = data.get_dataset(args.dataset,
                               args.split,
                               image_size=args.image_size,
                               data_dir=args.data_dir,
                               is_training=True)

    # Each side of the pair gets its own preprocessing function; only the x
    # side is given an explicit channel count.
    im_x = preprocess(dataset.x,
                      args.preprocessing_a,
                      image_size=args.image_size,
                      output_channels=args.num_channels)
    im_y = preprocess(dataset.y,
                      args.preprocessing_b,
                      image_size=args.image_size)

    im_batch_x, im_batch_y = data.create_batch([im_x, im_y],
                                               batch_size=args.batch_size,
                                               shuffle=args.shuffle,
                                               queue_size=2,
                                               min_queue_size=1)

    # build models

    # x -> y direction. output_fn=None here; the result is later fed through
    # tf.nn.softmax and used as logits in softmax_cross_entropy, so it is
    # presumably un-normalised class scores -- TODO confirm in `model`.
    transformed_x = model.transformer(im_batch_x,
                                      output_channels=dataset.num_classes,
                                      output_fn=None,
                                      scope='model/AtoB')
    # y -> x direction.
    transformed_y = model.transformer(im_batch_y,
                                      output_channels=args.num_channels,
                                      scope='model/BtoA')

    # Cycle passes: the same scopes are applied again with reuse=True, so
    # no second set of variables is created.
    cycled_x = model.transformer(tf.nn.softmax(transformed_x),
                                 output_channels=args.num_channels,
                                 scope='model/BtoA',
                                 reuse=True)
    cycled_y = model.transformer(transformed_y,
                                 output_channels=dataset.num_classes,
                                 output_fn=None,
                                 scope='model/AtoB',
                                 reuse=True)

    # create loss functions

    # Cycle losses compare each input batch with its round-trip result.
    cycle_loss_x = tf.losses.absolute_difference(im_batch_x,
                                                 cycled_x,
                                                 scope='cycle_loss_x')
    cycle_loss_y = tf.losses.softmax_cross_entropy(im_batch_y,
                                                   cycled_y,
                                                   scope='cycle_loss_y')

    # Direct losses compare each one-way translation with the batch from the
    # other side of the pair.
    transform_loss_xy = tf.losses.absolute_difference(
        im_batch_x, transformed_y, scope='transform_loss_xy')
    transform_loss_yx = tf.losses.softmax_cross_entropy(
        im_batch_y, transformed_x, scope='transform_loss_yx')

    # All four terms are summed with equal weight.
    total_loss = cycle_loss_x + cycle_loss_y + transform_loss_xy + transform_loss_yx

    optimizer = tf.train.AdamOptimizer(args.learning_rate, args.beta1,
                                       args.beta2, args.epsilon)

    # The global-step increment is registered as an update op so that it is
    # part of the control dependencies of the train op below.
    inc_global_step = tf.assign_add(tf.train.get_or_create_global_step(), 1)
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, inc_global_step)

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_tensor = optimizer.minimize(total_loss)

        # Set up train op to return loss: evaluating `train_op` forces the
        # optimizer step to run and yields the value of total_loss.
        with tf.control_dependencies([train_tensor]):
            train_op = tf.identity(total_loss, name='train_op')

    # set up logging

    # Gather initial summaries.
    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

    # Add summaries for losses.
    for loss in tf.get_collection(tf.GraphKeys.LOSSES):
        summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

    # Add summaries for variables.
    for variable in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES):
        summaries.add(tf.summary.histogram(variable.op.name, variable))

    # One float32 colour per class, taken from the first `num_classes`
    # entries of `labels` (a name defined outside this function).
    color_map = np.array(
        list(map(lambda x: x.color,
                 labels[:dataset.num_classes]))).astype(np.float32)

    # Class-score tensors are reduced with argmax over the last axis and
    # passed to the 'segmentation_to_rgb' postprocessing for display.
    segmentation_y = postprocess(tf.argmax(im_batch_y,
                                           -1), 'segmentation_to_rgb',
                                 dataset.num_classes, color_map)
    segmentation_transformed_x = postprocess(tf.argmax(transformed_x, -1),
                                             'segmentation_to_rgb',
                                             dataset.num_classes, color_map)
    segmentation_cycled_y = postprocess(tf.argmax(cycled_y,
                                                  -1), 'segmentation_to_rgb',
                                        dataset.num_classes, color_map)

    summaries.add(tf.summary.image('x', im_batch_x))
    summaries.add(tf.summary.image('y', segmentation_y))
    summaries.add(tf.summary.image('transformed_x',
                                   segmentation_transformed_x))
    summaries.add(tf.summary.image('transformed_y', transformed_y))
    summaries.add(tf.summary.image('cycled_x', cycled_x))
    summaries.add(tf.summary.image('cycled_y', segmentation_cycled_y))

    # Merge all summaries together.
    summary_op = tf.summary.merge(list(summaries), name='summary_op')

    # create train loop

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    # Only global variables under the 'model' scope are checkpointed.
    saver = tf.train.Saver(var_list=tf.get_collection(
        tf.GraphKeys.GLOBAL_VARIABLES, scope='model'))
    checkpoint_path = os.path.join(args.output_dir, 'model.ckpt')
    # NOTE(review): the writer is flushed after each summary but never
    # closed in this function.
    writer = tf.summary.FileWriter(args.output_dir)

    with tf.Session() as sess:
        # Tensorflow initializations
        sess.run(tf.get_collection(tf.GraphKeys.TABLE_INITIALIZERS))
        # NOTE(review): queue runners are started without a Coordinator and
        # are not stopped explicitly here.
        tf.train.start_queue_runners(sess=sess)
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())

        # Both timestamps start at 0, so with any realistic interval the
        # first iteration both logs a summary and writes a checkpoint.
        last_log_time = 0
        last_save_time = 0
        for i in tqdm(range(args.num_batches)):
            # Every iteration runs exactly one training step; the summary op
            # is only added to the fetches when the log interval has elapsed.
            if last_log_time < time.time() - args.log_every_n_seconds:
                last_log_time = time.time()
                summary, loss_val, global_step = sess.run(
                    [summary_op, train_op,
                     tf.train.get_global_step()])
                writer.add_summary(summary, global_step)
                writer.flush()
            else:
                loss_val, global_step = sess.run(
                    [train_op, tf.train.get_global_step()])

            if last_save_time < time.time() - args.save_every_n_seconds:
                last_save_time = time.time()
                saver.save(sess, checkpoint_path, global_step=global_step)

        # Final checkpoint, tagged with the requested number of batches
        # rather than the fetched global step.
        saver.save(sess, checkpoint_path, global_step=args.num_batches)
    def model_fn(inpN, inpT, tgtN, tgtT, mems, is_training):
        """Build the two-stream transformer graph and its combined loss.

        The combined loss is ``alpha * lossN + (1 - alpha) * lossT``, where
        ``alpha`` comes from the enclosing scope.

        Args:
            inpN: First input stream; its two axes are swapped before use.
            inpT: Second input stream; transposed the same way.
            tgtN: Targets for the first stream; transposed the same way.
            tgtT: Targets for the second stream; transposed the same way.
            mems: Memory state, forwarded unchanged to ``model.transformer``.
            is_training: Forwarded to ``model.transformer``; also selects the
                shape of the return value.

        Returns:
            When ``is_training`` is truthy:
            ``(lossN, lossT, loss, new_mems, grads_and_vars, predictionN,
            predictionT)``, where ``grads_and_vars`` pairs each trainable
            variable with the gradient of ``loss`` with respect to it.
            Otherwise:
            ``(lossN, lossT, loss, new_mems, predictionN, predictionT)``.

        Raises:
            ValueError: If ``FLAGS.init`` is neither "uniform" nor "normal".
        """
        inpN = tf.transpose(inpN, [1, 0])
        inpT = tf.transpose(inpT, [1, 0])
        tgtN = tf.transpose(tgtN, [1, 0])
        tgtT = tf.transpose(tgtT, [1, 0])

        if FLAGS.init == "uniform":
            initializer = tf.initializers.random_uniform(
                minval=-FLAGS.init_range,
                maxval=FLAGS.init_range,
                seed=None)
        elif FLAGS.init == "normal":
            initializer = tf.initializers.random_normal(
                stddev=FLAGS.init_std,
                seed=None)
        else:
            # Fail with a clear message instead of an UnboundLocalError
            # later on.
            raise ValueError(
                'Unsupported FLAGS.init: {!r} (expected "uniform" or '
                '"normal")'.format(FLAGS.init))

        # Bound for every init mode. It used to be set only in the "normal"
        # branch, so FLAGS.init == "uniform" crashed with an unbound local
        # when the value was passed to model.transformer below.
        proj_initializer = tf.initializers.random_normal(
            stddev=FLAGS.proj_init_std,
            seed=None)

        # One flag per cluster (len(cutoffs) + 1 entries); optionally every
        # entry except the first is set.
        tie_projs = [False] * (len(cutoffs) + 1)
        if FLAGS.proj_share_all_but_first:
            tie_projs[1:] = [True] * len(cutoffs)

        lossN, lossT, new_mems, predictionN, predictionT = model.transformer(
            inpN=inpN,
            inpT=inpT,
            targetsN=tgtN,
            targetsT=tgtT,
            mems=mems,
            n_token_N=n_token_N,
            n_token_T=n_token_T,
            n_layer=FLAGS.n_layer,
            d_model_N=FLAGS.d_model_N,
            d_model_T=FLAGS.d_model_T,
            d_embed_N=FLAGS.d_embed_N,
            d_embed_T=FLAGS.d_embed_T,
            n_head=FLAGS.n_head,
            d_head=FLAGS.d_head,
            d_inner=FLAGS.d_inner,
            dropout=FLAGS.dropout,
            dropatt=FLAGS.dropatt,
            initializer=initializer,
            proj_initializer=proj_initializer,
            is_training=is_training,
            mem_len=FLAGS.mem_len,
            cutoffs=cutoffs,
            div_val=FLAGS.div_val,
            tie_projs=tie_projs,
            input_perms=None,
            target_perms=None,
            head_target=None,
            same_length=FLAGS.same_length,
            clamp_len=FLAGS.clamp_len,
            use_tpu=False,
            untie_r=FLAGS.untie_r,
            proj_same_dim=FLAGS.proj_same_dim)

        # number of parameters
        num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
        tf.logging.info('#params: {}'.format(num_params))

        # Convex combination of the two per-stream losses.
        loss = tf.multiply(alpha, lossN) + tf.multiply(1 - alpha, lossT)

        if is_training:
            all_vars = tf.trainable_variables()
            grads = tf.gradients(loss, all_vars)
            grads_and_vars = list(zip(grads, all_vars))

            return lossN, lossT, loss, new_mems, grads_and_vars, predictionN, predictionT
        else:
            return lossN, lossT, loss, new_mems, predictionN, predictionT
    def do_training(self, fleet, args):
        """Run a short collective training job and return its losses.

        Builds the transformer program, wraps an SGD optimizer with the
        fleet distributed optimizer, then trains for at most 5 batches of
        the first pass.

        Args:
            fleet (Collective): Collective inherited base class Fleet. Used
                here for ``worker_num()``, ``worker_index()``,
                ``distributed_optimizer()`` and ``main_program``.
            args (ArgumentParser): run args to config dist fleet.
                NOTE(review): the value passed in is immediately replaced
                by the result of ``parse_args()``.

        Returns:
            list: the entries of ``result_loss`` (average cost per token
            minus ``loss_normalizer``, one entry per fetched step), each
            rounded to 6 decimals. The ``return`` sits inside the epoch
            loop, so it fires at the end of the first pass; if
            ``args.num_epochs`` is 0 the loop body never runs and the
            method returns None.
        """
        # The `args` parameter is overwritten here.
        args = parse_args()
        logging.info(args)
        # Falls back to device id 4 when FLAGS_selected_gpus is not set.
        gpu_id = int(os.environ.get('FLAGS_selected_gpus', 4))
        place = fluid.CUDAPlace(gpu_id)
        dev_count = 1
        exe = fluid.Executor(place)
        train_program = fluid.Program()
        startup_program = fluid.Program()
        args.num_trainers = fleet.worker_num()
        args.trainer_id = fleet.worker_index()
        # run_params arrives as a JSON string and is decoded in place.
        args.run_params = json.loads(args.run_params)
        # Copy the relevant run parameters onto the distributed strategy.
        dist_strategy = DistributedStrategy()
        dist_strategy.enable_inplace = args.run_params['enable_inplace']
        dist_strategy.fuse_all_reduce_ops = args.run_params[
            'fuse_all_reduce_ops']
        dist_strategy.nccl_comm_num = args.run_params['nccl_comm_num']
        dist_strategy.use_local_sgd = args.run_params['use_local_sgd']
        dist_strategy.mode = args.run_params["mode"]
        dist_strategy.collective_mode = args.run_params["collective"]

        with fluid.program_guard(train_program, startup_program):
            with fluid.unique_name.guard():
                sum_cost, avg_cost, predict, token_num, pyreader = transformer(
                    ModelHyperParams.src_vocab_size,
                    ModelHyperParams.trg_vocab_size,
                    ModelHyperParams.max_length + 1,
                    ModelHyperParams.n_layer,
                    ModelHyperParams.n_head,
                    ModelHyperParams.d_key,
                    ModelHyperParams.d_value,
                    ModelHyperParams.d_model,
                    ModelHyperParams.d_inner_hid,
                    ModelHyperParams.prepostprocess_dropout,
                    ModelHyperParams.attention_dropout,
                    ModelHyperParams.relu_dropout,
                    ModelHyperParams.preprocess_cmd,
                    ModelHyperParams.postprocess_cmd,
                    ModelHyperParams.weight_sharing,
                    TrainTaskConfig.label_smooth_eps,
                    ModelHyperParams.bos_idx,
                    use_py_reader=args.use_py_reader,
                    is_test=False)
                # Plain SGD with a fixed learning rate, optionally decorated
                # when the "fp16" run parameter is set, then wrapped by the
                # fleet distributed optimizer.
                optimizer = fluid.optimizer.SGD(0.003)
                if args.run_params["fp16"]:
                    optimizer = decorate(optimizer, init_loss_scaling=64.0)
                optimizer = fleet.distributed_optimizer(optimizer,
                                                        strategy=dist_strategy)
                optimizer.minimize(avg_cost, startup_program)
        # From here on the program provided by fleet is the one executed.
        train_program = fleet.main_program
        exe.run(startup_program)
        train_data = prepare_data_generator(
            args,
            is_test=False,
            count=dev_count,
            pyreader=pyreader,
            py_reader_provider_wrapper=py_reader_provider_wrapper)

        # Constant derived from label_smooth_eps and trg_vocab_size that is
        # subtracted from the average cost below (presumably the best
        # achievable label-smoothed loss -- TODO confirm). The 1e-20 keeps
        # the log argument away from zero.
        loss_normalizer = -(
            (1. - TrainTaskConfig.label_smooth_eps) * np.log(
                (1. - TrainTaskConfig.label_smooth_eps)) +
            TrainTaskConfig.label_smooth_eps *
            np.log(TrainTaskConfig.label_smooth_eps /
                   (ModelHyperParams.trg_vocab_size - 1) + 1e-20))

        step_idx = 0
        init_flag = True
        result_loss = []
        result_ppl = []
        train_info = []
        for pass_id in six.moves.xrange(args.num_epochs):
            pass_start_time = time.time()
            if args.use_py_reader:
                pyreader.start()
                data_generator = None
            else:
                data_generator = train_data()
            batch_id = 0
            while True:
                try:
                    feed_dict_list = prepare_feed_dict_list(
                        data_generator, init_flag, dev_count)
                    t1 = time.time()
                    # Cost and token count are fetched only every
                    # args.fetch_steps steps; other steps run with an empty
                    # fetch list.
                    outs = exe.run(program=train_program,
                                   fetch_list=[sum_cost.name, token_num.name]
                                   if step_idx % args.fetch_steps == 0 else [],
                                   feed=feed_dict_list)

                    if step_idx % args.fetch_steps == 0:
                        sum_cost_val, token_num_val = np.array(
                            outs[0]), np.array(outs[1])
                        total_sum_cost = sum_cost_val.sum()
                        total_token_num = token_num_val.sum()
                        total_avg_cost = total_sum_cost / total_token_num
                        result_loss.append(total_avg_cost - loss_normalizer)
                        # The exponent is capped at 100 before np.exp.
                        result_ppl.append(
                            np.exp([min(total_avg_cost, 100)]).item(0))
                        # Appends the result_loss list object itself (not a
                        # copy), so train_info[0] sees every later append.
                        train_info.append(result_loss)
                    init_flag = False
                    batch_id += 1
                    step_idx += 1
                    # Each pass stops after 5 batches.
                    if batch_id >= 5:
                        break
                except (StopIteration, fluid.core.EOFException):
                    # Data exhausted: reset the reader (if used) and end
                    # the pass.
                    if args.use_py_reader:
                        pyreader.reset()
                    break

            # Still inside the epoch loop: the method returns after the
            # first pass. train_info[0] raises IndexError if no step was
            # fetched during that pass.
            train_info = [round(i, 6) for i in train_info[0]]
            return train_info
Example #9
0
def train(args):
    """Build the transformer training program and run it locally.

    Distributed execution is not supported by this script: when
    ``args.local`` is falsy (or the ``PADDLE_IS_LOCAL`` environment variable
    is ``"0"``) a message is printed and the process exits with status 0.
    The nccl2 / parameter-server code that used to follow that exit could
    never execute and has been removed.

    Environment variables read: ``PADDLE_IS_LOCAL``, ``TRAINING_ROLE``,
    ``CPU_NUM`` and ``FLAGS_selected_gpus``.

    Args:
        args: Parsed run arguments. Attributes read here: ``device``,
            ``local``, ``enable_ce``, ``use_py_reader`` and ``sync``.
            ``args.local`` is set to False when ``PADDLE_IS_LOCAL`` is
            ``"0"``. ``args.use_mem_opt`` no longer has any effect (the
            branch it guarded was a no-op).

    Raises:
        SystemExit: With status 0 when local mode is not selected.
    """
    # priority: ENV > args > config
    is_local = os.getenv("PADDLE_IS_LOCAL", "1")
    if is_local == '0':
        args.local = False
    logging.info(args)

    if args.device == 'CPU':
        TrainTaskConfig.use_gpu = False

    training_role = os.getenv("TRAINING_ROLE", "TRAINER")

    # Parameter servers and CPU-only runs execute on the CPU; everything
    # else uses the GPU selected through FLAGS_selected_gpus (default 0).
    if training_role == "PSERVER" or (not TrainTaskConfig.use_gpu):
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    else:
        gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
        place = fluid.CUDAPlace(gpu_id)
        dev_count = get_device_num()

    update_lr(TrainTaskConfig)

    exe = fluid.Executor(place)
    train_prog = fluid.Program()
    startup_prog = fluid.Program()

    # Fixed seeds when args.enable_ce is set.
    if args.enable_ce:
        train_prog.random_seed = 1000
        startup_prog.random_seed = 1000

    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            sum_cost, avg_cost, predict, token_num, pyreader = transformer(
                ModelHyperParams.src_vocab_size,
                ModelHyperParams.trg_vocab_size,
                ModelHyperParams.max_length + 1,
                ModelHyperParams.n_layer,
                ModelHyperParams.n_head,
                ModelHyperParams.d_key,
                ModelHyperParams.d_value,
                ModelHyperParams.d_model,
                ModelHyperParams.d_inner_hid,
                ModelHyperParams.prepostprocess_dropout,
                ModelHyperParams.attention_dropout,
                ModelHyperParams.relu_dropout,
                ModelHyperParams.preprocess_cmd,
                ModelHyperParams.postprocess_cmd,
                ModelHyperParams.weight_sharing,
                TrainTaskConfig.label_smooth_eps,
                use_py_reader=args.use_py_reader,
                is_test=False)

            if args.sync:
                # Adam driven by a Noam-decayed learning rate.
                lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
                    ModelHyperParams.d_model, TrainTaskConfig.warmup_steps)
                logging.info("before adam")

                with fluid.default_main_program()._lr_schedule_guard():
                    learning_rate = lr_decay * TrainTaskConfig.learning_rate

                optimizer = fluid.optimizer.Adam(learning_rate=learning_rate,
                                                 beta1=TrainTaskConfig.beta1,
                                                 beta2=TrainTaskConfig.beta2,
                                                 epsilon=TrainTaskConfig.eps)
            else:
                # Plain SGD with a fixed learning rate.
                optimizer = fluid.optimizer.SGD(0.003)
            optimizer.minimize(avg_cost)

    if not args.local:
        # Exit status 0 is kept as-is; callers may depend on it.
        print("This script cannot run in distributed mode.")
        sys.exit(0)

    logging.info("local start_up:")
    train_loop(exe, train_prog, startup_prog, dev_count, sum_cost,
               avg_cost, token_num, predict, pyreader)
Example #10
0
def main(args):
    """Build the Transformer training graph and time a fixed number of steps.

    The model and its Adam optimizer are built inside dedicated train/startup
    programs. Parameters are restored from ``TrainTaskConfig.ckpt_path`` when
    that is set, otherwise the startup program is run. Five start-up
    iterations are executed first, then ``args.iter_num`` timed iterations
    (optionally under the op profiler), and the accumulated timings are
    printed.

    Args:
        args: Parsed options. This function reads ``use_py_reader``,
            ``use_mem_opt``, ``use_parallel_exe``, ``profile_ops`` and
            ``iter_num``; the whole object is also handed to
            ``prepare_data_generator``.
    """
    train_prog = fluid.Program()
    startup_prog = fluid.Program()
    # Both programs use the same fixed seed.
    train_prog.random_seed = 1000
    startup_prog.random_seed = 1000
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            sum_cost, avg_cost, predict, token_num, pyreader = transformer(
                ModelHyperParams.src_vocab_size,
                ModelHyperParams.trg_vocab_size,
                ModelHyperParams.max_length + 1,
                ModelHyperParams.n_layer,
                ModelHyperParams.n_head,
                ModelHyperParams.d_key,
                ModelHyperParams.d_value,
                ModelHyperParams.d_model,
                ModelHyperParams.d_inner_hid,
                ModelHyperParams.prepostprocess_dropout,
                ModelHyperParams.attention_dropout,
                ModelHyperParams.relu_dropout,
                ModelHyperParams.preprocess_cmd,
                ModelHyperParams.postprocess_cmd,
                ModelHyperParams.weight_sharing,
                TrainTaskConfig.label_smooth_eps,
                use_py_reader=args.use_py_reader,
                is_test=False)
            lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
                ModelHyperParams.d_model, TrainTaskConfig.warmup_steps)
            optimizer = fluid.optimizer.Adam(
                learning_rate=lr_decay * TrainTaskConfig.learning_rate,
                beta1=TrainTaskConfig.beta1,
                beta2=TrainTaskConfig.beta2,
                epsilon=TrainTaskConfig.eps)
            optimizer.minimize(avg_cost)

    if args.use_mem_opt:
        fluid.memory_optimize(train_prog)

    if TrainTaskConfig.use_gpu:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)
    # Initialize the parameters.
    if TrainTaskConfig.ckpt_path:
        # The model lives in `train_prog`, not in the default main program.
        # Without an explicit `main_program`, load_persistables() restores
        # the variables of the default main program instead, so the program
        # to restore into has to be named here.
        fluid.io.load_persistables(
            exe, TrainTaskConfig.ckpt_path, main_program=train_prog)
    else:
        exe.run(startup_prog)

    exec_strategy = fluid.ExecutionStrategy()
    # For faster executor
    exec_strategy.use_experimental_executor = True
    exec_strategy.num_iteration_per_drop_scope = 5
    build_strategy = fluid.BuildStrategy()
    # Since the token number differs among devices, customize gradient scale to
    # use token average cost among multi-devices. and the gradient scale is
    # `1 / token_number` for average cost.
    build_strategy.gradient_scale_strategy = fluid.BuildStrategy.GradientScaleStrategy.Customized
    train_exe = fluid.ParallelExecutor(
        use_cuda=TrainTaskConfig.use_gpu,
        loss_name=avg_cost.name,
        main_program=train_prog,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    # the best cross-entropy value with label smoothing
    loss_normalizer = -((1. - TrainTaskConfig.label_smooth_eps) * np.log(
        (1. - TrainTaskConfig.label_smooth_eps
         )) + TrainTaskConfig.label_smooth_eps *
                        np.log(TrainTaskConfig.label_smooth_eps / (
                            ModelHyperParams.trg_vocab_size - 1) + 1e-20))

    train_data = prepare_data_generator(
        args, is_test=False, count=dev_count, pyreader=pyreader)
    if args.use_py_reader:
        # With the py_reader no Python-side generator is passed to
        # prepare_feed_dict_list().
        pyreader.start()
        data_generator = None
    else:
        data_generator = train_data()

    def run(iter_num):
        """Run `iter_num` training steps.

        Returns:
            (reader_time, run_time): two lists with the per-step wall-clock
            seconds spent preparing the feed and executing the program.
        """
        reader_time = []
        run_time = []

        for step_idx in six.moves.xrange(iter_num):
            try:
                start_time = time.time()
                # `init_flag` is looked up in the enclosing scope at call
                # time, so it reflects whatever main() last assigned.
                feed_dict_list = prepare_feed_dict_list(data_generator,
                                                        init_flag, dev_count)
                end_time = time.time()
                reader_time.append(end_time - start_time)

                start_time = time.time()
                if args.use_parallel_exe:
                    outs = train_exe.run(
                        fetch_list=[sum_cost.name, token_num.name],
                        feed=feed_dict_list)
                else:
                    outs = exe.run(program=train_prog,
                                   fetch_list=[sum_cost.name, token_num.name],
                                   feed=feed_dict_list[0]
                                   if feed_dict_list is not None else None)
                end_time = time.time()
                run_time.append(end_time - start_time)

                sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[
                    1])
                # sum the cost from multi-devices
                total_sum_cost = sum_cost_val.sum()
                total_token_num = token_num_val.sum()
                total_avg_cost = total_sum_cost / total_token_num
                # The exponent is capped at 100 before computing perplexity.
                print("step_idx: %d, avg loss: %f, "
                      "normalized loss: %f, ppl: %f" %
                      (step_idx, total_avg_cost,
                       total_avg_cost - loss_normalizer,
                       np.exp([min(total_avg_cost, 100)])))
            except (StopIteration, fluid.core.EOFException):
                # The current pass is over.
                if args.use_py_reader:
                    pyreader.reset()
                    pyreader.start()

        return reader_time, run_time

    @contextlib.contextmanager
    def profile_context(profile=True):
        """Wrap the body in the op profiler when `profile` is true."""
        if profile:
            with profiler.profiler('All', 'total', '/tmp/profile_file'):
                yield
        else:
            yield

    # start-up
    init_flag = True
    run(5)
    init_flag = False

    # profiling
    start = time.time()
    # currently only support profiling on one device
    with profile_context(args.profile_ops):
        reader_time, run_time = run(args.iter_num)
    end = time.time()
    total_time = end - start
    print(
        "Total time: {0}, reader time: {1} s, run time: {2} s, step number: {3}".
        format(total_time, np.sum(reader_time), np.sum(run_time),
               args.iter_num))
Example #11
0
import gradio as gr
import tensorflow as tf
import tensorflow_datasets as tfds
from model import transformer
from main import predict
import pickle

# Load the metadata stored next to the pretrained weights. Its 'vocab_size'
# entry sizes the model below and the whole object is later passed to
# predict().
# NOTE(review): pickle.load can execute arbitrary code while loading -- only
# point this at a trusted file.
with open('pretrained_weights/meta.pickle', 'rb') as handle:
    meta = pickle.load(handle)

# Restore the subword tokenizer from its saved file.
tokenizer = tfds.features.text.SubwordTextEncoder.load_from_file(
    'pretrained_weights/tokenizer')

# Build the network with fixed hyper-parameters.
# NOTE(review): these presumably have to match the configuration the weights
# file below was trained with -- confirm before changing any of them.
model = transformer(d_model=256,
                    vocab_size=meta['vocab_size'],
                    num_layers=4,
                    num_heads=8,
                    dff=1024)

model.load_weights('pretrained_weights/transformer_weights.h5')


def chatbot(sentence):
    """Answer one sentence with the pretrained model.

    Both the question and the generated answer are echoed to stdout; the
    answer is returned to the caller.
    """
    print('\nQ:', sentence)
    answer = predict(model, tokenizer, sentence, meta)
    print('\nA:', answer)
    return answer


# Expose chatbot() through a text-in / text-out Gradio interface.
# NOTE(review): share=True asks Gradio for a publicly reachable link -- confirm
# that this is intended for the deployment environment.
gr.Interface(chatbot, inputs="text", outputs="text").launch(share=True)
Example #12
0
# Batch the dataset and let tf.data pick the prefetch buffer size.
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

# Reset Keras' global state before the model is built.
tf.keras.backend.clear_session()

# Hyper-parameters
NUM_LAYERS = 4
D_MODEL = 312
NUM_HEADS = 8
UNITS = 768
DROPOUT = 0.2

# Build the network from the hyper-parameters above; VOCAB_SIZE is defined
# earlier in the script.
model = transformer(
    vocab_size=VOCAB_SIZE,
    num_layers=NUM_LAYERS,
    units=UNITS,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    dropout=DROPOUT)


def loss_function(y_true, y_pred):
  """Sparse cross-entropy on logits with label-0 positions zeroed out.

  The labels are reshaped to (-1, preprocessor.MAX_LENGTH - 1), the
  unreduced loss is multiplied by a 0/1 mask that is 0 wherever the label
  equals 0, and the mean over all positions is returned.
  """
  labels = tf.reshape(y_true, shape=(-1, preprocessor.MAX_LENGTH - 1))

  cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(
      from_logits=True, reduction='none')
  per_position = cross_entropy(labels, y_pred)

  # 1.0 where the label is non-zero, 0.0 elsewhere.
  keep = tf.cast(tf.not_equal(labels, 0), tf.float32)

  return tf.reduce_mean(tf.multiply(per_position, keep))
Example #13
0
def main():
    """Train the Transformer locally or as one node of a pserver cluster.

    The run mode is taken from the environment:

    * ``PADDLE_IS_LOCAL``: ``"0"`` (also the default when unset) selects
      distributed mode, any other value selects local mode.
    * ``TRAINING_ROLE``: ``"TRAINER"`` (default) or ``"PSERVER"``; a
      parameter server always uses the CPU place.
    * Distributed mode additionally reads ``PADDLE_PORT`` (default
      ``"6174"``), ``PADDLE_PSERVERS`` (comma-separated IPs),
      ``PADDLE_TRAINERS_NUM``, ``POD_IP`` and ``PADDLE_TRAINER_ID``.

    All other settings come from the module-level ``args``.
    """
    is_local = os.getenv("PADDLE_IS_LOCAL", "0")
    args.local = is_local != '0'
    # init
    place = fluid.CUDAPlace(0) if args.device == 'GPU' else fluid.CPUPlace()
    training_role = os.getenv("TRAINING_ROLE", "TRAINER")
    if training_role == "PSERVER":
        place = fluid.CPUPlace()
    exe = fluid.Executor(place)

    sum_cost, avg_cost, predict, token_num = transformer(
        ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
        ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
        ModelHyperParams.n_head, ModelHyperParams.d_key,
        ModelHyperParams.d_value, ModelHyperParams.d_model,
        ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
        ModelHyperParams.src_pad_idx, ModelHyperParams.trg_pad_idx,
        ModelHyperParams.pos_pad_idx)

    warmup_steps = get_var("warmup_steps", value=TrainTaskConfig.warmup_steps)
    d_model = get_var("d_model", value=ModelHyperParams.d_model)

    lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
        d_model, warmup_steps)

    optimizer = fluid.optimizer.Adam(
        learning_rate=lr_decay,
        beta1=TrainTaskConfig.beta1,
        beta2=TrainTaskConfig.beta2,
        epsilon=TrainTaskConfig.eps)
    optimizer.minimize(
        avg_cost if TrainTaskConfig.use_avg_cost else sum_cost)

    # Program to do validation.
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        inference_program = fluid.io.get_inference_program([avg_cost])

    def test(exe):
        """Return (average cost, perplexity) over `test_reader`."""
        test_total_cost = 0
        test_total_token = 0
        for batch_id, data in enumerate(test_reader()):
            data_input = prepare_batch_input(
                data, encoder_input_data_names + decoder_input_data_names[:-1] +
                label_data_names, ModelHyperParams.eos_idx,
                ModelHyperParams.eos_idx, ModelHyperParams.n_head,
                ModelHyperParams.d_model)
            test_sum_cost, test_token_num = exe.run(
                inference_program,
                feed=data_input,
                fetch_list=[sum_cost, token_num],
                use_program_cache=True)
            test_total_cost += test_sum_cost
            test_total_token += test_token_num
        test_avg_cost = test_total_cost / test_total_token
        test_ppl = np.exp([min(test_avg_cost, 100)])
        return test_avg_cost, test_ppl

    def train_loop(exe, trainer_prog):
        """Run `args.pass_num` passes over `train_reader`, saving after each.

        `train_reader` is resolved from the enclosing scope, so it must be
        assigned before this function is called.
        """
        for pass_id in xrange(args.pass_num):
            ts = time.time()
            total = 0
            pass_start_time = time.time()
            for batch_id, data in enumerate(train_reader):
                # Batches that are not exactly `args.batch_size` long are
                # skipped.
                if len(data) != args.batch_size:
                    continue

                total += len(data)
                start_time = time.time()
                data_input = prepare_batch_input(
                    data, encoder_input_data_names + decoder_input_data_names[:-1] +
                    label_data_names, ModelHyperParams.eos_idx,
                    ModelHyperParams.eos_idx, ModelHyperParams.n_head,
                    ModelHyperParams.d_model)

                outs = exe.run(trainer_prog,
                               feed=data_input,
                               fetch_list=[sum_cost, avg_cost],
                               use_program_cache=True)
                sum_cost_val, avg_cost_val = np.array(outs[0]), np.array(outs[1])
                print("epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f, speed: %.2f" %
                      (pass_id, batch_id, sum_cost_val, avg_cost_val,
                       np.exp([min(avg_cost_val[0], 100)]),
                       len(data) / (time.time() - start_time)))

                if args.test_save:
                    if batch_id == args.exit_batch_id:
                        print("batch_id: %d exit!" % batch_id)
                        break

            # Validate and save the model for inference.
            # Validation is currently disabled; zeros are reported instead.
            # val_avg_cost, val_ppl = test(exe)
            val_avg_cost, val_ppl = 0, 0
            pass_end_time = time.time()
            time_consumed = pass_end_time - pass_start_time
            print("pass_id = %s time_consumed = %s val_avg_cost=%f val_ppl=%f speed: %.2f" %
                  (str(pass_id), str(time_consumed),
                   val_avg_cost, val_ppl, total / (time.time() - ts)))

            fluid.io.save_inference_model(
                os.path.join(args.model_path,
                             "pass_" + str(pass_id) + "_" + str(args.task_index) + ".infer.model"),
                encoder_input_data_names + decoder_input_data_names[:-1],
                [predict], exe)

            if args.test_save:
                break

    if args.local:
        # Initialize the parameters.
        print("local start_up:")
        exe.run(fluid.framework.default_startup_program())
        # Overwrite the position-encoding parameters with the table from
        # position_encoding_init().
        for pos_enc_param_name in pos_enc_param_names:
            pos_enc_param = fluid.global_scope().find_var(
                pos_enc_param_name).get_tensor()
            pos_enc_param.set(
                position_encoding_init(ModelHyperParams.max_length + 1,
                                       ModelHyperParams.d_model), place)

        # The pattern contains no format specifier, so it must not be
        # %-formatted with args.task_index: doing so raises TypeError ("not
        # all arguments converted") and local training never starts.
        train_reader = data_util.DataLoader(
            src_vocab_fpath="./thirdparty/nist06n/cn_30001.dict",
            trg_vocab_fpath="./thirdparty/nist06n/en_30001.dict",
            fpattern="./train/*",
            batch_size=args.batch_size,
            token_batch_size=TrainTaskConfig.token_batch_size,
            sort_by_length=TrainTaskConfig.sort_by_length,
            shuffle=True)

        train_loop(exe, fluid.default_main_program())
    else:
        port = os.getenv("PADDLE_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_PSERVERS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
        trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
        t = fluid.DistributeTranspiler()
        t.transpile(
            trainer_id,
            pservers=pserver_endpoints,
            trainers=trainers)

        if training_role == "PSERVER":
            current_endpoint = os.getenv("POD_IP") + ":" + os.getenv(
                "PADDLE_PORT")
            if not current_endpoint:
                print("need env SERVER_ENDPOINT")
                exit(1)
            pserver_prog = t.get_pserver_program(current_endpoint)
            pserver_startup = t.get_startup_program(current_endpoint,
                                                    pserver_prog)

            if args.save_graph:
                # The loop variable is deliberately not named `t`, which
                # holds the transpiler.
                for block_no, block in enumerate(pserver_startup.blocks):
                    block_name = "pserver_startup_block_%04d" % block_no
                    print(block_name)
                    print(debuger.draw_block_graphviz(
                        block, path="./" + block_name + ".dot"))

                for block_no, block in enumerate(pserver_prog.blocks):
                    block_name = "pserver_prog_block_%04d" % block_no
                    print(debuger.draw_block_graphviz(
                        block, path="./" + block_name + ".dot"))

            print("begin run")
            exe.run(pserver_startup)
            exe.run(pserver_prog)
        elif training_role == "TRAINER":
            # Parameter initialization
            exe.run(fluid.default_startup_program())

            for pos_enc_param_name in pos_enc_param_names:
                pos_enc_param = fluid.global_scope().find_var(
                    pos_enc_param_name).get_tensor()
                pos_enc_param.set(
                    position_encoding_init(ModelHyperParams.max_length + 1,
                                           ModelHyperParams.d_model), place)

            train_reader = data_util.DataLoader(
                src_vocab_fpath="./thirdparty/nist06n/cn_30001.dict",
                trg_vocab_fpath="./thirdparty/nist06n/en_30001.dict",
                fpattern="./train/part-*",
                batch_size=args.batch_size,
                token_batch_size=TrainTaskConfig.token_batch_size,
                sort_by_length=TrainTaskConfig.sort_by_length,
                shuffle=True)

            trainer_prog = t.get_trainer_program()
            train_loop(exe, trainer_prog)
        else:
            # The variable read above is TRAINING_ROLE.
            print("environment var TRAINING_ROLE should be TRAINER or PSERVER")
Example #14
0
def train(args):
    """Set up collective (fleet) training for the Transformer and run it.

    Initializes fleet, records the worker count and index on ``args``
    (``num_trainers`` / ``trainer_id``), builds the model and a distributed
    optimizer, then hands everything to ``train_loop``.
    """
    logging.info(args)

    # The GPU index comes from FLAGS_selected_gpus (0 when unset).
    selected_gpu = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(selected_gpu)
    dev_count = 1

    exe = fluid.Executor(place)

    train_program = fluid.Program()
    startup_program = fluid.Program()

    # For Distributed Training.
    fleet.init(role_maker.PaddleCloudRoleMaker(is_collective=True))
    args.num_trainers = fleet.worker_num()
    args.trainer_id = fleet.worker_index()
    dist_strategy = DistributedStrategy()

    with fluid.program_guard(train_program, startup_program):
        with fluid.unique_name.guard():
            sum_cost, avg_cost, predict, token_num, pyreader = transformer(
                ModelHyperParams.src_vocab_size,
                ModelHyperParams.trg_vocab_size,
                ModelHyperParams.max_length + 1,
                ModelHyperParams.n_layer,
                ModelHyperParams.n_head,
                ModelHyperParams.d_key,
                ModelHyperParams.d_value,
                ModelHyperParams.d_model,
                ModelHyperParams.d_inner_hid,
                ModelHyperParams.prepostprocess_dropout,
                ModelHyperParams.attention_dropout,
                ModelHyperParams.relu_dropout,
                ModelHyperParams.preprocess_cmd,
                ModelHyperParams.postprocess_cmd,
                ModelHyperParams.weight_sharing,
                TrainTaskConfig.label_smooth_eps,
                ModelHyperParams.bos_idx,
                use_py_reader=args.use_py_reader,
                is_test=False)

            if not args.sync:
                # Fixed-rate SGD when sync mode is off.
                optimizer = fluid.optimizer.SGD(0.003)
            else:
                # Adam driven by a Noam schedule scaled by the configured
                # base learning rate.
                noam = fluid.layers.learning_rate_scheduler.noam_decay(
                    ModelHyperParams.d_model, TrainTaskConfig.warmup_steps)

                with fluid.default_main_program()._lr_schedule_guard():
                    scaled_lr = noam * TrainTaskConfig.learning_rate

                optimizer = fluid.optimizer.Adam(
                    learning_rate=scaled_lr,
                    beta1=TrainTaskConfig.beta1,
                    beta2=TrainTaskConfig.beta2,
                    epsilon=TrainTaskConfig.eps)

            if args.use_fp16:
                optimizer = decorate(
                    optimizer, init_loss_scaling=args.loss_scaling)

            # Wrap last so fleet sees the final (possibly decorated)
            # optimizer.
            optimizer = fleet.distributed_optimizer(
                optimizer, strategy=dist_strategy)
            optimizer.minimize(avg_cost, startup_program)

    # Train with fleet's main program; its original program is passed along
    # as well.
    train_loop(args, exe, fleet.main_program, fleet._origin_program,
               startup_program, dev_count, sum_cost, avg_cost, token_num,
               predict, pyreader)
def get_logits(input_ids, mems, input_mask, target_mask):
    """Builds the graph for calculating the final logits.

    Args:
        input_ids: Input ids; transposed with perm [1, 0] before being fed
            to the model, so it must be rank 2.
        mems: Memory passed through to ``model.transformer`` unchanged.
        input_mask: Mask for the input, transposed with perm [1, 0].
        target_mask: Mask for the target, transposed with perm [1, 0].

    Returns:
        ``(logits, new_mems)`` as returned by ``model.transformer``.

    Raises:
        ValueError: If ``FLAGS.init`` is neither ``"uniform"`` nor
            ``"normal"``.
    """
    is_training = False

    # `cutoffs` is empty, so `tie_projs` below ends up as the single entry
    # [False].
    cutoffs = []
    proj_share_all_but_first = True
    n_token = FLAGS.n_token
    batch_size = FLAGS.batch_size

    inp = tf.transpose(input_ids, [1, 0])
    input_mask = tf.transpose(input_mask, [1, 0])
    target_mask = tf.transpose(target_mask, [1, 0])
    # No target and no permutation inputs are supplied.
    tgt = None
    inp_perms, tgt_perms, head_tgt = None, None, None

    if FLAGS.init == "uniform":
        initializer = tf.initializers.random_uniform(minval=-FLAGS.init_range,
                                                     maxval=FLAGS.init_range,
                                                     seed=None)
    elif FLAGS.init == "normal":
        initializer = tf.initializers.random_normal(stddev=FLAGS.init_std,
                                                    seed=None)
    else:
        # Fail here with a clear message rather than with an
        # UnboundLocalError at the model.transformer() call below.
        raise ValueError(
            "Unsupported FLAGS.init: {!r} (expected 'uniform' or "
            "'normal')".format(FLAGS.init))

    # Every projection except the first is tied when sharing is enabled.
    tie_projs = [
        proj_share_all_but_first and i > 0 for i in range(len(cutoffs) + 1)
    ]

    tf.logging.info("Vocab size : {}".format(n_token))
    tf.logging.info("Batch size : {}".format(batch_size))

    logits, new_mems = model.transformer(dec_inp=inp,
                                         target=tgt,
                                         mems=mems,
                                         n_token=n_token,
                                         n_layer=FLAGS.n_layer,
                                         d_model=FLAGS.d_model,
                                         d_embed=FLAGS.d_embed,
                                         n_head=FLAGS.n_head,
                                         d_head=FLAGS.d_head,
                                         d_inner=FLAGS.d_inner,
                                         dropout=0,
                                         dropatt=0,
                                         initializer=initializer,
                                         is_training=is_training,
                                         mem_len=FLAGS.seq_len +
                                         FLAGS.max_decode_length,
                                         cutoffs=cutoffs,
                                         div_val=1,
                                         tie_projs=tie_projs,
                                         input_perms=inp_perms,
                                         target_perms=tgt_perms,
                                         head_target=head_tgt,
                                         same_length=FLAGS.same_length,
                                         clamp_len=FLAGS.clamp_len,
                                         use_tpu=FLAGS.use_tpu,
                                         untie_r=FLAGS.untie_r,
                                         proj_same_dim=True,
                                         bidirectional_mask=FLAGS.bi_mask,
                                         infer=True,
                                         target_mask=target_mask,
                                         input_mask=input_mask,
                                         tgt_len=1)

    return logits, new_mems
Example #16
0
def main():
    """Train the Transformer on the NIST data and save a model after each pass.

    Builds the network and an Adam optimizer whose learning rate is driven
    by ``LearningRateScheduler``, initializes the parameters, then runs
    ``TrainTaskConfig.pass_num`` passes, printing the loss per batch.
    """
    if TrainTaskConfig.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()
    exe = fluid.Executor(place)

    sum_cost, avg_cost, predict, token_num = transformer(
        ModelHyperParams.src_vocab_size + 0,
        ModelHyperParams.trg_vocab_size + 0,
        ModelHyperParams.max_length + 1,
        ModelHyperParams.n_layer,
        ModelHyperParams.n_head,
        ModelHyperParams.d_key,
        ModelHyperParams.d_value,
        ModelHyperParams.d_model,
        ModelHyperParams.d_inner_hid,
        ModelHyperParams.dropout,
        ModelHyperParams.src_pad_idx,
        ModelHyperParams.trg_pad_idx,
        ModelHyperParams.pos_pad_idx)

    lr_scheduler = LearningRateScheduler(
        ModelHyperParams.d_model, TrainTaskConfig.warmup_steps, place,
        TrainTaskConfig.learning_rate)
    optimizer = fluid.optimizer.Adam(
        learning_rate=lr_scheduler.learning_rate,
        beta1=TrainTaskConfig.beta1,
        beta2=TrainTaskConfig.beta2,
        epsilon=TrainTaskConfig.eps)
    # The optimized objective is selected by the task config.
    objective = avg_cost if TrainTaskConfig.use_avg_cost else sum_cost
    optimizer.minimize(objective)

    # Reader pipeline: raw samples -> shuffled -> batched.
    sample_reader = nist_data_provider.train(
        "data", ModelHyperParams.src_vocab_size,
        ModelHyperParams.trg_vocab_size)
    shuffled_reader = paddle.reader.shuffle(sample_reader, buf_size=100000)
    train_data = paddle.batch(
        shuffled_reader, batch_size=TrainTaskConfig.batch_size)

    # Initialize the parameters.
    exe.run(fluid.framework.default_startup_program())
    # Overwrite each position-encoding parameter with the table produced by
    # position_encoding_init().
    for name in pos_enc_param_names:
        tensor = fluid.global_scope().find_var(name).get_tensor()
        tensor.set(
            position_encoding_init(ModelHyperParams.max_length + 1,
                                   ModelHyperParams.d_model), place)

    for pass_id in xrange(TrainTaskConfig.pass_num):
        pass_start_time = time.time()
        for batch_id, data in enumerate(train_data()):
            feed_names = (encoder_input_data_names +
                          decoder_input_data_names[:-1] + label_data_names)
            feed = prepare_batch_input(
                data, feed_names, ModelHyperParams.src_pad_idx,
                ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head,
                ModelHyperParams.d_model)
            lr_scheduler.update_learning_rate(feed)
            fetched = exe.run(fluid.framework.default_main_program(),
                              feed=feed,
                              fetch_list=[sum_cost, avg_cost],
                              use_program_cache=True)
            sum_cost_val = np.array(fetched[0])
            avg_cost_val = np.array(fetched[1])
            # The exponent is capped at 100 before computing perplexity.
            print("epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f" %
                  (pass_id, batch_id, sum_cost_val, avg_cost_val,
                   np.exp([min(avg_cost_val[0], 100)])))
        time_consumed = time.time() - pass_start_time
        print("pass_id = %s time_consumed = %s" % (pass_id, time_consumed))
        save_dir = os.path.join(TrainTaskConfig.model_dir,
                                "pass_%d.infer.model" % pass_id)
        fluid.io.save_inference_model(
            save_dir,
            encoder_input_data_names + decoder_input_data_names[:-1],
            [predict], exe)
Example #17
0
def main(args):
    """Train two image transformers with a cycle-consistency loss.

    One transformer maps source to target (scope ``model/s_to_t``) and the
    other target to source (scope ``model/t_to_s``). Each batch is
    transformed and then mapped back through the other transformer with
    ``reuse=True``; the absolute difference between the original and the
    cycled batch is the loss. Summaries and checkpoints are written to
    ``args.output_dir`` on a wall-clock schedule.

    Args:
        args: Parsed options. Reads ``source``, ``target``, ``split``,
            ``preprocessing``, ``image_size``, ``output_channels``,
            ``batch_size``, ``shuffle``, ``learning_rate``, ``beta1``,
            ``beta2``, ``epsilon``, ``output_dir``, ``num_batches``,
            ``log_every_n_seconds`` and ``save_every_n_seconds``.
    """

    # get datasets
    source_dataset = data.get_dataset(args.source, args.split)
    target_dataset = data.get_dataset(args.target, args.split)

    im_s = preprocess(source_dataset.x,
                      args.preprocessing,
                      image_size=args.image_size,
                      output_channels=args.output_channels)
    label_s = source_dataset.y

    im_t = preprocess(target_dataset.x,
                      args.preprocessing,
                      image_size=args.image_size,
                      output_channels=args.output_channels)
    label_t = target_dataset.y

    im_batch_s, label_batch_s, im_batch_t, label_batch_t = data.create_batch(
        [im_s, label_s, im_t, label_t],
        batch_size=args.batch_size,
        shuffle=args.shuffle)

    # build models

    transformed_s = model.transformer(im_batch_s, scope='model/s_to_t')
    transformed_t = model.transformer(im_batch_t, scope='model/t_to_s')

    # Map each transformed batch back through the opposite transformer,
    # reusing the variables created above.
    cycled_s = model.transformer(transformed_s,
                                 scope='model/t_to_s',
                                 reuse=True)
    cycled_t = model.transformer(transformed_t,
                                 scope='model/s_to_t',
                                 reuse=True)

    # create loss functions

    cycle_loss_s = tf.losses.absolute_difference(im_batch_s,
                                                 cycled_s,
                                                 scope='cycle_loss_s')
    cycle_loss_t = tf.losses.absolute_difference(im_batch_t,
                                                 cycled_t,
                                                 scope='cycle_loss_t')

    total_loss = cycle_loss_s + cycle_loss_t

    optimizer = tf.train.AdamOptimizer(args.learning_rate, args.beta1,
                                       args.beta2, args.epsilon)

    # minimize() below is called without a global_step, so the step counter
    # is advanced through an explicit update op instead.
    inc_global_step = tf.assign_add(tf.train.get_or_create_global_step(), 1)
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, inc_global_step)

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_tensor = optimizer.minimize(total_loss)

        # Set up train op to return loss
        with tf.control_dependencies([train_tensor]):
            train_op = tf.identity(total_loss, name='train_op')

    # set up logging

    # Gather initial summaries.
    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

    # Add summaries for losses.
    for loss in tf.get_collection(tf.GraphKeys.LOSSES):
        summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

    # Add summaries for variables.
    for variable in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES):
        summaries.add(tf.summary.histogram(variable.op.name, variable))

    summaries.add(tf.summary.image('source', im_batch_s))
    summaries.add(tf.summary.image('target', im_batch_t))
    summaries.add(tf.summary.image('source_transformed', transformed_s))
    summaries.add(tf.summary.image('target_transformed', transformed_t))
    summaries.add(tf.summary.image('source_cycled', cycled_s))
    summaries.add(tf.summary.image('target_cycled', cycled_t))

    # Merge all summaries together.
    summary_op = tf.summary.merge(list(summaries), name='summary_op')

    # create train loop

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    # Only variables under the 'model' scope are checkpointed.
    saver = tf.train.Saver(var_list=tf.get_collection(
        tf.GraphKeys.GLOBAL_VARIABLES, scope='model'))
    checkpoint_path = os.path.join(args.output_dir, 'model.ckpt')
    writer = tf.summary.FileWriter(args.output_dir)

    # Close the event-file writer on every exit path; it was previously
    # left open.
    try:
        with tf.Session() as sess:
            # Tensorflow initializations
            sess.run(tf.get_collection(tf.GraphKeys.TABLE_INITIALIZERS))
            tf.train.start_queue_runners(sess=sess)
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())

            # Starting both timestamps at 0 makes the first iteration log
            # and save.
            last_log_time = 0
            last_save_time = 0
            for i in tqdm(range(args.num_batches)):
                if last_log_time < time.time() - args.log_every_n_seconds:
                    last_log_time = time.time()
                    summary, loss_val, global_step = sess.run(
                        [summary_op, train_op,
                         tf.train.get_global_step()])
                    writer.add_summary(summary, global_step)
                    writer.flush()
                else:
                    loss_val, global_step = sess.run(
                        [train_op, tf.train.get_global_step()])

                if last_save_time < time.time() - args.save_every_n_seconds:
                    last_save_time = time.time()
                    saver.save(sess, checkpoint_path, global_step=global_step)

            saver.save(sess, checkpoint_path, global_step=args.num_batches)
    finally:
        writer.close()
    def model_fn(inp, tgt, mems, is_training):
        """Build the Transformer graph for one replica.

        Args:
            inp: Input ids; transposed with perm [1, 0] before use.
            tgt: Target ids; transposed with perm [1, 0] before use.
            mems: Memory passed through to ``model.transformer``.
            is_training: When true, gradients for all trainable variables
                are computed and returned as well.

        Returns:
            ``(loss, new_mems, grads_and_vars, neg_log_probs)`` when
            ``is_training`` is true, otherwise
            ``(loss, new_mems, neg_log_probs)``. With ``reduce_loss`` fixed
            to True below, ``neg_log_probs`` is always an empty list.

        Raises:
            ValueError: If ``FLAGS.init`` is neither ``"uniform"`` nor
                ``"normal"``.
        """
        inp = tf.transpose(inp, [1, 0])
        tgt = tf.transpose(tgt, [1, 0])

        if FLAGS.init == "uniform":
            initializer = tf.initializers.random_uniform(
                minval=-FLAGS.init_range, maxval=FLAGS.init_range, seed=None)
            # This branch used to leave `proj_initializer` unbound, so the
            # model.transformer() call below raised UnboundLocalError.
            # NOTE(review): reusing the uniform initializer for projections
            # is an assumption -- confirm the intended projection init.
            proj_initializer = initializer
        elif FLAGS.init == "normal":
            initializer = tf.initializers.random_normal(stddev=FLAGS.init_std,
                                                        seed=None)
            proj_initializer = tf.initializers.random_normal(
                stddev=FLAGS.proj_init_std, seed=None)
        else:
            raise ValueError(
                "Unsupported FLAGS.init: {!r} (expected 'uniform' or "
                "'normal')".format(FLAGS.init))

        # Optionally tie every projection except the first one.
        tie_projs = [False for _ in range(len(cutoffs) + 1)]
        if FLAGS.proj_share_all_but_first:
            for i in range(1, len(tie_projs)):
                tie_projs[i] = True

        reduce_loss = True
        neg_log_probs, new_mems = model.transformer(
            dec_inp=inp,
            target=tgt,
            mems=mems,
            n_token=n_token,
            n_layer=FLAGS.n_layer,
            d_model=FLAGS.d_model,
            d_embed=FLAGS.d_embed,
            n_head=FLAGS.n_head,
            d_head=FLAGS.d_head,
            d_inner=FLAGS.d_inner,
            dropout=FLAGS.dropout,
            dropatt=FLAGS.dropatt,
            initializer=initializer,
            proj_initializer=proj_initializer,
            is_training=is_training,
            mem_len=FLAGS.mem_len,
            cutoffs=cutoffs,
            div_val=FLAGS.div_val,
            tie_projs=tie_projs,
            input_perms=None,
            target_perms=None,
            head_target=None,
            same_length=FLAGS.same_length,
            clamp_len=FLAGS.clamp_len,
            use_tpu=False,
            untie_r=FLAGS.untie_r,
            proj_same_dim=FLAGS.proj_same_dim,
            reduce_loss=reduce_loss)
        if reduce_loss:
            # The first return value is then used directly as the loss.
            loss = neg_log_probs
            neg_log_probs = []
        else:
            loss = tf.reduce_mean(neg_log_probs)
        # number of parameters
        num_params = sum(np.prod(v.shape) for v in tf.trainable_variables())
        tf.logging.info('#params: {}'.format(num_params))

        print("neg log loss", neg_log_probs)
        if not is_training:
            return loss, new_mems, neg_log_probs

        all_vars = tf.trainable_variables()
        grads = tf.gradients(loss, all_vars)
        grads_and_vars = list(zip(grads, all_vars))

        return loss, new_mems, grads_and_vars, neg_log_probs
        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Return the learning rate for optimizer step `step`.

        Computes ``rsqrt(d_model) * min(rsqrt(step), step * warmup_steps**-1.5)``.

        Args:
            step: Current step, as a number or tensor of any numeric dtype.
        """
        # rsqrt is only defined for floating-point tensors, while an
        # optimizer may pass its integer iteration counter; the cast is a
        # no-op when `step` is already float32.
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps**-1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)


# create_data() returns the dataset plus a tuple of extra values; the first
# entry of that tuple is passed to the model as the vocabulary size.
dataset, other_tuple = create_data()
model = transformer(vocab_size=other_tuple[0],
                    num_layers=NUM_LAYERS,
                    units=UNITS,
                    model=MODEL,
                    num_heads=NUM_HEADS,
                    dropout=DROPOUT)

# Adam driven by the custom schedule defined above.
# NOTE(review): d_model is hard-coded to 128 here -- confirm it matches the
# model width passed as MODEL.
learning_rate = CustomSchedule(d_model=128)
opt = tf.keras.optimizers.Adam(learning_rate,
                               beta_1=0.9,
                               beta_2=0.98,
                               epsilon=1e-9)


def accuracy(y_true, y_pred):
    """Sparse categorical accuracy of ``y_pred`` against ``y_true``.

    ``y_true`` is first reshaped to ``(-1, MAX_LENGTH - 1)`` so that it lines
    up with the predictions.
    """
    target_shape = (-1, MAX_LENGTH - 1)
    labels = tf.reshape(y_true, shape=target_shape)
    return tf.keras.metrics.sparse_categorical_accuracy(labels, y_pred)
Example #20
0
def train(args):
    """Build the Transformer program and train it locally or distributed.

    Environment variables take priority over ``args``:

    * ``PADDLE_IS_LOCAL`` -- ``"0"`` forces ``args.local = False``.
    * ``TRAINING_ROLE`` -- ``"TRAINER"`` (default) or ``"PSERVER"``.
    * ``CPU_NUM`` -- device count when running on CPU.
    * ``PADDLE_PORT`` (default ``"6174"``), ``PADDLE_PSERVERS``, ``POD_IP``,
      ``PADDLE_TRAINERS_NUM`` and ``PADDLE_TRAINER_ID`` -- read only for
      distributed (non-local) runs.

    Side effects: may set ``TrainTaskConfig.use_gpu`` to ``False``, may
    overwrite ``args.local``, and in the distributed trainer role writes the
    transpiled program to ``trainer_prog.desc``.

    Args:
        args: Parsed options; ``local``, ``device``, ``enable_ce`` and
            ``sync`` are read here.

    Raises:
        SystemExit: With status 1 when a distributed run is requested but
            ``PADDLE_PSERVERS``, ``POD_IP`` or ``PADDLE_TRAINER_ID`` is unset.
    """
    # priority: ENV > args > config
    is_local = os.getenv("PADDLE_IS_LOCAL", "1")
    if is_local == '0':
        args.local = False
    logging.info("args:{}".format(args))

    if args.device == 'CPU':
        TrainTaskConfig.use_gpu = False

    training_role = os.getenv("TRAINING_ROLE", "TRAINER")

    # Parameter servers always run on CPU, whatever the trainer device is.
    if training_role == "PSERVER" or (not TrainTaskConfig.use_gpu):
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    else:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()

    exe = fluid.Executor(place)

    if args.enable_ce:
        fluid.default_startup_program().random_seed = 1000

    sum_cost, avg_cost, predict, token_num = transformer(
        ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
        ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
        ModelHyperParams.n_head, ModelHyperParams.d_key,
        ModelHyperParams.d_value, ModelHyperParams.d_model,
        ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
        ModelHyperParams.weight_sharing, TrainTaskConfig.label_smooth_eps)
    lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model,
                                         TrainTaskConfig.warmup_steps,
                                         TrainTaskConfig.learning_rate)

    if args.local:
        optimizer = fluid.optimizer.Adam(
            learning_rate=lr_scheduler.learning_rate,
            beta1=TrainTaskConfig.beta1,
            beta2=TrainTaskConfig.beta2,
            epsilon=TrainTaskConfig.eps)
    elif args.sync == False:
        # Kept as an explicit comparison: a ``None`` value does not match and
        # therefore still selects the synchronous branch below.
        optimizer = fluid.optimizer.SGD(0.003)
    else:
        lr_decay = fluid.layers\
         .learning_rate_scheduler\
         .noam_decay(ModelHyperParams.d_model,
            TrainTaskConfig.warmup_steps)

        optimizer = fluid.optimizer.Adam(learning_rate=lr_decay,
                                         beta1=TrainTaskConfig.beta1,
                                         beta2=TrainTaskConfig.beta2,
                                         epsilon=TrainTaskConfig.eps)
    optimizer.minimize(sum_cost)

    if args.local:
        logging.info("local start_up:")
        train_loop(exe, fluid.default_main_program(), dev_count, sum_cost,
                   avg_cost, lr_scheduler, token_num, predict)
        return

    port = os.getenv("PADDLE_PORT", "6174")
    pserver_ips = os.getenv("PADDLE_PSERVERS")  # ip,ip...
    pod_ip = os.getenv("POD_IP")
    trainer_id_env = os.getenv("PADDLE_TRAINER_ID")

    # Validate before use: concatenating or splitting a missing (None) value
    # would otherwise abort with an unrelated TypeError/AttributeError.
    required = (("PADDLE_PSERVERS", pserver_ips), ("POD_IP", pod_ip),
                ("PADDLE_TRAINER_ID", trainer_id_env))
    missing = [name for name, value in required if not value]
    if missing:
        logging.critical("need env %s", ", ".join(missing))
        raise SystemExit(1)

    pserver_endpoints = ",".join(
        ':'.join([ip, port]) for ip in pserver_ips.split(","))  # ip:port,...
    trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
    # Uses the same (possibly defaulted) port as the endpoint list above.
    current_endpoint = pod_ip + ":" + port
    trainer_id = int(trainer_id_env)
    t = fluid.DistributeTranspiler()
    t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)

    if training_role == "PSERVER":
        pserver_prog = t.get_pserver_program(current_endpoint)
        pserver_startup = t.get_startup_program(current_endpoint,
                                                pserver_prog)

        logging.info("psserver begin run")
        exe.run(pserver_startup)
        exe.run(pserver_prog)
    elif training_role == "TRAINER":
        trainer_prog = t.get_trainer_program()
        with open('trainer_prog.desc', 'w') as f:
            f.write(str(trainer_prog))
        train_loop(exe, trainer_prog, dev_count, sum_cost, avg_cost,
                   lr_scheduler, token_num, predict)
    else:
        logging.info(
            "environment var TRAINING_ROLE should be TRAINER or PSERVER")
def main(args):
    """Restore the latest checkpoint and display translated samples.

    Builds the A->B and B->A transformer graphs together with both cycle
    passes, restores the variables under the ``model`` scope from
    ``args.checkpoint_dir`` and, for each of ``args.num_batches`` batches,
    shows a 2x3 matplotlib figure: the top row holds the input image, its
    predicted segmentation and the cycled image; the bottom row holds the
    input segmentation, its generated image and the cycled segmentation.
    """
    # get datasets
    eval_data = data.get_dataset(
        args.dataset,
        args.split,
        image_size=args.image_size,
        data_dir=args.data_dir,
        is_training=True,
    )
    num_classes = eval_data.num_classes

    im_x = preprocess(
        eval_data.x,
        args.preprocessing_a,
        image_size=args.image_size,
        output_channels=args.num_channels,
    )
    im_y = preprocess(eval_data.y,
                      args.preprocessing_b,
                      image_size=args.image_size)

    # A leading batch axis of size one replaces tf.train.batch.
    im_x = tf.expand_dims(im_x, 0)
    im_y = tf.expand_dims(im_y, 0)

    # build models: forward passes first, then the cycle passes that reuse
    # the variables created by the forward passes.
    x_to_b = model.transformer(
        im_x, output_channels=num_classes, output_fn=None,
        scope='model/AtoB')
    y_to_a = model.transformer(
        im_y, output_channels=args.num_channels, scope='model/BtoA')

    x_cycle = model.transformer(
        x_to_b, output_channels=args.num_channels, scope='model/BtoA',
        reuse=True)
    y_cycle = model.transformer(
        y_to_a, output_channels=num_classes, output_fn=None,
        scope='model/AtoB', reuse=True)

    # Correct colors for outputting
    color_map = np.array(
        [label.color for label in labels[:num_classes]]).astype(np.float32)

    image_x = (im_x + 1.0) / 2.0
    image_y_to_a = (y_to_a + 1.0) / 2.0
    image_x_cycle = (x_cycle + 1.0) / 2.0

    def as_rgb(logits):
        # Arg-max over the last axis, then colourise the class ids.
        return postprocess(tf.argmax(logits, -1), 'segmentation_to_rgb',
                           num_classes, color_map)

    seg_y = as_rgb(im_y)
    seg_x_to_b = as_rgb(x_to_b)
    seg_y_cycle = as_rgb(y_cycle)

    model_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                   scope='model')
    saver = tf.train.Saver(var_list=model_vars)

    fetches = [
        image_x, seg_y, seg_x_to_b, image_y_to_a, image_x_cycle, seg_y_cycle
    ]

    with tf.Session() as sess:
        # Tensorflow initializations
        sess.run(tf.get_collection(tf.GraphKeys.TABLE_INITIALIZERS))
        tf.train.start_queue_runners(sess=sess)
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())

        saver.restore(sess, tf.train.latest_checkpoint(args.checkpoint_dir))

        for _ in tqdm(range(args.num_batches)):
            x, y, x_t, y_t, x_c, y_c = sess.run(fetches)

            # Panels 231..236 fill the 2x3 grid row by row.
            panels = (x, x_t, x_c, y, y_t, y_c)
            for position, batch in enumerate(panels, start=231):
                plt.subplot(position)
                plt.imshow(batch[0])
            plt.show()
Example #22
0
# Command-line hyper-parameters; they are handed unchanged to get_dataset(),
# transformer() and evaluate().
parser = argparse.ArgumentParser()
parser.add_argument('--max_samples',
                    default=25000,
                    type=int,
                    help='maximum number of conversation pairs to use')
parser.add_argument('--max_length',
                    default=40,
                    type=int,
                    help='maximum sentence length')
parser.add_argument('--batch_size', default=64, type=int)
parser.add_argument('--num_layers', default=2, type=int)
parser.add_argument('--num_units', default=512, type=int)
parser.add_argument('--d_model', default=256, type=int)
parser.add_argument('--num_heads', default=8, type=int)
parser.add_argument('--dropout', default=0.1, type=float)
parser.add_argument('--activation', default='relu', type=str)
parser.add_argument('--epochs', default=20, type=int)

hparams = parser.parse_args()

dataset, tokenizer = get_dataset(hparams)

model = transformer(hparams)

# Restore the trained weights from the checkpoint.
model.load_weights('Test/cp.ckpt')

# Forward slash, matching the checkpoint path above: the previous
# "Test\model" held an invalid "\m" escape sequence and, outside Windows,
# named a single file called "Test\model" instead of a path under Test/.
model.save("Test/model")

evaluate(hparams, model, tokenizer)
Example #23
0
    ##########################################################################################
    # experiment for n times
    for exp_times in range(CONFIG['exp_times']):

        # One output directory (and TensorBoard writer) per repetition.
        SAVE_PATH = PJ('.', 'runs_test', DATASET, EXP_NAME, str(exp_times))
        writer = SummaryWriter(PJ(SAVE_PATH))

        # set experiment type: classifier / transformer
        if CONFIG['type'] == "classifier":
            model = classifier(backbone=CONFIG['model'],
                               k=CONFIG['k'], d=CONFIG['d'][CONFIG['concepts']][DATASET],
                               pretrained=CONFIG['pretrained'], freeze=CONFIG['freeze'])

        elif CONFIG['type'] == "transformer":
            model = transformer(backbone=CONFIG['model'], linear=CONFIG['linear'],
                                k=CONFIG['k'], d=CONFIG['d'][CONFIG['concepts']][DATASET],
                                pretrained=CONFIG['pretrained'], freeze=CONFIG['freeze'])
        else:
            assert False, "Must Assign the model type: classifier or transformer"

        # load model weight
        if CONFIG['load_model']:
            print("Loading pretrained model")
            state = torch.load(PJ(SAVE_PATH, 'best_result.pkl'))

            # load model epoch
            CONFIG['start_epoch'] = state['epoch']
            # Format the message explicitly: passing a (template, value)
            # tuple would print the raw tuple with the "{}" left unfilled.
            assert CONFIG['end_epoch'] > CONFIG['start_epoch'], \
                "The start epoch is {}, and the end epoch is smaller than start epoch.".format(
                    state['epoch'])

            # load model parameter
Example #24
0
    def model_fn(features, labels, mode, params):
        """Estimator ``model_fn`` building the transformer train/eval spec.

        Args:
            features: Dict of input tensors.  ``"inputs"`` and ``"labels"``
                are always read; when bin sizes are configured for the current
                mode, ``"inp_mask"``, ``"tgt_mask"``, ``"head_labels"`` and
                the per-bin ``"inp_perm_<b>"`` / ``"tgt_perm_<b>"`` entries
                are read as well.
            labels: Unused; the targets come from ``features["labels"]``.
            mode: A ``tf.estimator.ModeKeys`` value.  ``EVAL`` returns an
                evaluation spec; every other mode builds the training spec.
            params: Dict providing ``"batch_size"`` and ``"cache"`` (the
                memory passed to the model as ``mems``).

        Returns:
            A ``TPUEstimatorSpec`` whose ``cache`` attribute holds the new
            memory returned by the model.

        Raises:
            ValueError: If ``FLAGS.init`` is neither ``"uniform"`` nor
                ``"normal"``.
        """
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        batch_size = params["batch_size"]

        mems = params["cache"]
        inp = tf.transpose(features["inputs"], [1, 0])
        tgt = tf.transpose(features["labels"], [1, 0])

        bin_sizes = train_bin_sizes if is_training else eval_bin_sizes
        if bin_sizes:
            inp_perms = [tf.transpose(features["inp_mask"], [1, 0])]
            tgt_perms = [tf.transpose(features["tgt_mask"], [1, 0])]

            head_tgt = tf.transpose(features["head_labels"], [1, 0])

            for b in range(len(bin_sizes)):
                inp_perm = tf.transpose(features["inp_perm_{}".format(b)],
                                        [1, 0, 2])
                tgt_perm = tf.transpose(features["tgt_perm_{}".format(b)],
                                        [1, 0, 2])

                inp_perms.append(inp_perm)
                tgt_perms.append(tgt_perm)
        else:
            inp_perms, tgt_perms, head_tgt = None, None, None

        if FLAGS.init == "uniform":
            initializer = tf.initializers.random_uniform(
                minval=-FLAGS.init_range, maxval=FLAGS.init_range, seed=None)
        elif FLAGS.init == "normal":
            initializer = tf.initializers.random_normal(stddev=FLAGS.init_std,
                                                        seed=None)
            # NOTE(review): proj_initializer is built but never handed to
            # model.transformer below -- confirm whether it should be.
            proj_initializer = tf.initializers.random_normal(
                stddev=FLAGS.proj_init_std, seed=None)
        else:
            # Fail here with a clear message instead of a NameError on
            # ``initializer`` further down.
            raise ValueError(
                "Unsupported FLAGS.init: {!r}".format(FLAGS.init))

        # Share every projection except the first one when requested.
        tie_projs = [False for _ in range(len(cutoffs) + 1)]
        if FLAGS.proj_share_all_but_first:
            for i in range(1, len(tie_projs)):
                tie_projs[i] = True

        tf.logging.info("Vocab size : {}".format(n_token))
        tf.logging.info("Batch size : {}".format(batch_size))

        loss, new_mems = model.transformer(dec_inp=inp,
                                           target=tgt,
                                           mems=mems,
                                           n_token=n_token,
                                           n_layer=FLAGS.n_layer,
                                           d_model=FLAGS.d_model,
                                           d_embed=FLAGS.d_embed,
                                           n_head=FLAGS.n_head,
                                           d_head=FLAGS.d_head,
                                           d_inner=FLAGS.d_inner,
                                           dropout=FLAGS.dropout,
                                           dropatt=FLAGS.dropatt,
                                           initializer=initializer,
                                           is_training=is_training,
                                           mem_len=FLAGS.mem_len,
                                           cutoffs=cutoffs,
                                           div_val=FLAGS.div_val,
                                           tie_projs=tie_projs,
                                           input_perms=inp_perms,
                                           target_perms=tgt_perms,
                                           head_target=head_tgt,
                                           same_length=FLAGS.same_length,
                                           clamp_len=FLAGS.clamp_len,
                                           use_tpu=FLAGS.use_tpu,
                                           untie_r=FLAGS.untie_r,
                                           proj_same_dim=FLAGS.proj_same_dim)

        total_loss = tf.reduce_mean(loss)

        if mode == tf.estimator.ModeKeys.EVAL:
            if FLAGS.use_tpu:
                # Average the loss over all hosts and cores.
                with tf.colocate_with(total_loss):
                    total_loss = tf.contrib.tpu.cross_replica_sum(total_loss) \
                               / FLAGS.num_hosts / FLAGS.num_core_per_host
            # Repeat the scalar loss once per example before handing it to
            # metric_fn.
            metric_loss = tf.tile(tf.reshape(total_loss, [1, 1]),
                                  [batch_size, 1])
            eval_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metrics=(metric_fn, [metric_loss]))

            eval_spec.cache = new_mems

            return eval_spec

        # Configuring the optimization step.
        global_step = tf.train.get_global_step()

        # increase the learning rate linearly
        if FLAGS.warmup_steps > 0:
            warmup_lr = tf.to_float(global_step) / tf.to_float(FLAGS.warmup_steps) \
                        * FLAGS.learning_rate
        else:
            warmup_lr = 0.0

        # number of parameters
        num_params = np.sum(
            [np.prod(v.shape) for v in tf.trainable_variables()])
        tf.logging.info("#params: {}".format(num_params))

        # decay the learning rate using the cosine schedule
        decay_lr = tf.train.cosine_decay(
            FLAGS.learning_rate,
            global_step=global_step - FLAGS.warmup_steps,
            decay_steps=FLAGS.train_steps - FLAGS.warmup_steps,
            alpha=FLAGS.min_lr_ratio)

        learning_rate = tf.where(global_step < FLAGS.warmup_steps, warmup_lr,
                                 decay_lr)

        if FLAGS.use_tpu:
            optimizer = tf.contrib.tpu.CrossShardOptimizer(
                tf.train.AdamOptimizer(learning_rate=learning_rate))
        else:
            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

        # Clip by global norm before applying the gradients.
        grads_and_vars = optimizer.compute_gradients(total_loss)
        gradients, variables = zip(*grads_and_vars)
        clipped, _ = tf.clip_by_global_norm(gradients, FLAGS.clip)
        train_op = optimizer.apply_gradients(
            zip(clipped, variables), global_step=tf.train.get_global_step())

        # Constructing TPUEstimatorSpec with cache.
        train_spec = tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                                     loss=total_loss,
                                                     train_op=train_op)

        if FLAGS.mem_len < FLAGS.tgt_len:
            # Truncate every memory tensor individually.  Slicing the outer
            # ``new_mems`` list (as before) replaced each entry with a list
            # of the first ``mem_len`` entries instead.
            new_mems = [mem_t[:FLAGS.mem_len] for mem_t in new_mems]
        train_spec.cache = new_mems

        return train_spec
Example #25
0
def profile(args):
    """Warm up the Transformer training loop, then time/profile it.

    Builds the model and an Adam optimizer, initialises (or restores) the
    parameters, runs three warm-up iterations and finally runs
    ``args.num_iters`` measured iterations.  With a single device the measured
    run is wrapped in the Paddle profiler, which writes to
    ``/tmp/profile_file``; with several devices a ``ParallelExecutor`` is used
    and only the elapsed times are reported.

    Args:
        args: Parsed options; ``device``, the vocabulary/data file settings,
            ``use_token_batch``, ``batch_size``, ``pool_size``,
            ``special_token`` and ``num_iters`` are read here.
    """
    # Call-style print works on both Python 2 and 3, matching the other
    # print calls in this function.
    print(args)

    if args.device == 'CPU':
        TrainTaskConfig.use_gpu = False

    if not TrainTaskConfig.use_gpu:
        place = fluid.CPUPlace()
        dev_count = multiprocessing.cpu_count()
    else:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()

    exe = fluid.Executor(place)

    sum_cost, avg_cost, predict, token_num = transformer(
        ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
        ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
        ModelHyperParams.n_head, ModelHyperParams.d_key,
        ModelHyperParams.d_value, ModelHyperParams.d_model,
        ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
        ModelHyperParams.weight_sharing, TrainTaskConfig.label_smooth_eps)
    lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model,
                                         TrainTaskConfig.warmup_steps,
                                         TrainTaskConfig.learning_rate)

    optimizer = fluid.optimizer.Adam(learning_rate=lr_scheduler.learning_rate,
                                     beta1=TrainTaskConfig.beta1,
                                     beta2=TrainTaskConfig.beta2,
                                     epsilon=TrainTaskConfig.eps)
    optimizer.minimize(sum_cost)

    # Initialize the parameters.
    if TrainTaskConfig.ckpt_path:
        fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path)
        lr_scheduler.current_steps = TrainTaskConfig.start_step
    else:
        exe.run(fluid.framework.default_startup_program())

    # Disable all sorts for they will be done in the 1st batch.
    train_data = reader.DataReader(
        src_vocab_fpath=args.src_vocab_fpath,
        trg_vocab_fpath=args.trg_vocab_fpath,
        fpattern=args.train_file_pattern,
        use_token_batch=args.use_token_batch,
        batch_size=args.batch_size *
        (1 if args.use_token_batch else dev_count),
        pool_size=args.pool_size,
        sort_type='none',
        shuffle=False,
        shuffle_batch=False,
        start_mark=args.special_token[0],
        end_mark=args.special_token[1],
        unk_mark=args.special_token[2],
        # count start and end tokens out
        max_length=ModelHyperParams.max_length - 2,
        clip_last_batch=False)
    train_data = read_multiple(reader=train_data.batch_generator,
                               count=dev_count if args.use_token_batch else 1)

    if dev_count > 1:
        build_strategy = fluid.BuildStrategy()
        build_strategy.gradient_scale_strategy = fluid.BuildStrategy.GradientScaleStrategy.Customized
        train_exe = fluid.ParallelExecutor(
            use_cuda=TrainTaskConfig.use_gpu,
            loss_name=sum_cost.name,
            main_program=fluid.default_main_program(),
            build_strategy=build_strategy)

    print("Warming up ...")
    train_loop(exe if dev_count == 1 else train_exe,
               fluid.default_main_program(), False, 3, train_data, dev_count,
               sum_cost, avg_cost, lr_scheduler, token_num, predict)

    print("\nProfiling ...")
    if dev_count == 1:
        # Only the single-device run is wrapped in the profiler.
        with profiler.profiler('All', 'total', '/tmp/profile_file'):
            total_time, exec_time = train_loop(exe,
                                               fluid.default_main_program(),
                                               True, args.num_iters,
                                               train_data, dev_count, sum_cost,
                                               avg_cost, lr_scheduler,
                                               token_num, predict)
    else:
        total_time, exec_time = train_loop(train_exe,
                                           fluid.default_main_program(), True,
                                           args.num_iters, train_data,
                                           dev_count, sum_cost, avg_cost,
                                           lr_scheduler, token_num, predict)
    print("Elapsed time: total %f s, in executor %f s" %
          (total_time, exec_time))
Example #26
0
def test_context(exe, train_exe, dev_count):
    """Build the validation program and return a function that evaluates it.

    Args:
        exe: Executor used to run the validation start-up program.
        train_exe: Training ``ParallelExecutor`` whose variables the
            validation executor shares.
        dev_count: Number of devices; forwarded to the data generator and to
            the feed-dict preparation.

    Returns:
        A callable ``test(exe=..., pyreader=...)`` that runs one full pass
        over the validation data and returns ``(avg_cost, ppl)``, where
        ``ppl`` is ``exp(min(avg_cost, 100))`` as a one-element array.
    """
    # Context to do validation.
    test_prog = fluid.Program()
    startup_prog = fluid.Program()
    if args.enable_ce:
        test_prog.random_seed = 1000
        startup_prog.random_seed = 1000
    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            sum_cost, avg_cost, predict, token_num, pyreader = transformer(
                ModelHyperParams.src_vocab_size,
                ModelHyperParams.trg_vocab_size,
                ModelHyperParams.max_length + 1,
                ModelHyperParams.n_layer,
                ModelHyperParams.n_head,
                ModelHyperParams.d_key,
                ModelHyperParams.d_value,
                ModelHyperParams.d_model,
                ModelHyperParams.d_inner_hid,
                ModelHyperParams.prepostprocess_dropout,
                ModelHyperParams.attention_dropout,
                ModelHyperParams.relu_dropout,
                ModelHyperParams.preprocess_cmd,
                ModelHyperParams.postprocess_cmd,
                ModelHyperParams.weight_sharing,
                TrainTaskConfig.label_smooth_eps,
                use_py_reader=args.use_py_reader,
                is_test=True)
    test_prog = test_prog.clone(for_test=True)
    test_data = prepare_data_generator(args,
                                       is_test=True,
                                       count=dev_count,
                                       pyreader=pyreader)

    exe.run(startup_prog)
    test_exe = fluid.ParallelExecutor(use_cuda=TrainTaskConfig.use_gpu,
                                      main_program=test_prog,
                                      share_vars_from=train_exe)

    def test(exe=test_exe, pyreader=pyreader):
        """Run one validation pass and return ``(avg_cost, ppl)``."""
        test_total_cost = 0
        test_total_token = 0

        if args.use_py_reader:
            pyreader.start()
            data_generator = None
        else:
            data_generator = test_data()
        while True:
            try:
                feed_dict_list = prepare_feed_dict_list(
                    data_generator, False, dev_count)
                # Honour the ``exe`` argument (it defaults to ``test_exe``)
                # instead of always using the enclosing ``test_exe``.
                outs = exe.run(fetch_list=[sum_cost.name, token_num.name],
                               feed=feed_dict_list)
            except (StopIteration, fluid.core.EOFException):
                # The current pass is over.
                if args.use_py_reader:
                    pyreader.reset()
                break
            sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
            test_total_cost += sum_cost_val.sum()
            test_total_token += token_num_val.sum()
        test_avg_cost = test_total_cost / test_total_token
        # Cap the exponent at 100 before computing the perplexity.
        test_ppl = np.exp([min(test_avg_cost, 100)])
        return test_avg_cost, test_ppl

    return test
Example #27
0
def train(args):
    is_local = os.getenv("PADDLE_IS_LOCAL", "1")
    print is_local
    if is_local == '0':
        args.local = False
    else:
        args.local = True
    print args

    training_role = os.getenv("TRAINING_ROLE", "TRAINER")

    if training_role == "PSERVER":
        place = fluid.CPUPlace()
    else:
        place = fluid.CUDAPlace(
            0) if TrainTaskConfig.use_gpu else fluid.CPUPlace()

    exe = fluid.Executor(place)

    if TrainTaskConfig.use_gpu and training_role != "PSERVER":
        dev_count = fluid.core.get_cuda_device_count()

    sum_cost, avg_cost, predict, token_num = transformer(
        ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
        ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
        ModelHyperParams.n_head, ModelHyperParams.d_key,
        ModelHyperParams.d_value, ModelHyperParams.d_model,
        ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
        TrainTaskConfig.label_smooth_eps)

    lr_decay = fluid.layers\
        .learning_rate_scheduler\
        .noam_decay(ModelHyperParams.d_model,
                    TrainTaskConfig.warmup_steps)

    optimizer = fluid.optimizer.Adam(learning_rate=lr_decay,
                                     beta1=TrainTaskConfig.beta1,
                                     beta2=TrainTaskConfig.beta2,
                                     epsilon=TrainTaskConfig.eps)
    optimizer.minimize(sum_cost)

    def train_loop(exe, train_progm):
        def read_multiple(reader,
                          count=dev_count if args.use_token_batch else 1,
                          clip_last=True):
            """
            Stack data from reader for multi-devices.
            """
            def __impl__():
                res = []
                for item in reader():
                    res.append(item)
                    if len(res) == count:
                        yield res
                        res = []
                if len(res) == count:
                    yield res
                elif not clip_last:
                    data = []
                    for item in res:
                        data += item
                    if len(data) > count:
                        inst_num_per_part = len(data) // count
                        yield [
                            data[inst_num_per_part * i:inst_num_per_part *
                                 (i + 1)] for i in range(count)
                        ]

            return __impl__

        def split_data(data, num_part=dev_count):
            """
            Split data for each device.
            """
            if len(data) == num_part:
                return data
            data = data[0]
            inst_num_per_part = len(data) // num_part
            return [
                data[inst_num_per_part * i:inst_num_per_part * (i + 1)]
                for i in range(num_part)
            ]

        # Initialize the parameters.
        if TrainTaskConfig.ckpt_path:
            fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path)
            lr_scheduler.current_steps = TrainTaskConfig.start_step
        else:
            print "init fluid.framework.default_startup_program"
            exe.run(fluid.framework.default_startup_program())

        train_data = reader.DataReader(
            src_vocab_fpath=args.src_vocab_fpath,
            trg_vocab_fpath=args.trg_vocab_fpath,
            fpattern=args.train_file_pattern,
            use_token_batch=args.use_token_batch,
            batch_size=args.batch_size *
            (1 if args.use_token_batch else dev_count),
            pool_size=args.pool_size,
            sort_type=args.sort_type,
            shuffle=args.shuffle,
            shuffle_batch=args.shuffle_batch,
            start_mark=args.special_token[0],
            end_mark=args.special_token[1],
            unk_mark=args.special_token[2],
            clip_last_batch=False)

        train_data = read_multiple(reader=train_data.batch_generator)
        build_strategy = fluid.BuildStrategy()
        # Since the token number differs among devices, customize gradient scale to
        # use token average cost among multi-devices. and the gradient scale is
        # `1 / token_number` for average cost.
        build_strategy.gradient_scale_strategy = fluid.BuildStrategy.GradientScaleStrategy.Customized
        #'''
        train_exe = fluid.ParallelExecutor(use_cuda=TrainTaskConfig.use_gpu,
                                           loss_name=sum_cost.name,
                                           main_program=train_progm,
                                           build_strategy=build_strategy)

        #'''

        def test_context():
            # Context to do validation.
            test_program = train_progm.clone()
            with fluid.program_guard(test_program):
                test_program = fluid.io.get_inference_program([avg_cost])

            val_data = reader.DataReader(
                src_vocab_fpath=args.src_vocab_fpath,
                trg_vocab_fpath=args.trg_vocab_fpath,
                fpattern=args.val_file_pattern,
                use_token_batch=args.use_token_batch,
                batch_size=args.batch_size *
                (1 if args.use_token_batch else dev_count),
                pool_size=args.pool_size,
                sort_type=args.sort_type,
                start_mark=args.special_token[0],
                end_mark=args.special_token[1],
                unk_mark=args.special_token[2],
                clip_last_batch=False,
                shuffle=False,
                shuffle_batch=False)

            test_exe = fluid.ParallelExecutor(use_cuda=TrainTaskConfig.use_gpu,
                                              main_program=test_program,
                                              share_vars_from=train_exe)

            def test(exe=test_exe):
                test_total_cost = 0
                test_total_token = 0
                test_data = read_multiple(reader=val_data.batch_generator)
                for batch_id, data in enumerate(test_data()):
                    feed_list = []
                    for place_id, data_buffer in enumerate(split_data(data)):
                        data_input_dict, util_input_dict, _ = prepare_batch_input(
                            data_buffer, data_input_names, util_input_names,
                            ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
                            ModelHyperParams.n_head, ModelHyperParams.d_model)
                        feed_list.append(
                            dict(data_input_dict.items() +
                                 util_input_dict.items()))

                    outs = exe.run(feed=feed_list,
                                   fetch_list=[sum_cost.name, token_num.name])
                    sum_cost_val, token_num_val = np.array(outs[0]), np.array(
                        outs[1])
                    test_total_cost += sum_cost_val.sum()
                    test_total_token += token_num_val.sum()
                test_avg_cost = test_total_cost / test_total_token
                test_ppl = np.exp([min(test_avg_cost, 100)])
                return test_avg_cost, test_ppl

            return test

        if args.val_file_pattern is not None:
            test = test_context()

        data_input_names = encoder_data_input_fields + decoder_data_input_fields[:
                                                                                 -1] + label_data_input_fields
        util_input_names = encoder_util_input_fields + decoder_util_input_fields
        init = False
        for pass_id in xrange(TrainTaskConfig.pass_num):
            pass_start_time = time.time()
            for batch_id, data in enumerate(train_data()):
                feed_list = []
                total_num_token = 0
                #lr_rate = lr_scheduler.update_learning_rate()
                for place_id, data_buffer in enumerate(split_data(data)):
                    data_input_dict, util_input_dict, num_token = prepare_batch_input(
                        data_buffer, data_input_names, util_input_names,
                        ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
                        ModelHyperParams.n_head, ModelHyperParams.d_model)
                    total_num_token += num_token
                    feed_list.append(
                        dict(data_input_dict.items() +
                             util_input_dict.items()))

                    if not init:
                        for pos_enc_param_name in pos_enc_param_names:
                            pos_enc = position_encoding_init(
                                ModelHyperParams.max_length + 1,
                                ModelHyperParams.d_model)
                            feed_list[place_id][pos_enc_param_name] = pos_enc
                for feed_dict in feed_list:
                    feed_dict[
                        sum_cost.name +
                        "@GRAD"] = 1. / total_num_token if TrainTaskConfig.use_avg_cost else np.asarray(
                            [1.], dtype="float32")
                outs = train_exe.run(
                    fetch_list=[sum_cost.name, token_num.name], feed=feed_list)
                #outs = exe.run(train_progm,fetch_list=[sum_cost.name, token_num.name],feed=feed_list[0])
                sum_cost_val, token_num_val = np.array(outs[0]), np.array(
                    outs[1])
                total_sum_cost = sum_cost_val.sum(
                )  # sum the cost from multi-devices
                total_token_num = token_num_val.sum()
                total_avg_cost = total_sum_cost / total_token_num
                print(
                    "epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f"
                    % (pass_id, batch_id, total_sum_cost, total_avg_cost,
                       np.exp([min(total_avg_cost, 100)])))
                init = True
            # Validate and save the model for inference.
            print("epoch: %d, " % pass_id +
                  ("val avg loss: %f, val ppl: %f, " %
                   test() if args.val_file_pattern is not None else "") +
                  "consumed %fs" % (time.time() - pass_start_time))
            fluid.io.save_persistables(
                exe,
                os.path.join(TrainTaskConfig.ckpt_dir,
                             "pass_" + str(pass_id) + ".checkpoint"))
            fluid.io.save_inference_model(
                os.path.join(TrainTaskConfig.model_dir,
                             "pass_" + str(pass_id) + ".infer.model"),
                data_input_names[:-2] + util_input_names, [predict], exe)

    if args.local:
        print("local start_up:")
        train_loop(exe, fluid.default_main_program())
    else:
        port = os.getenv("PADDLE_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_PSERVERS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
        trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)

        if training_role == "PSERVER":
            current_endpoint = os.getenv("POD_IP") + ":" + os.getenv(
                "PADDLE_PORT")
            if not current_endpoint:
                print("need env SERVER_ENDPOINT")
                exit(1)
            pserver_prog = t.get_pserver_program(current_endpoint)
            pserver_startup = t.get_startup_program(current_endpoint,
                                                    pserver_prog)

            print "psserver begin run"
            with open('pserver_startup', 'w') as f:
                f.write(str(pserver_startup))
            with open('pserver_prog', 'w') as f:
                f.write(str(pserver_prog))
            exe.run(pserver_startup
                    )  #, save_program_to_file="./pserver_startup.desc")
            exe.run(
                pserver_prog)  #, save_program_to_file="./pserver_loop.desc")
        elif training_role == "TRAINER":

            trainer_prog = t.get_trainer_program()
            with open('trainer_prog', 'w') as f:
                f.write(str(trainer_prog))
            train_loop(exe, trainer_prog)
        else:
            print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
def main():
    """Train the Transformer with ``ParallelExecutor`` and record KPIs.

    Steps, in order:

    1. Build the network via ``transformer(...)`` and attach an Adam
       optimizer that minimizes ``sum_cost``.
    2. Clone the main program and derive a validation program from it.
    3. Restore persistables from ``TrainTaskConfig.ckpt_path`` when it is
       set, otherwise run the startup program.
    4. Train for ``TrainTaskConfig.pass_num`` passes. The position-encoding
       tables are fed only with the very first batch.
    5. After every pass, evaluate on the validation reader. After the last
       pass, store the validation perplexity and the pass duration in the
       KPI trackers chosen by ``args.gpu_card_num``.

    Feed dictionaries are merged with ``dict.update`` and the pass loop uses
    ``range`` so this function works on both Python 2 and Python 3; the
    earlier ``dict(a.items() + b.items())`` / ``xrange`` forms are
    Python 2-only.
    """

    def _merge_feeds(*feeds):
        """Merge mappings left to right; later ones win on duplicate keys."""
        merged = {}
        for feed in feeds:
            merged.update(feed.items())
        return merged

    args = parse_args()
    place = fluid.CUDAPlace(0) if TrainTaskConfig.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    sum_cost, avg_cost, predict, token_num = transformer(
        ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
        ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
        ModelHyperParams.n_head, ModelHyperParams.d_key,
        ModelHyperParams.d_value, ModelHyperParams.d_model,
        ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
        TrainTaskConfig.label_smooth_eps)

    lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model,
                                         TrainTaskConfig.warmup_steps,
                                         TrainTaskConfig.learning_rate)
    optimizer = fluid.optimizer.Adam(learning_rate=lr_scheduler.learning_rate,
                                     beta1=TrainTaskConfig.beta1,
                                     beta2=TrainTaskConfig.beta2,
                                     epsilon=TrainTaskConfig.eps)
    optimizer.minimize(sum_cost)

    dev_count = fluid.core.get_cuda_device_count()

    train_data = paddle.batch(paddle.dataset.wmt16.train(
        ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size),
                              batch_size=TrainTaskConfig.batch_size)

    # Program to do validation.
    test_program = fluid.default_main_program().clone()
    with fluid.program_guard(test_program):
        test_program = fluid.io.get_inference_program([avg_cost])
    val_data = paddle.batch(paddle.dataset.wmt16.validation(
        ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size),
                            batch_size=TrainTaskConfig.batch_size)

    def test(exe):
        """Evaluate on the validation reader using the executor ``exe``.

        Returns:
            ``(avg_cost, ppl)``: total cost divided by total token count over
            all validation batches, and ``np.exp`` of that average with the
            exponent capped at 100 (a one-element array).
        """
        test_total_cost = 0
        test_total_token = 0
        test_data = read_multiple(reader=val_data, count=dev_count)
        for data in test_data():
            feed_list = []
            for data_buffer in data:
                data_input_dict, util_input_dict, _ = prepare_batch_input(
                    data_buffer, data_input_names, util_input_names,
                    ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
                    ModelHyperParams.n_head, ModelHyperParams.d_model)
                feed_list.append(
                    _merge_feeds(data_input_dict, util_input_dict))

            outs = exe.run(feed=feed_list,
                           fetch_list=[sum_cost.name, token_num.name])
            sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
            test_total_cost += sum_cost_val.sum()
            test_total_token += token_num_val.sum()
        test_avg_cost = test_total_cost / test_total_token
        test_ppl = np.exp([min(test_avg_cost, 100)])
        return test_avg_cost, test_ppl

    # Initialize the parameters.
    if TrainTaskConfig.ckpt_path:
        fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path)
        lr_scheduler.current_steps = TrainTaskConfig.start_step
    else:
        exe.run(fluid.framework.default_startup_program())

    # Input names: encoder inputs, decoder inputs minus the last field, and
    # the label inputs. `test` above reads these through its closure, so
    # they only need to exist before `test` is first called.
    data_input_names = (encoder_data_input_fields +
                        decoder_data_input_fields[:-1] +
                        label_data_input_fields)
    util_input_names = encoder_util_input_fields + decoder_util_input_fields

    train_exe = fluid.ParallelExecutor(use_cuda=TrainTaskConfig.use_gpu,
                                       loss_name=sum_cost.name)

    test_exe = fluid.ParallelExecutor(use_cuda=TrainTaskConfig.use_gpu,
                                      main_program=test_program,
                                      share_vars_from=train_exe)

    # Becomes True once the first batch (carrying the position encodings)
    # has been run.
    init = False
    train_data = read_multiple(reader=train_data, count=dev_count)

    for pass_id in range(TrainTaskConfig.pass_num):
        pass_start_time = time.time()
        for batch_id, data in enumerate(train_data()):
            feed_list = []
            total_num_token = 0
            # One scheduler step per batch; the same rate goes to every feed.
            lr_rate = lr_scheduler.update_learning_rate()
            for data_buffer in data:
                data_input_dict, util_input_dict, num_token = prepare_batch_input(
                    data_buffer, data_input_names, util_input_names,
                    ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
                    ModelHyperParams.n_head, ModelHyperParams.d_model)
                total_num_token += num_token
                feed = _merge_feeds(
                    data_input_dict, util_input_dict,
                    {lr_scheduler.learning_rate.name: lr_rate})
                feed_list.append(feed)

                if not init:
                    # Only the first batch carries the position-encoding
                    # tables, one per entry of `pos_enc_param_names`.
                    for pos_enc_param_name in pos_enc_param_names:
                        tensor = position_encoding_init(
                            ModelHyperParams.max_length + 1,
                            ModelHyperParams.d_model)
                        feed[pos_enc_param_name] = tensor

            # Feed the `<sum_cost>@GRAD` variable: 1 / (tokens in this batch
            # over all feeds) when use_avg_cost is set, otherwise 1.
            # NOTE(review): presumably this seeds the loss gradient so the
            # update is averaged per token — confirm against the framework.
            grad_name = sum_cost.name + "@GRAD"
            for feed_dict in feed_list:
                if TrainTaskConfig.use_avg_cost:
                    feed_dict[grad_name] = 1. / total_num_token
                else:
                    feed_dict[grad_name] = np.asarray([1.], dtype="float32")

            outs = train_exe.run(fetch_list=[sum_cost.name, token_num.name],
                                 feed=feed_list)
            sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
            total_sum_cost = sum_cost_val.sum()  # sum over all devices
            total_token_num = token_num_val.sum()
            total_avg_cost = total_sum_cost / total_token_num
            print("epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f" %
                  (pass_id, batch_id, total_sum_cost, total_avg_cost,
                   np.exp([min(total_avg_cost, 100)])))
            init = True
        pass_end_time = time.time()

        # Validate after the pass; validation time is excluded from
        # `time_consumed` because the end timestamp is taken first.
        val_avg_cost, val_ppl = test(test_exe)
        time_consumed = pass_end_time - pass_start_time
        print("pass_id = " + str(pass_id) + " time_consumed = " +
              str(time_consumed))

        # KPIs are recorded only for the final pass.
        if pass_id == TrainTaskConfig.pass_num - 1:
            if args.gpu_card_num == 1:
                test_avg_ppl_kpi.add_record(np.array(val_ppl, dtype='float32'))
                train_pass_duration_kpi.add_record(time_consumed)
                test_avg_ppl_kpi.persist()
                train_pass_duration_kpi.persist()
            else:
                test_avg_ppl_kpi_card4.add_record(
                    np.array(val_ppl, dtype='float32'))
                train_pass_duration_kpi_card4.add_record(time_consumed)
                test_avg_ppl_kpi_card4.persist()
                train_pass_duration_kpi_card4.persist()
Example #29
0
# Script-level setup for training: datasets, loaders, loss, model, optimizer.
# `train_path`, `test_path`, the transforms and the model hyper-parameters
# (`way`, `img_size`, `N`, `d_model`, `d_k`, `h`, `drop_rate`,
# `learning_rate`) are expected to be defined before this point.

# Image-folder datasets rooted at the train and test directories.
# NOTE(review): `dset` is presumably `torchvision.datasets` — confirm import.
train_dataset = dset.ImageFolder(root=train_path)
test_dataset = dset.ImageFolder(root=test_path)

# Wrap the raw folders in the project's `Omniglot` dataset, passing the
# per-split transform and `way`.
dataSet = Omniglot(train_dataset, transform=data_transforms, way=way)
testSet = Omniglot(test_dataset, transform=test_transforms, way=way)

# Neither loader shuffles; test batches hold 32 items, train batches 128.
testLoader = DataLoader(testSet, batch_size=32, shuffle=False, num_workers=16)
dataLoader = DataLoader(dataSet, batch_size=128,\
                        shuffle=False, num_workers=16)

# Cross-entropy with `size_average=False`.
# NOTE(review): in PyTorch this makes the loss a sum over the batch rather
# than a mean; newer releases deprecate the flag in favour of
# `reduction='sum'` — confirm against the installed version.
loss_fn = torch.nn.CrossEntropyLoss(size_average=False)
# loss_fn = nn.DataParallel(loss_fn)
loss_fn.cuda()

# Build the model from the hyper-parameters above.
net = transformer(way, img_size, N, d_model, d_k, h, drop_rate)

# net = nn.DataParallel(net)
# NOTE(review): assumes `transformer` returns an `nn.Module`, so `.cuda()`
# moves it to the GPU and `.train()` selects training mode — confirm.
net.cuda()
net.train()

# Starts empty; how it is filled is not visible in this part of the file.
train_loss = []
# Adam over all model parameters with betas (0.9, 0.98) and eps 1e-9.
optimizer = torch.optim.Adam(net.parameters(),
                             lr=learning_rate,
                             betas=(0.9, 0.98),
                             eps=1e-9)
# optimizer = ScheduledOptim(optimizer, d_model, warmup_steps)
# Clear any gradients before the first training step.
optimizer.zero_grad()


def right_error(output, truth):