Code example #1
File: main.py Project: xueeinstein/PaddleHelix
def infer(args):
    log.info("loading data")
    raw_dataset = GraphPropPredDataset(name=args.dataset_name)
    args.num_class = raw_dataset.num_tasks
    args.eval_metric = raw_dataset.eval_metric
    args.task_type = raw_dataset.task_type

    test_ds = MolDataset(args, raw_dataset, mode="test")

    fn = MgfCollateFn(args, mode="test")

    test_loader = Dataloader(test_ds,
                             batch_size=args.batch_size,
                             num_workers=1,
                             collate_fn=fn)
    test_loader = PDataset.from_generator_func(test_loader)

    est = propeller.Learner(MgfModel, args, args.model_config)

    mgf_list = []
    for soft_mgf in est.predict(test_loader,
                                ckpt_path=args.model_path_for_infer,
                                split_batch=True):
        mgf_list.append(soft_mgf)

    mgf = np.concatenate(mgf_list)
    log.info("saving features")
    np.save(
        "dataset/%s/soft_mgf_feat.npy" % (args.dataset_name.replace("-", "_")),
        mgf)
Code example #2
File: test.py Project: Meiyim/paddle-estimator
    def __init__(self, config, mode, run_config):
        for k, v in config.items():
            log.info("%s: %s" % (k, repr(v)))
        self.hidden_size = config['hidden_size']
        self.vocab_size = config['vocab_size']
        self.embedding_size = config['embedding_size']
        self.num_layers = config['num_layers']

        self.learning_rate = config['learning_rate']
        self.mode = mode
Code example #3
File: client.py Project: ljw23/ERNIE-1
 def __init__(self, address, batch_size=128, num_coroutine=10, timeout=10.):
     self.loop = asyncio.new_event_loop()
     asyncio.set_event_loop(self.loop)
     context = zmq.asyncio.Context()
     self.socket_pool = [
         context.socket(zmq.REQ) for _ in range(num_coroutine)
     ]
     log.info("Connecting to server... %s" % address)
     for socket in self.socket_pool:
         socket.connect(address)
     self.num_coroutine = num_coroutine
     self.batch_size = batch_size
     self.timeout = int(timeout * 1000)
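
The socket pool above exists so that up to num_coroutine requests can be in flight at once: work is split into one lane per REQ socket, and each lane is driven by its own coroutine. Below is a minimal sketch of that fan-out pattern (hypothetical helper, not from the repo), with the actual ZMQ send/recv round trip replaced by a stub:

import asyncio

async def fan_out(batches, num_coroutine=3):
    # One lane per socket; each lane is consumed by its own coroutine,
    # mirroring the socket_pool/num_coroutine pairing in the client above.
    async def worker(lane, my_batches):
        results = []
        for b in my_batches:
            await asyncio.sleep(0)  # stand-in for socket.send()/socket.recv()
            results.append((lane, b))
        return results

    lanes = [batches[i::num_coroutine] for i in range(num_coroutine)]
    done = await asyncio.gather(*(worker(i, l) for i, l in enumerate(lanes)))
    return [r for lane in done for r in lane]

print(asyncio.run(fan_out(list(range(7)))))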
Code example #4
File: optimization.py Project: Yelrose/PGL
def layer_decay(param, param_last, learning_rate, decay_rate, n_layers):
    #encoder params
    delta = param - param_last
    encoder_layer_m = re.search("encoder_layer_([0-9]*)_", param.name)
    if encoder_layer_m is not None:
        layer = int(encoder_layer_m.group(1))
        ratio = decay_rate**(n_layers + 1 - layer)
        log.info('layer decay %s: ratio %s.' % (param.name, ratio))
        param_update = param + (ratio - 1) * delta
    elif "embedding" in param.name:
        ratio = decay_rate**(n_layers + 2)
        param_update = param + (ratio - 1) * delta
    else:
        param_update = None
    return param_update
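
layer_decay above implements layer-wise learning-rate decay: after a vanilla update, each parameter is pulled back toward its previous value by a per-layer ratio, so earlier layers effectively train with smaller learning rates. A minimal sketch of the ratios it produces (decay_rate and n_layers are hypothetical values, not taken from any repo config):

decay_rate, n_layers = 0.8, 12

# ratio applied to each encoder layer's update; closer to 1.0 for higher layers
for layer in range(1, n_layers + 1):
    ratio = decay_rate ** (n_layers + 1 - layer)
    print("encoder_layer_%d: ratio = %.4f" % (layer, ratio))

# embeddings sit below all encoder layers and get the smallest ratio
print("embedding: ratio = %.4f" % (decay_rate ** (n_layers + 2)))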
Code example #5
File: dataset.py Project: xueeinstein/PaddleHelix
    def __init__(self, config, raw_dataset, mode='train'):
        self.config = config
        self.raw_dataset = raw_dataset
        self.mode = mode

        log.info("preprocess graph data in %s" % self.__class__.__name__)
        self.graph_list = []

        log.info("loading mgf feature")
        mgf_feature = np.load(self.config.mgf_file)
        log.info(["the shape of mgf feature is: ", mgf_feature.shape])

        for i in range(len(self.raw_dataset)):
            # num_nodes, edge_index, node_feat, edge_feat, label
            graph, label = self.raw_dataset[i]
            num_nodes = graph['num_nodes']
            node_feat = graph['node_feat'].copy()
            edges = list(zip(graph["edge_index"][0], graph["edge_index"][1]))
            edge_feat = graph['edge_feat'].copy()

            new_graph = {}
            new_graph['num_nodes'] = num_nodes
            new_graph['node_feat'] = node_feat
            new_graph['edges'] = edges
            new_graph['edge_feat'] = edge_feat
            new_graph['mgf'] = mgf_feature[i, :].reshape(-1, )

            self.graph_list.append(new_graph)
Code example #6
File: link_predict.py Project: Yelrose/PGL
    def __init__(self, graph_work_path):
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        trainer_count = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
        log.info("trainer_id: %s, trainer_count: %s." %
                 (trainer_id, trainer_count))

        edges = np.load(os.path.join(graph_work_path, "train_data.npy"),
                        allow_pickle=True)
        # edges is bidirectional.
        train_usr = edges[trainer_id::trainer_count, 0]
        train_ad = edges[trainer_id::trainer_count, 1]
        returns = {"train_data": [train_usr, train_ad]}

        if os.path.exists(os.path.join(graph_work_path, "neg_samples.npy")):
            neg_samples = np.load(os.path.join(graph_work_path,
                                               "neg_samples.npy"),
                                  allow_pickle=True)
            if neg_samples.size != 0:
                train_negs = neg_samples[trainer_id::trainer_count]
                returns["train_data"].append(train_negs)
        log.info("Load train_data done.")
        self.data = returns
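
The snippet above shards the edge array across trainers with a strided slice, edges[trainer_id::trainer_count], which hands each worker a disjoint, roughly equal subset with no extra bookkeeping. A small numpy sketch of that partitioning on toy data (the (user, ad) column layout follows the snippet):

import numpy as np

# toy bidirectional edge list: column 0 = user, column 1 = ad
edges = np.array([[0, 10], [1, 11], [2, 12], [3, 13], [4, 14], [5, 15]])

trainer_count = 2
for trainer_id in range(trainer_count):
    train_usr = edges[trainer_id::trainer_count, 0]
    train_ad = edges[trainer_id::trainer_count, 1]
    print("trainer %d: usr=%s ad=%s" % (trainer_id, train_usr, train_ad))
# trainer 0 gets rows 0, 2, 4; trainer 1 gets rows 1, 3, 5 -- disjoint and balanced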
Code example #7
def seq2seq(model, tokenizer, args):
    log.info('Training starts with args: %r' % args)
    attn_id = tokenizer.vocab[args.attn_token]

    def gen_mask(batch_ids, mask_type='bidi', query_len=None, pad_value=0):
        if query_len is None:
            query_len = batch_ids.shape[1]
        if mask_type != 'empty':
            mask = (batch_ids != pad_value).astype(np.float32)
            mask = np.tile(np.expand_dims(mask, 1), [1, query_len, 1])
            if mask_type == 'causal':
                assert query_len == batch_ids.shape[1]
                mask = np.tril(mask)
            elif mask_type == 'causal_without_diag':
                assert query_len == batch_ids.shape[1]
                mask = np.tril(mask, -1)
            elif mask_type == 'diag':
                assert query_len == batch_ids.shape[1]
                mask = np.stack([np.diag(np.diag(m)) for m in mask], 0)
        else:
            mask = np.zeros_like(batch_ids).astype(np.float32)
            mask = np.tile(np.expand_dims(mask, 1), [1, query_len, 1])
        return mask

    def make_some_noice(ids):
        if args.use_random_noice:
            noice_ids = np.random.randint(
                1, len(tokenizer.vocab), size=ids.shape)
        else:
            noice_ids = np.ones_like(ids) * tokenizer.vocab['[NOISE]']
        pos, = np.where(np.ones_like(ids))
        np.random.shuffle(pos)
        pos = pos[:int(args.noise_prob * len(pos))]
        ids[pos, ] = noice_ids[pos, ]
        return ids

    def map_fn(example_id, src_ids, tgt_ids):
        src_ids = src_ids[:args.max_encode_len]
        tgt_ids = tgt_ids[:args.max_decode_len]
        src_ids, src_sids = tokenizer.build_for_ernie(src_ids)
        src_pids = np.arange(len(src_ids))

        tgt_ids, tgt_sids = tokenizer.build_for_ernie(tgt_ids)
        tgt_pids = np.arange(len(tgt_ids)) + len(src_ids)  # positions continue from src
        tgt_sids = np.ones_like(tgt_sids) * args.tgt_type_id

        attn_ids = np.ones_like(tgt_ids) * attn_id
        if args.noise_prob > 0.:
            tgt_labels = deepcopy(tgt_ids)
            tgt_ids = make_some_noice(tgt_ids)  #corrupted
        else:
            tgt_labels = tgt_ids

        return (example_id, src_ids, src_pids, src_sids, tgt_ids, tgt_pids,
                tgt_sids, attn_ids, tgt_labels)

    def after_padding(example_id, src_ids, src_pids, src_sids, tgt_ids,
                      tgt_pids, tgt_sids, attn_ids, tgt_labels):
        '''
        attention mask:
        ***  src,  tgt, attn
        src  00,   01,   02
        tgt  10,   11,   12
        attn 20,   21,   22

        ***   s1, s2 | t1 t2 t3| attn1 attn2 attn3
        s1    1,  1  | 0, 0, 0,| 0,    0,    0,
        s2    1,  1  | 0, 0, 0,| 0,    0,    0,
        -
        t1    1,  1, | 1, 0, 0,| 0,    0,    0,
        t2    1,  1, | 1, 1, 0,| 0,    0,    0,
        t3    1,  1, | 1, 1, 1,| 0,    0,    0,
        -
        attn1 1,  1, | 0, 0, 0,| 1,    0,    0,
        attn2 1,  1, | 1, 0, 0,| 0,    1,    0,
        attn3 1,  1, | 1, 1, 0,| 0,    0,    1,

        for details, see Fig3. https://arxiv.org/abs/2001.11314
        '''

        src_len = src_ids.shape[1]
        tgt_len = tgt_ids.shape[1]
        mask_00 = gen_mask(src_ids, 'bidi', query_len=src_len)
        mask_01 = gen_mask(tgt_ids, 'empty', query_len=src_len)
        mask_02 = gen_mask(attn_ids, 'empty', query_len=src_len)

        mask_10 = gen_mask(src_ids, 'bidi', query_len=tgt_len)
        mask_11 = gen_mask(tgt_ids, 'causal', query_len=tgt_len)
        mask_12 = gen_mask(attn_ids, 'empty', query_len=tgt_len)

        mask_20 = gen_mask(src_ids, 'bidi', query_len=tgt_len)
        mask_21 = gen_mask(tgt_ids, 'causal_without_diag', query_len=tgt_len)
        mask_22 = gen_mask(attn_ids, 'diag', query_len=tgt_len)
        '''
        mask = np.concatenate([
            np.concatenate([mask_00, mask_01, mask_02], 2),
            np.concatenate([mask_10, mask_11, mask_12], 2),
            np.concatenate([mask_20, mask_21, mask_22], 2),
        ], 1)

        ids = np.concatenate([src_ids, tgt_ids, attn_ids], 1)
        pids = np.concatenate([src_pids, tgt_pids, tgt_pids], 1)
        sids = np.concatenate([src_sids, tgt_sids, tgt_sids], 1)

        '''

        mask_src_2_src = mask_00
        mask_tgt_2_srctgt = np.concatenate([mask_10, mask_11], 2)
        mask_attn_2_srctgtattn = np.concatenate([mask_20, mask_21, mask_22], 2)

        tgt_labels = tgt_labels[np.where(tgt_labels != 0)]
        return (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids,
                tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt,
                mask_attn_2_srctgtattn, tgt_labels)

    bytes_vocab = {k.encode('utf8'): v for k, v in tokenizer.vocab.items()}
    feature_column = propeller.data.FeatureColumns([
        propeller.data.LabelColumn('id'),
        propeller.data.TextColumn(
            'src', unk_id=tokenizer.unk_id, vocab_dict=bytes_vocab),
        propeller.data.TextColumn(
            'tgt', unk_id=tokenizer.unk_id, vocab_dict=bytes_vocab),
    ])

    train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=True, repeat=True, use_gz=False) \
                                   .map(map_fn) \
                                   .padded_batch(args.bsz) \
                                   .map(after_padding)


    dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \
                                   .map(map_fn) \
                                   .padded_batch(args.eval_bsz) \
                                   .map(after_padding) \
                                   .shard(env.nranks, env.dev_id)

    vocab_size, _ = model.word_emb.weight.shape
    model = P.DataParallel(model)
    g_clip = P.nn.ClipGradByGlobalNorm(1.0)
    param_name_to_exclude_from_weight_decay = re.compile(
        r'.*layer_norm_scale|.*layer_norm_bias|.*b_0')
    lr_scheduler = P.optimizer.lr.LambdaDecay(
        args.lr,
        get_warmup_and_linear_decay(
            args.max_steps, int(args.warmup_proportion * args.max_steps)))

    opt = P.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.wd,
        apply_decay_param_fun=lambda n: not param_name_to_exclude_from_weight_decay.match(n),
        grad_clip=g_clip)

    scaler = P.amp.GradScaler(enable=args.use_amp)
    attn_id = tokenizer.vocab[args.attn_token]
    create_if_not_exists(args.save_dir)
    if args.predict_output_dir:
        create_if_not_exists(args.predict_output_dir)

    with P.amp.auto_cast(enable=args.use_amp):
        for step, data in enumerate(
                P.io.DataLoader(
                    train_ds, places=P.CUDAPlace(env.dev_id),
                    batch_size=None)):
            (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids,
             tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt,
             mask_attn_2_srctgtattn, tgt_labels) = data

            _, __, info = model(
                src_ids,
                sent_ids=src_sids,
                pos_ids=src_pids,
                attn_bias=mask_src_2_src,
                encode_only=True)
            cached_k, cached_v = info['caches']
            _, __, info = model(
                tgt_ids,
                sent_ids=tgt_sids,
                pos_ids=tgt_pids,
                attn_bias=mask_tgt_2_srctgt,
                past_cache=(cached_k, cached_v),
                encode_only=True)
            cached_k2, cached_v2 = info['caches']
            past_cache_k = [
                P.concat([k, k2], 1) for k, k2 in zip(cached_k, cached_k2)
            ]
            past_cache_v = [
                P.concat([v, v2], 1) for v, v2 in zip(cached_v, cached_v2)
            ]
            tgt_labels = F.one_hot(tgt_labels, vocab_size)
            if args.label_smooth > 0.:
                tgt_labels = F.label_smooth(
                    tgt_labels, epsilon=args.label_smooth)
            loss, _, __ = model(
                attn_ids,
                sent_ids=tgt_sids,
                pos_ids=tgt_pids,
                attn_bias=mask_attn_2_srctgtattn,
                past_cache=(past_cache_k, past_cache_v),
                tgt_labels=tgt_labels,
                tgt_pos=P.nonzero(attn_ids == attn_id))

            loss = scaler.scale(loss)
            loss.backward()
            scaler.minimize(opt, loss)
            model.clear_gradients()
            lr_scheduler.step()

            if step % 10 == 0:
                _lr = lr_scheduler.get_lr()
                if args.use_amp:
                    _l = (loss / scaler._scale).numpy()
                    msg = '[rank-%d][step-%d] train loss %.5f lr %.3e scaling %.3e' % (
                        env.dev_id, step, _l, _lr, scaler._scale.numpy())
                else:
                    _l = loss.numpy()
                    msg = '[rank-%d][step-%d] train loss %.5f lr %.3e' % (
                        env.dev_id, step, _l, _lr)
                log.debug(msg)

            if args.save_dir is not None and step % 1000 == 0 and env.dev_id == 0:
                P.save(model.state_dict(), args.save_dir / 'ckpt.bin')

            if args.predict_output_dir is not None and step > args.skip_eval_steps and step % args.eval_steps == 0:
                assert args.predict_output_dir.exists(), \
                    'predict_output_dir not found: %s' % args.predict_output_dir
                log.debug('doing predict on gpu %d...' % env.dev_id)
                evaluate(model, dev_ds, step, args)
            if step > args.max_steps:
                break
        evaluate(model, dev_ds, step, args)

    if args.save_dir is not None:
        P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
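
All the attention masks in the example above come from gen_mask: 'bidi' lets every query position attend to every non-pad key, 'causal' additionally lower-triangulates the result, and 'diag' keeps only the diagonal (used for the attn-to-attn block). A self-contained numpy sketch of the same logic on a toy batch, trimmed to those three mask types:

import numpy as np

def gen_mask(batch_ids, mask_type='bidi', query_len=None, pad_value=0):
    # same construction as in seq2seq() above, limited to bidi/causal/diag
    if query_len is None:
        query_len = batch_ids.shape[1]
    mask = (batch_ids != pad_value).astype(np.float32)  # 1 where not padding
    mask = np.tile(np.expand_dims(mask, 1), [1, query_len, 1])
    if mask_type == 'causal':
        mask = np.tril(mask)  # query i sees keys <= i
    elif mask_type == 'diag':
        mask = np.stack([np.diag(np.diag(m)) for m in mask], 0)
    return mask

batch = np.array([[101, 7, 8, 0]])   # one sequence; the trailing 0 is padding
print(gen_mask(batch, 'bidi')[0])    # every row masks out the pad column
print(gen_mask(batch, 'causal')[0])  # additionally lower-triangular
print(gen_mask(batch, 'diag')[0])    # identity restricted to non-pad positions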
Code example #8
def train(args):
    log.info("pretraining start")
    profile = False

    place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0)))

    # set seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    paddle.seed(args.seed)
    get_rng_state_tracker().add('global_seed', args.seed)
    get_rng_state_tracker().add('local_seed',
                                args.seed + fleet.worker_index() + 2021)

    # define execution strategy
    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = 2
    exec_strategy.num_iteration_per_drop_scope = 1

    # define distribution strategy
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.execution_strategy = exec_strategy
    dist_strategy.nccl_comm_num = 3
    if args.use_recompute:
        log.info("using recompute.")
    dist_strategy.recompute = args.use_recompute
    dist_strategy.sharding = args.use_sharding
    dist_strategy.pipeline = args.num_pp > 1

    # define topology structure for dp/pp/mp
    topo = Topology(rank=fleet.worker_index(),
                    world_size=fleet.worker_num(),
                    dp=args.num_dp,
                    pp=args.num_pp,
                    sharding=args.num_sharding,
                    mp=args.num_mp)

    is_last = False
    if topo.pp.rank == (topo.pp.size - 1):
        is_last = True

    dp_sharding_rank = topo.dp.rank * topo.sharding.size + topo.sharding.rank
    dp_worldsize = topo.dp.size * topo.sharding.size
    bsz_per_dp = args.global_bsz // dp_worldsize

    micro_bsz = args.micro_bsz
    assert args.global_bsz % micro_bsz == 0, f"cannot do gradient accumulation, global_bsz: {args.global_bsz} micro_bsz: {micro_bsz}"
    acc_steps = bsz_per_dp // micro_bsz

    # sharding \ model parallel \ pipeline
    assert dist_strategy.sharding == True
    dist_strategy.sharding_configs = {
        "segment_broadcast_MB": 32,
        "sharding_degree": args.num_sharding,
        "mp_degree": args.num_mp,
        "pp_degree": args.num_pp,
        "dp_degree": args.num_dp,
        "optimize_offload": True,
    }
    dist_strategy.pipeline_configs = {
        "schedule_mode": "1F1B",
        "micro_batch_size": micro_bsz,
        "accumulate_steps": acc_steps,
    }
    log.info(
        f"using global_bsz: {args.global_bsz} micro_bsz: {micro_bsz}, acc_steps: {acc_steps}"
    )

    dist_strategy.amp = args.use_amp
    dist_strategy.amp_configs = {
        "custom_white_list": ['softmax', 'layer_norm', 'gelu'],
        "init_loss_scaling": 32768,
        "decr_every_n_nan_or_inf": 2,
        "incr_every_n_steps": 1000,
        "incr_ratio": 2.0,
        "use_dynamic_loss_scaling": True,
        "decr_ratio": 0.5,
        "use_pure_fp16": False,
        "use_fp16_guard": False,
    }

    dist_strategy.lamb = args.use_lamb
    dist_strategy.lamb_configs = {
        'lamb_weight_decay': 0.01,
        'exclude_from_weight_decay':
            ['layer_norm_bias', 'layer_norm_scale', '.b_0'],
    }

    train_program = fluid.Program()
    startup_program = fluid.Program()
    with fluid.program_guard(train_program, startup_program):
        with fluid.unique_name.guard():
            graph_vars = create_model(args, 'train', micro_bsz,
                                      dp_sharding_rank, dp_worldsize, topo)
            data_loader = graph_vars['data_loader']
            for op in train_program.global_block().ops:
                if op.type == 'fill_constant':
                    op._set_attr(
                        'op_device', "gpu:0"
                    )  # XXX: hack: https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/layers/tensor.py#L1376

            if args.use_recompute:
                dist_strategy.recompute_configs = {
                    "checkpoints": graph_vars['checkpoints'],
                    # "enable_offload": args.use_offload,
                    # "checkpoint_shape": [micro_bsz, args.max_seq_len, 4096],
                }

            log.debug("base lr: {}".format(args.learning_rate))
            scheduled_lr = linear_warmup_decay(
                learning_rate=args.learning_rate,
                warmup_steps=args.warmup_steps,
                num_train_steps=args.num_train_steps)

            clip_norm_thres = 1.0
            if paddlenlp.ops.optimizer._jit_compile():
                optimizer = paddlenlp.ops.optimizer.AdamwOptimizer(
                    learning_rate=scheduled_lr,
                    grad_clip=fluid.clip.GradientClipByGlobalNorm(
                        clip_norm=clip_norm_thres),
                    weight_decay=args.weight_decay,
                    apply_decay_param_fun=apply_weight_decay_fun)
            else:
                optimizer = fluid.optimizer.Adam(
                    learning_rate=scheduled_lr,
                    grad_clip=fluid.clip.GradientClipByGlobalNorm(
                        clip_norm=clip_norm_thres),
                    #multi_precision=True,
                    #weight_decay=args.weight_decay, # merge this pr to use weight_decay: https://github.com/PaddlePaddle/Paddle/pull/29248
                    #exclude_from_weight_decay_fn=exclude_from_weight_decay
                )

            optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
            log.info(f"using dist strategy: {dist_strategy}")

            optimizer.minimize(graph_vars['total_loss'])

            final_strategy = fleet._final_strategy()
            applied_meta_list = fleet._get_applied_meta_list()
            log.info("final strategy: {}".format(final_strategy))
            log.info("applied_meta_list: {}".format(applied_meta_list))

    program_desc_dir = os.path.join(args.output_dir, "program_desc")
    if not os.path.isdir(program_desc_dir):
        os.mkdir(program_desc_dir)

    with open(
            program_desc_dir + "/main_program.txt.%d" %
        (int(os.environ.get('FLAGS_selected_gpus', 0))), 'w') as f:
        f.write(str(train_program))

    with open(
            program_desc_dir + "/startup_program.txt.%d" %
        (int(os.environ.get('FLAGS_selected_gpus', 0))), 'w') as f:
        f.write(str(startup_program))

    exe = fluid.Executor(place)
    exe.run(startup_program)

    optimizer.amp_init(place)

    #save_path = os.path.join(args.output_dir, 'step_0')
    #log.debug("saving models to {}".format(save_path))
    #save_persistables(exe, save_path, train_program)

    if args.init_checkpoint and args.init_checkpoint != "":
        log.info(' ')
        log.info(
            '############################WARNING############################')
        log.info(
            '####### using init_checkpoint, not init_pretraining_params ###')
        log.info(
            '## meaning hyper params e.g. lr will inherit from checkpoint ##')
        log.info(
            '###############################################################')
        init_checkpoint(exe, args.init_checkpoint, train_program)
        log.info(' ')

    output_dir = args.output_dir
    save_steps = args.save_steps
    total_time = 0
    cost_vals, lm_losses, sop_accs = [], [], []
    global_steps = args.global_steps + 1
    steps = 0
    log_path = 'train_log/node-%d' % fleet.worker_index()
    start_time = time.time()
    with LogWriter(os.path.join(args.output_dir, log_path)) as swriter:
        data_loader.start()
        while True:
            #if steps < global_steps:
            #    steps += 1
            #    continue
            if not is_last:
                fetch_list = []
            else:
                fetch_list = [
                    graph_vars['total_loss'], graph_vars['mean_mask_lm_loss'],
                    scheduled_lr
                ]
                if args.use_sop:
                    fetch_list.extend(
                        [graph_vars['sop_acc'], graph_vars['sop_loss']])
                if args.use_amp:
                    loss_scaling = train_program.global_block(
                    ).vars['loss_scaling_0']
                    fetch_list.append(loss_scaling)

            ret = exe.run(train_program, fetch_list=fetch_list
                          )  # run one mini-batch (= acc_steps micro-batches)
            #use_program_cache=True)

            steps += 1

            if is_last:
                if args.use_sop and args.use_amp:
                    cost_val, lm_loss, lr, sop_acc, sop_loss, loss_scaling_0 = ret
                elif args.use_sop:
                    cost_val, lm_loss, lr, sop_acc, sop_loss = ret
                elif args.use_amp:
                    cost_val, lm_loss, lr, loss_scaling_0 = ret
                else:
                    cost_val, lm_loss, lr = ret
                cost_vals.append(cost_val[0])
                lm_losses.append(lm_loss[0])
                if args.use_sop:
                    sop_accs.append(sop_acc[0])

                if steps > 0 and (steps % args.log_steps) == 0:
                    end_time = time.time()
                    total_time = end_time - start_time
                    cost_val = np.mean(cost_vals)
                    lm_loss = np.mean(lm_losses)
                    swriter.add_scalar('loss/total_loss', cost_val, steps)
                    swriter.add_scalar('loss/mlm_loss', lm_loss, steps)
                    swriter.add_scalar('lr/scheduled_lr', lr[0], steps)

                    if args.use_sop:
                        sop_acc = np.mean(sop_accs)
                        swriter.add_scalar('loss/sop_loss', sop_loss, steps)
                        swriter.add_scalar('train/sop_acc', sop_acc, steps)
                    else:
                        sop_acc = 0.0

                    if args.use_amp:
                        swriter.add_scalar('lr/loss_scaling',
                                           loss_scaling_0[0], steps)
                    else:
                        loss_scaling_0 = [0.0]

                    log.info(
                        "worker_index: %d, step: %d, cost: %f, "
                        "mlm loss: %f, sentence order acc: %f, "
                        "speed: %f steps/s, "
                        "speed: %f samples/s, "
                        "speed: %f tokens/s, "
                        "learning rate: %.3e, loss_scalings: %f" %
                        (fleet.worker_index(), steps, cost_val, lm_loss,
                         sop_acc, args.log_steps / total_time,
                         args.log_steps * args.global_bsz / total_time,
                         args.log_steps * args.global_bsz * args.max_seq_len /
                         total_time, lr[0], loss_scaling_0[0]))

                    cost_vals, lm_losses, sop_accs = [], [], []
                    start_time = time.time()

            # TODO: add evaluation
            if steps > 0 and args.eval_steps > 0 and steps % args.eval_steps == 0:
                pass

            if steps > 0 and args.save_steps > 0 and steps % args.save_steps == 0:
                if args.use_hybrid_dp and fleet.worker_index() > 8:
                    continue
                save_path = os.path.join(output_dir, 'step_' + str(steps))
                log.debug("saving models to {}".format(save_path))
                save_persistables(exe, save_path, train_program)

            if steps == args.num_train_steps:
                if args.use_hybrid_dp and fleet.worker_index() > 8:
                    continue
                save_path = os.path.join(output_dir,
                                         'final_step_' + str(steps))
                save_persistables(exe, save_path, train_program)
                log.debug("saving final models to {}".format(save_path))
                log.debug("end of training, total steps: {}".format(steps))
Code example #9
File: main.py Project: xueeinstein/PaddleHelix


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='gnn')
    parser.add_argument("--config", type=str, default="./config.yaml")
    parser.add_argument("--task_name", type=str, default="task_name")
    parser.add_argument("--infer_model", type=str, default=None)
    parser.add_argument("--log_id", type=str, default=None)
    args = parser.parse_args()

    if args.infer_model is not None:
        config = prepare_config(args.config, isCreate=False, isSave=False)
        config.model_path_for_infer = args.infer_model
        infer(config)
    else:
        config = prepare_config(args.config, isCreate=True, isSave=True)

        log_to_file(log, config.log_dir, config.log_filename)

        if config.warm_start_from is not None:
            log.info("loading model config from %s" %
                     config.pretrained_config_file)
            pretrained_config = prepare_config(config.pretrained_config_file)
            pretrained_model_config = pretrained_config.pretrained_model_config
        else:
            pretrained_model_config = config.model_config

        config.log_id = args.log_id
        train(config, pretrained_model_config)
Code example #10
    parser.add_argument('--lr', type=float, default=3e-5, help='learning rate')
    parser.add_argument('--save_dir', type=str, default=None, help='model output directory')
    parser.add_argument('--n_best_size', type=int, default=20, help='nbest prediction to keep')
    parser.add_argument('--max_answer_length', type=int, default=100, help='max answer span')
    parser.add_argument('--wd', type=float, default=0.00, help='weight decay, aka L2 regularizer')

    args = parser.parse_args()

    tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained)

    if not os.path.exists(args.train_file):
        raise RuntimeError('input data not found at %s' % args.train_file)
    if not os.path.exists(args.dev_file):
        raise RuntimeError('input data not found at %s' % args.dev_file)

    log.info('making train/dev data...')
    train_examples = mrc_reader.read_files(args.train_file, is_training=True)
    train_features = mrc_reader.convert_example_to_features(train_examples, args.max_seqlen, tokenizer, is_training=True)

    dev_examples = mrc_reader.read_files(args.dev_file, is_training=False)
    dev_features = mrc_reader.convert_example_to_features(dev_examples, args.max_seqlen, tokenizer, is_training=False)

    log.info('train examples: %d, features: %d' % (len(train_examples), len(train_features)))

    def map_fn(unique_id, example_index, doc_span_index, tokens, token_to_orig_map, token_is_max_context, token_ids, position_ids, text_type_ids, start_position, end_position):
        if start_position is None:
            start_position = 0
        if end_position is None:
            end_position = 0
        return np.array(unique_id), np.array(token_ids), np.array(text_type_ids), np.array(start_position), np.array(end_position)
Code example #11
File: make_pretrain_data.py Project: ljw23/ERNIE-1
        ex = build_example(transposed_slots)
        write_gz(ex.SerializeToString(), to_file)
        slots = []


if __name__ == '__main__':
    parser = argparse.ArgumentParser('Pretrain Data Maker')
    parser.add_argument('src', type=str)
    parser.add_argument('tgt', type=str)
    parser.add_argument('--vocab', type=str, required=True)
    parser.add_argument('-v', '--verbose', action='store_true')
    parser.add_argument('-c', '--check', action='store_true')

    args = parser.parse_args()
    log.setLevel(logging.DEBUG)

    from tokenizing_ernie import _wordpiece
    pat = re.compile(r'([a-zA-Z0-9]+|\S)')

    vocab = {
        j.strip().split(b'\t')[0].decode('utf8'): i
        for i, j in enumerate(open(args.vocab, 'rb'))
    }
    vocab_set = set(vocab.keys())

    with open(args.src, 'rb') as from_file, gzip.open(args.tgt,
                                                      'wb') as to_file:
        log.info('making gz from bb %s ==> %s' % (args.src, args.tgt))
        build_bb(from_file, to_file)
        log.info('done: %s' % args.tgt)
Code example #12
File: main.py Project: xueeinstein/PaddleHelix
 def cmp_fn(old, new):
     if old['eval'][args.metrics] - new['eval'][args.metrics] > 0:
         log.info("best %s eval result: %s" % (args.metrics, new['eval']))
         return True
     else:
         return False
Code example #13
def seq2seq(model, tokenizer, args):
    log.info('Training starts with args: %r' % args)
    attn_id = tokenizer.vocab[args.attn_token]

    def gen_mask(batch_ids, mask_type='bidi', query_len=None, pad_value=0):
        if query_len is None:
            query_len = batch_ids.shape[1]
        if mask_type != 'empty':
            mask = (batch_ids != pad_value).astype(np.float32)
            mask = np.tile(np.expand_dims(mask, 1), [1, query_len, 1])
            if mask_type == 'causal':
                assert query_len == batch_ids.shape[1]
                mask = np.tril(mask)
            elif mask_type == 'causal_without_diag':
                assert query_len == batch_ids.shape[1]
                mask = np.tril(mask, -1)
            elif mask_type == 'diag':
                assert query_len == batch_ids.shape[1]
                mask = np.stack([np.diag(np.diag(m)) for m in mask], 0)
        else:
            mask = np.zeros_like(batch_ids).astype(np.float32)
            mask = np.tile(np.expand_dims(mask, 1), [1, query_len, 1])
        return mask

    def make_some_noice(ids):
        if args.use_random_noice:
            noice_ids = np.random.randint(1,
                                          len(tokenizer.vocab),
                                          size=ids.shape)
        else:
            noice_ids = np.ones_like(ids) * tokenizer.vocab['[NOISE]']
        pos, = np.where(np.ones_like(ids))
        np.random.shuffle(pos)
        pos = pos[:int(args.noise_prob * len(pos))]
        ids[pos, ] = noice_ids[pos, ]
        return ids

    def map_fn(example_id, src_ids, tgt_ids):
        src_ids = src_ids[:args.max_encode_len]
        tgt_ids = tgt_ids[:args.max_decode_len]
        src_ids, src_sids = tokenizer.build_for_ernie(src_ids)
        src_pids = np.arange(len(src_ids))

        tgt_ids, tgt_sids = tokenizer.build_for_ernie(tgt_ids)
        tgt_pids = np.arange(len(tgt_ids)) + len(src_ids)  # positions continue from src
        tgt_sids = np.ones_like(tgt_sids) * args.tgt_type_id

        attn_ids = np.ones_like(tgt_ids) * attn_id
        if args.noise_prob > 0.:
            tgt_labels = deepcopy(tgt_ids)
            tgt_ids = make_some_noice(tgt_ids)  #corrupted
        else:
            tgt_labels = tgt_ids

        return (example_id, src_ids, src_pids, src_sids, tgt_ids, tgt_pids,
                tgt_sids, attn_ids, tgt_labels)

    def after_padding(example_id, src_ids, src_pids, src_sids, tgt_ids,
                      tgt_pids, tgt_sids, attn_ids, tgt_labels):
        '''
        attention mask:
        ***  src,  tgt, attn
        src  00,   01,   02
        tgt  10,   11,   12
        attn 20,   21,   22

        ***   s1, s2 | t1 t2 t3| attn1 attn2 attn3
        s1    1,  1  | 0, 0, 0,| 0,    0,    0,
        s2    1,  1  | 0, 0, 0,| 0,    0,    0,
        -
        t1    1,  1, | 1, 0, 0,| 0,    0,    0,
        t2    1,  1, | 1, 1, 0,| 0,    0,    0,
        t3    1,  1, | 1, 1, 1,| 0,    0,    0,
        -
        attn1 1,  1, | 0, 0, 0,| 1,    0,    0,
        attn2 1,  1, | 1, 0, 0,| 0,    1,    0,
        attn3 1,  1, | 1, 1, 0,| 0,    0,    1,

        for details, see Fig3. https://arxiv.org/abs/2001.11314
        '''

        src_len = src_ids.shape[1]
        tgt_len = tgt_ids.shape[1]
        mask_00 = gen_mask(src_ids, 'bidi', query_len=src_len)
        mask_01 = gen_mask(tgt_ids, 'empty', query_len=src_len)
        mask_02 = gen_mask(attn_ids, 'empty', query_len=src_len)

        mask_10 = gen_mask(src_ids, 'bidi', query_len=tgt_len)
        mask_11 = gen_mask(tgt_ids, 'causal', query_len=tgt_len)
        mask_12 = gen_mask(attn_ids, 'empty', query_len=tgt_len)

        mask_20 = gen_mask(src_ids, 'bidi', query_len=tgt_len)
        mask_21 = gen_mask(tgt_ids, 'causal_without_diag', query_len=tgt_len)
        mask_22 = gen_mask(attn_ids, 'diag', query_len=tgt_len)
        '''
        mask = np.concatenate([
            np.concatenate([mask_00, mask_01, mask_02], 2),
            np.concatenate([mask_10, mask_11, mask_12], 2),
            np.concatenate([mask_20, mask_21, mask_22], 2),
        ], 1)

        ids = np.concatenate([src_ids, tgt_ids, attn_ids], 1)
        pids = np.concatenate([src_pids, tgt_pids, tgt_pids], 1)
        sids = np.concatenate([src_sids, tgt_sids, tgt_sids], 1)

        '''

        mask_src_2_src = mask_00
        mask_tgt_2_srctgt = np.concatenate([mask_10, mask_11], 2)
        mask_attn_2_srctgtattn = np.concatenate([mask_20, mask_21, mask_22], 2)

        tgt_labels = tgt_labels[np.where(tgt_labels != 0)]
        return (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids,
                tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt,
                mask_attn_2_srctgtattn, tgt_labels)

    bytes_vocab = {k.encode('utf8'): v for k, v in tokenizer.vocab.items()}
    feature_column = propeller.data.FeatureColumns([
        propeller.data.LabelColumn('id'),
        propeller.data.TextColumn('src',
                                  unk_id=tokenizer.unk_id,
                                  vocab_dict=bytes_vocab),
        propeller.data.TextColumn('tgt',
                                  unk_id=tokenizer.unk_id,
                                  vocab_dict=bytes_vocab),
    ])

    train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=False, repeat=True, use_gz=False) \
                                   .map(map_fn)

    dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \
                                   .map(map_fn) \
                                   .padded_batch(args.eval_bsz) \
                                   .map(after_padding)

    log.debug('shard %d of %d' %
              (D.parallel.Env().dev_id, D.parallel.Env().nranks))
    train_ds = train_ds.shard(
        D.parallel.Env().nranks,
        D.parallel.Env().dev_id).shuffle(10000).padded_batch(
            args.bsz).map(after_padding)
    dev_ds = dev_ds.shard(D.parallel.Env().nranks, D.parallel.Env().dev_id)

    shapes = [[None, None]] * 7 + [[None, None, None]] * 3 + [[None]]
    types = ['int64'] * 11

    train_ds.data_shapes = shapes
    train_ds.data_types = types
    dev_ds.data_shapes = shapes
    dev_ds.data_types = types

    vocab_size, _ = model.word_emb.weight.shape
    ctx = D.parallel.prepare_context()
    model = D.parallel.DataParallel(model, ctx)
    g_clip = F.clip.GradientClipByGlobalNorm(1.0)
    opt = AdamW(learning_rate=LinearDecay(
        args.lr, int(args.warmup_proportion * args.max_steps), args.max_steps),
                parameter_list=model.parameters(),
                weight_decay=args.wd,
                grad_clip=g_clip)
    attn_id = tokenizer.vocab[args.attn_token]
    for step, data in enumerate(train_ds.start(place)):
        (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids, tgt_pids,
         attn_ids, mask_src_2_src, mask_tgt_2_srctgt, mask_attn_2_srctgtattn,
         tgt_labels) = data

        _, __, info = model(src_ids,
                            sent_ids=src_sids,
                            pos_ids=src_pids,
                            attn_bias=mask_src_2_src,
                            encode_only=True)
        cached_k, cached_v = info['caches']
        _, __, info = model(tgt_ids,
                            sent_ids=tgt_sids,
                            pos_ids=tgt_pids,
                            attn_bias=mask_tgt_2_srctgt,
                            past_cache=(cached_k, cached_v),
                            encode_only=True)
        cached_k2, cached_v2 = info['caches']
        past_cache_k = [
            L.concat([k, k2], 1) for k, k2 in zip(cached_k, cached_k2)
        ]
        past_cache_v = [
            L.concat([v, v2], 1) for v, v2 in zip(cached_v, cached_v2)
        ]
        if args.label_smooth > 0.:
            tgt_labels = L.label_smooth(F.one_hot(tgt_labels, vocab_size),
                                        epsilon=args.label_smooth)
        loss, _, __ = model(attn_ids,
                            sent_ids=tgt_sids,
                            pos_ids=tgt_pids,
                            attn_bias=mask_attn_2_srctgtattn,
                            past_cache=(past_cache_k, past_cache_v),
                            tgt_labels=tgt_labels,
                            tgt_pos=L.where(attn_ids == attn_id))

        scaled_loss = model.scale_loss(loss)
        scaled_loss.backward()
        model.apply_collective_grads()
        opt.minimize(scaled_loss)
        model.clear_gradients()
        if step % 10 == 0:
            loss = loss.numpy()
            ppl = np.exp(loss)
            log.debug('[step %d]train loss %.5f, ppl %.5f, lr %.3e' %
                      (step, loss, ppl, opt.current_step_lr()))
        if args.save_dir is not None and step % 1000 == 0 and D.parallel.Env(
        ).dev_id == 0:
            F.save_dygraph(model.state_dict(), args.save_dir)
        if args.predict_output_dir is not None and step > args.skip_eval_steps and step % args.eval_steps == 0:
            assert os.path.exists(
                args.predict_output_dir
            ), 'predict_output_dir not found: %s' % args.predict_output_dir
            log.debug('doing predict on gpu %d...' % D.parallel.Env().dev_id)
            evaluate(model, dev_ds, step, args)
        if step > args.max_steps:
            break
    evaluate(model, dev_ds, step, args)

    if args.save_dir is not None:
        F.save_dygraph(model.state_dict(), args.save_dir)
Code example #14
File: pretrain.py Project: leo038/ERNIE
def make_pretrain_dataset(name, dir, vocab, hparams, args):
    gz_files = glob(dir)
    if not gz_files:
        raise ValueError('train data not found in %s' % dir)

    log.info('read from %s' % '\n'.join(gz_files))
    max_input_seqlen = args.max_seqlen
    max_pretrain_seqlen = lambda: max_input_seqlen if r.random(
    ) > 0.15 else r.randint(1, max_input_seqlen)  # short sentence rate

    def _parse_gz(record_str):  # function that takes python_str as input
        ex = propeller.data.example_pb2.SequenceExample()
        ex.ParseFromString(record_str)
        doc = [
            np.array(f.int64_list.value, dtype=np.int64)
            for f in ex.feature_lists.feature_list['txt'].feature
        ]
        doc_seg = [
            np.array(f.int64_list.value, dtype=np.int64)
            for f in ex.feature_lists.feature_list['segs'].feature
        ]
        return doc, doc_seg

    def bb_to_segments(filename):
        ds = Dataset.from_record_file(filename).map(_parse_gz)

        def gen():
            buf, size = [], 0
            iterator = iter(ds)
            while 1:
                doc, doc_seg = next(iterator)
                for line, line_seg in zip(doc, doc_seg):
                    #line = np.array(sp_model.SampleEncodeAsIds(line, -1, 0.1), dtype=np.int64) # 0.1 means large variance on sentence piece result
                    if len(line) == 0:
                        continue
                    line = np.array(line)
                    line_seg = np.array(line_seg)
                    size += len(line)
                    buf.append(np.stack([line, line_seg]).transpose())
                    if size > max_input_seqlen:
                        yield buf,
                        buf, size = [], 0
                if len(buf) != 0:
                    yield buf,
                    buf, size = [], 0

        return Dataset.from_generator_func(gen)

    def sample_negative(dataset):
        def gen():
            iterator = iter(dataset)
            while True:
                chunk_a, = next(iterator)
                #chunk_b, = next(iterator)

                seqlen = max_pretrain_seqlen()
                seqlen_a = r.randint(1, seqlen)
                seqlen_b = seqlen - seqlen_a
                len_a = list(accumulate([len(c) for c in chunk_a]))
                buf_a = [c for c, l in zip(chunk_a, len_a)
                         if l < seqlen_a]  #always take the first one
                buf_b = [
                    c for c, l in zip(chunk_a, len_a) if seqlen_a <= l < seqlen
                ]

                if r.random() < 0.5:  #pos or neg
                    label = np.int64(1)
                else:
                    label = np.int64(0)
                    buf_a, buf_b = buf_b, buf_a

                if not (len(buf_a) and len(buf_b)):
                    continue
                a = np.concatenate(buf_a)
                b = np.concatenate(buf_b)
                #log.debug(a)
                #log.debug(b)
                sample, seg_info, token_type = build_pair(
                    a, b, args.max_seqlen,
                    vocab)  #negative sample might exceed max seqlen
                yield sample, seg_info, token_type, label

        ds = propeller.data.Dataset.from_generator_func(gen)
        return ds

    def after(sentence, seg_info, segments, label):
        batch_size, seqlen = sentence.shape
        sentence, mask_pos, mlm_label = apply_mask(sentence, seg_info,
                                                   args.mask_rate,
                                                   hparams.vocab_size, vocab)

        ra = r.random()
        if ra < args.check:
            print('***')
            print('\n'.join([
                str(j) + '\t' + '|'.join(map(str, i))
                for i, j in zip(sentence.tolist(), label)
            ]))
            print('***')
            print('\n'.join(['|'.join(map(str, i))
                             for i in seg_info.tolist()]))
            print('***')
            print('|'.join(map(str, mlm_label.tolist())))
            print('***')

        return sentence, segments, mlm_label, mask_pos, label

    # pretrain pipeline
    dataset = Dataset.from_list(gz_files)
    if propeller.train.distribution.status.mode == propeller.train.distribution.DistributionMode.NCCL:
        log.info('Apply sharding in distribution env')
        dataset = dataset.shard(
            propeller.train.distribution.status.num_replica,
            propeller.train.distribution.status.replica_id)
    dataset = dataset.repeat().shuffle(buffer_size=len(gz_files))

    dataset = dataset.interleave(map_fn=bb_to_segments,
                                 cycle_length=len(gz_files),
                                 block_length=1)
    dataset = dataset.shuffle(
        buffer_size=1000)  #must shuffle to ensure negative sample randomness
    dataset = sample_negative(dataset)
    dataset = dataset.padded_batch(hparams.batch_size, (0, 0, 0, 0)).map(after)
    dataset.name = name
    return dataset
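
sample_negative above builds sentence-order pairs by cutting one chunk of consecutive sentences at a random token budget: sentences whose cumulative length stays under seqlen_a form the first segment, the following ones (up to seqlen) form the second, and with probability 0.5 the two are swapped to make a negative. A small sketch of that split using itertools.accumulate, as in the snippet (toy sentences, hypothetical lengths):

import random as r
from itertools import accumulate

# toy chunk: each entry is one sentence's token ids
chunk_a = [[1, 2], [3, 4, 5], [6], [7, 8, 9, 10]]

seqlen = 8                       # target length of the pair
seqlen_a = r.randint(1, seqlen)  # random split point, in tokens

len_a = list(accumulate(len(c) for c in chunk_a))  # cumulative lengths
buf_a = [c for c, l in zip(chunk_a, len_a) if l < seqlen_a]
buf_b = [c for c, l in zip(chunk_a, len_a) if seqlen_a <= l < seqlen]

label = 1
if r.random() >= 0.5:            # swapped order => negative pair, label 0
    label = 0
    buf_a, buf_b = buf_b, buf_a
print(seqlen_a, buf_a, buf_b, label)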
Code example #15
                                   .padded_batch(hparams.batch_size, (0, 0)) \
                                   .map(after)

    def unsupervise_before(text_a, teacher_text_a):
        teacher_sentence, teacher_segments = utils.data.build_1_pair(
            teacher_text_a,
            max_seqlen=args.teacher_max_seqlen,
            cls_id=teacher_cls_id,
            sep_id=teacher_sep_id)
        sentence_a = text_a[:args.max_seqlen]
        return sentence_a, teacher_sentence, teacher_segments

    client = InferenceClient(args.teacher_host,
                             batch_size=args.server_batch_size,
                             num_coroutine=args.num_coroutine)
    log.info('teacher host %s' % args.teacher_host)

    def ask_teacher_for_label(sentence_a, teacher_sentence, teacher_segments):
        sentence_a, teacher_sentence, teacher_segments = utils.data.expand_dims(
            sentence_a, teacher_sentence, teacher_segments)
        teacher_label, = client(teacher_sentence, teacher_segments)
        teacher_label = teacher_label[:, :]
        return sentence_a, teacher_label

    unsup_train_ds = unsupervise_feature_column.build_dataset('unsup_train', data_dir=os.path.join(args.data_dir, 'unsup_train_aug'), shuffle=True, repeat=True, use_gz=False) \
                                   .buffered(100) \
                                   .map(unsupervise_before) \
                                   .padded_batch(hparams.batch_size, (0, 0, 0)) \
                                   .map(ask_teacher_for_label)

    train_ds = utils.data.interleave(train_ds, unsup_train_ds)
Code example #16
File: main.py Project: xueeinstein/PaddleHelix
def train(args, pretrained_model_config=None):
    log.info("loading data")
    raw_dataset = GraphPropPredDataset(name=args.dataset_name)
    args.num_class = raw_dataset.num_tasks
    args.eval_metric = raw_dataset.eval_metric
    args.task_type = raw_dataset.task_type

    train_ds = MolDataset(args, raw_dataset)

    args.eval_steps = math.ceil(len(train_ds) / args.batch_size)
    log.info("Total %s steps (eval_steps) every epoch." % (args.eval_steps))

    fn = MgfCollateFn(args)

    train_loader = Dataloader(train_ds,
                              batch_size=args.batch_size,
                              num_workers=args.num_workers,
                              shuffle=args.shuffle,
                              stream_shuffle_size=args.shuffle_size,
                              collate_fn=fn)

    # for evaluating
    eval_train_loader = train_loader
    eval_train_loader = PDataset.from_generator_func(eval_train_loader)

    train_loader = multi_epoch_dataloader(train_loader, args.epochs)
    train_loader = PDataset.from_generator_func(train_loader)

    if args.warm_start_from is not None:
        # warm start setting
        def _fn(v):
            if not isinstance(v, F.framework.Parameter):
                return False
            if os.path.exists(os.path.join(args.warm_start_from, v.name)):
                return True
            else:
                return False

        ws = propeller.WarmStartSetting(predicate_fn=_fn,
                                        from_dir=args.warm_start_from)
    else:
        ws = None

    def cmp_fn(old, new):
        if old['eval'][args.metrics] - new['eval'][args.metrics] > 0:
            log.info("best %s eval result: %s" % (args.metrics, new['eval']))
            return True
        else:
            return False

    if args.log_id is not None:
        save_best_model = int(args.log_id) == 5
    else:
        save_best_model = True
    best_exporter = propeller.exporter.BestResultExporter(
        args.output_dir, (cmp_fn, save_best_model))

    eval_datasets = {"eval": eval_train_loader}

    propeller.train.train_and_eval(
        model_class_or_model_fn=MgfModel,
        params=pretrained_model_config,
        run_config=args,
        train_dataset=train_loader,
        eval_dataset=eval_datasets,
        warm_start_setting=ws,
        exporters=[best_exporter],
    )
Code example #17
File: main.py Project: xueeinstein/PaddleHelix
 def _worker():
     for i in range(epochs):
         log.info("BEGIN: epoch %s ..." % i)
         for batch in loader():
             yield batch
         log.info("END: epoch %s ..." % i)
Code example #18
File: encoder_server.py Project: RayX-X/ERNIE-1
                            'layer9',
                            'layer8',
                            'layer7',
                            'layer6',
                            'layer5',
                            'layer4',
                            'layer3',
                            'layer2',
                            'layer1',
                        ],
                        default='pooler')
    args = parser.parse_args()

    if args.verbose:
        log.setLevel(logging.DEBUG)
    cuda_env = os.getenv("CUDA_VISIBLE_DEVICES")
    if cuda_env is None:
        raise RuntimeError('CUDA_VISIBLE_DEVICES not set')
    n_devices = len(cuda_env.split(","))
    if args.encode_layer.lower() == 'pooler':
        model_dir = os.path.join(args.model_dir, 'pooler')
    else:
        pat = re.compile(r'layer(\d+)')
        match = pat.match(args.encode_layer.lower())
        layer = int(match.group(1))
        model_dir = os.path.join(args.model_dir, 'enc%d' % layer)

    server = InferenceServer(model_dir, n_devices)
    log.info('propeller server listening on port %d' % args.port)
    server.listen(args.port)
Code example #19
File: pretrain.py Project: zs960114/ERNIE
                                batch_size=0)):
            (src_ids, sent_ids, mlm_label, mask_pos, nsp_label) = samples
            loss, mlmloss, nsploss = model(src_ids,
                                           sent_ids,
                                           labels=mlm_label,
                                           mlm_pos=mask_pos,
                                           nsp_labels=nsp_label)
            loss = scaler.scale(loss)
            loss.backward()
            scaler.minimize(opt, loss)
            model.clear_gradients()
            lr_scheduler.step()

            if step % 10 == 0:
                _lr = lr_scheduler.get_lr()
                if args.use_amp:
                    _l = (loss / scaler._scale).numpy()
                    msg = '[rank-%d][step-%d] train loss %.5f lr %.3e scaling %.3e' % (
                        env.dev_id, step, _l, _lr, scaler._scale.numpy())
                else:
                    _l = loss.numpy()
                    msg = '[rank-%d][step-%d] train loss %.5f lr %.3e' % (
                        env.dev_id, step, _l, _lr)
                log.debug(msg)
            if step % 1000 == 0 and env.dev_id == 0:
                log.debug('saving...')
                P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
            if step > args.max_steps:
                break
    log.info('done')
Code example #20
                        default=None,
                        help='inference model output directory')
    parser.add_argument('--init_checkpoint', type=str, default=None)
    parser.add_argument('--save_dir',
                        type=str,
                        default=None,
                        help='model output directory')
    parser.add_argument('--wd',
                        type=float,
                        default=0.01,
                        help='weight decay, aka L2 regularizer')

    args = parser.parse_args()

    place = F.CUDAPlace(D.parallel.Env().dev_id)
    D.guard(place).__enter__()

    ernie = ErnieModelForGeneration.from_pretrained(args.from_pretrained)
    tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained,
                                               mask_token=None)
    rev_dict = {v: k for k, v in tokenizer.vocab.items()}
    rev_dict[tokenizer.pad_id] = ''  # replace [PAD]
    rev_dict[tokenizer.unk_id] = ''  # replace [UNK]

    if args.init_checkpoint is not None:
        log.info('loading checkpoint from %s' % args.init_checkpoint)
        sd, _ = D.load_dygraph(args.init_checkpoint)
        ernie.set_dict(sd)

    seq2seq(ernie, tokenizer, args)
Code example #21
def make_pretrain_dataset(name, gz_files, is_train, vocab, batch_size,
                          vocab_size, max_seqlen, global_rank, world_size):
    max_input_seqlen = max_seqlen
    max_pretrain_seqlen = lambda: max_input_seqlen if r.random(
    ) > 0.15 else r.randint(1, max_input_seqlen)  # short sentence rate

    def _parse_gz(record_str):  # function that takes python_str as input
        ex = propeller.data.example_pb2.SequenceExample()
        ex.ParseFromString(record_str)
        doc = [
            np.array(f.int64_list.value, dtype=np.int64)
            for f in ex.feature_lists.feature_list['txt'].feature
        ]
        doc_seg = [
            np.array(f.int64_list.value, dtype=np.int64)
            for f in ex.feature_lists.feature_list['segs'].feature
        ]
        return doc, doc_seg

    def _merge_docseg(doc_seg):  # ngram masking
        ret, span_ctr, ngram_ctr, ngram, last = [], 0, 1, sample_geo(), None
        for s in doc_seg:
            if s != -1 and last is not None and s != last:
                ngram_ctr += 1
                if ngram_ctr > ngram:
                    ngram = sample_geo()
                    ngram_ctr = 1
                    span_ctr += 1
            last = s
            ret.append(span_ctr)
        ret = np.array(ret)
        assert len(doc_seg) == len(ret)
        return ret

    def bb_to_segments(filename):
        ds = Dataset.from_record_file(filename).map(_parse_gz)

        def gen():
            buf, size = [], 0
            iterator = iter(ds)
            while 1:
                doc, doc_seg = next(iterator)
                for line, line_seg in zip(doc, doc_seg):
                    #line = np.array(sp_model.SampleEncodeAsIds(line, -1, 0.1), dtype=np.int64) # 0.1 means large variance on sentence piece result
                    if len(line) == 0:
                        continue
                    line = np.array(line)
                    line_seg = np.array(line_seg)
                    line_seg = _merge_docseg(line_seg)  # mask span
                    size += len(line)
                    buf.append(np.stack([line, line_seg]).transpose())
                    if size > max_input_seqlen:
                        yield buf,
                        buf, size = [], 0
                if len(buf) != 0:
                    yield buf,
                    buf, size = [], 0

        return Dataset.from_generator_func(gen)

    def sample_negative(dataset):
        def gen():
            iterator = iter(dataset)
            while True:
                chunk_a, = next(iterator)
                #chunk_b, = next(iterator)

                seqlen = max_pretrain_seqlen()
                seqlen_a = r.randint(1, seqlen)
                seqlen_b = seqlen - seqlen_a
                len_a = list(accumulate([len(c) for c in chunk_a]))
                buf_a = [c for c, l in zip(chunk_a, len_a)
                         if l < seqlen_a]  #always take the first one
                buf_b = [
                    c for c, l in zip(chunk_a, len_a) if seqlen_a <= l < seqlen
                ]

                if not (len(buf_a) and len(buf_b)):
                    continue
                a = np.concatenate(buf_a)
                b = np.concatenate(buf_b)
                #log.debug(a)
                #log.debug(b)
                sample, seg_info, token_type = build_pair(
                    a, b, max_seqlen, vocab)  # a negative sample might exceed max_seqlen
                yield sample, seg_info, token_type

        ds = propeller.data.Dataset.from_generator_func(gen)
        return ds

    def after(sentence, seg_info, segments):
        batch_size, seqlen = sentence.shape
        sentence, mask_pos, mlm_label = apply_mask(sentence, seg_info, 1.,
                                                   0.15, vocab_size, vocab)
        sentence = sentence.reshape([-1, seqlen, 1])
        segments = segments.reshape([-1, seqlen, 1])
        mlm_label = mlm_label.reshape([-1, 1])
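        # apply_mask returns (row, col) mask coordinates; flatten them into
        # positions of the flattened [batch * seqlen] token tensor.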
        mask_pos_reshape = []
        for i, p in zip(mask_pos[0], mask_pos[1]):
            p += i * seqlen
            mask_pos_reshape.append(p)
        mask_pos = np.array(mask_pos_reshape).reshape([-1, 1])
        return sentence, segments, mlm_label, mask_pos

    # pretrain pipeline
    dataset = Dataset.from_list(gz_files)
    log.info('Applying sharding in distributed env %d/%d' %
             (global_rank, world_size))
    dataset = dataset.shard(world_size, global_rank)
    log.info('read from %s' % ','.join(list(iter(dataset))))
    cycle_length = len(range(global_rank, len(gz_files), world_size))
    if is_train:
        dataset = dataset.repeat()
        #dataset = dataset.repeat().shuffle(buffer_size=len(gz_files))
        #dataset = dataset.shuffle(buffer_size=len(gz_files))
    dataset = dataset.interleave(map_fn=bb_to_segments,
                                 cycle_length=cycle_length,
                                 block_length=1)
    dataset = dataset.shuffle(
        buffer_size=10000)  # must shuffle to ensure negative sample randomness
    dataset = sample_negative(dataset)

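    # padded_batch pad values: 0 for token ids, -1 for seg_info (the span-id
    # stream uses -1 as a boundary marker), 0 for token types.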
    dataset = dataset.padded_batch(batch_size, (0, -1, 0), max_seqlen) \
                     .map(after)
    dataset.name = name
    return dataset
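
A minimal usage sketch for make_pretrain_dataset above (a hypothetical single-worker call; the file name, vocab mapping and size values are illustrative assumptions, not values from the repo):

    ds = make_pretrain_dataset(
        'train', ['part-00000.gz'], is_train=True, vocab=vocab,
        batch_size=32, vocab_size=len(vocab), max_seqlen=512,
        global_rank=0, world_size=1)
    # each batch is (sentence, segments, mlm_label, mask_pos)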
Code example #22
File: finetune_classifier.py Project: zs960114/ERNIE
                    help='checkpoint to warm start from')
parser.add_argument(
    '--use_amp',
    action='store_true',
    help='only activate AMP (auto mixed precision acceleration) on TensorCore compatible devices')

args = parser.parse_args()

if args.bsz > args.micro_bsz:
    assert args.bsz % args.micro_bsz == 0, 'cannot perform gradient accumulate with bsz:%d micro_bsz:%d' % (
        args.bsz, args.micro_bsz)
    acc_step = args.bsz // args.micro_bsz
    log.info(
        'performing gradient accumulate: global_bsz:%d, micro_bsz:%d, accumulate_steps:%d'
        % (args.bsz, args.micro_bsz, acc_step))
    args.bsz = args.micro_bsz
else:
    acc_step = 1

tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained)
#tokenizer = ErnieTinyTokenizer.from_pretrained(args.from_pretrained)

feature_column = propeller.data.FeatureColumns([
    propeller.data.TextColumn('seg_a',
                              unk_id=tokenizer.unk_id,
                              vocab_dict=tokenizer.vocab,
                              tokenizer=tokenizer.tokenize),
    propeller.data.TextColumn('seg_b',
                              unk_id=tokenizer.unk_id,
Code example #23
File: client.py Project: ljw23/ERNIE-1
    def __init__(self, address):
        self.context = zmq.Context()
        self.address = address
        self.socket = self.context.socket(zmq.REQ)
        self.socket.connect(address)
        log.info("Connecting to server... %s" % address)
Code example #24
                         args.host,
                         args.port,
                         batch_size=args.batch_size,
                         num_coroutine=args.num_coroutine)
    with open(args.input, 'rb') as f:
        inputs = [i.strip().split(b'\t') for i in f]
    if len(inputs) == 0:
        raise ValueError('empty input')
    send_batch = args.num_coroutine * args.batch_size
    send_num = len(inputs) // send_batch + 1
    rets = []
    start = time()
    for i in range(send_num):
        batch_slice = inputs[i * send_batch:(i + 1) * send_batch]
        if len(batch_slice) == 0:
            continue
        columns = list(zip(*batch_slice))
        if len(columns) > 2:
            raise ValueError('inputs file has more than 2 columns')
        ret = client(*columns)
        if len(ret.shape) == 3:
            ret = ret[:, 0, :]  # take cls
        rets.append(ret)
    end = time()
    with open(args.output, 'wb') as outf:
        arr = np.concatenate(rets, 0)
        np.save(outf, arr)
        log.info('query num: %d average latency %.5f' %
                 (len(inputs), (end - start) / len(inputs)))
Code example #25
def make_pretrain_dataset(name, gz_files, is_train, vocab, batch_size,
                          vocab_size, max_seqlen, global_rank, world_size):
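    # Variant of the pretrain pipeline above: packs 1-4 sub-sentences per sample
    # and emits a sentence-order permutation label alongside the MLM targets.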
    max_input_seqlen = max_seqlen
    max_pretrain_seqlen = lambda: max_input_seqlen if r.random() > 0.15 else r.randint(1, max_input_seqlen)  # 15% of samples use a random shorter target length

    def _parse_gz(record_str):  # function that takes python_str as input
        ex = propeller.data.example_pb2.SequenceExample()
        ex.ParseFromString(record_str)
        doc = [
            np.array(f.int64_list.value, dtype=np.int64)
            for f in ex.feature_lists.feature_list['txt'].feature
        ]
        doc_seg = [
            np.array(f.int64_list.value, dtype=np.int64)
            for f in ex.feature_lists.feature_list['segs'].feature
        ]
        return doc, doc_seg

    def random_n_sub_sentence():
        # Sample how many sub-sentences to pack, with probability proportional
        # to the number of orderings each case contributes (1, 2, 6 and 24 of
        # the 33 sentence-order labels).
        ratio = r.random()
        if ratio < 1. / 33.:
            n_sub_sentence = 1
        elif ratio < 3. / 33.:
            n_sub_sentence = 2
        elif ratio < 9. / 33.:
            n_sub_sentence = 3
        else:
            n_sub_sentence = 4
        return n_sub_sentence

    def gen_interval(l, n_interval):
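        # Randomly split l chunks into n_interval contiguous, non-empty
        # intervals by drawing n_interval - 1 distinct split points.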
        n_needed = n_interval - 1
        split_points = sorted(r.sample(range(1, l), n_needed))
        index = [0] + split_points + [l]
        return [(index[i], index[i + 1]) for i in range(len(index) - 1)]

    def joint_sentences(buf, n_sub_sentence, interval_start_ends):
        # buf = [[text1, seg1], [text2, seg2]]
        tokens_of_sub_sentence = [[] for _ in range(n_sub_sentence)]
        segs_of_sub_sentence = [[] for _ in range(n_sub_sentence)]
        assert (len(interval_start_ends) == len(tokens_of_sub_sentence))
        for (start, end), tokens, segs in zip(interval_start_ends,
                                              tokens_of_sub_sentence,
                                              segs_of_sub_sentence):
            for chunk in buf[start:end]:
                tokens.extend(chunk[0])
                segs.extend(chunk[1])
        new_buf = []
        for t_merge, s_merge in zip(tokens_of_sub_sentence,
                                    segs_of_sub_sentence):
            new_buf.append([t_merge, s_merge])
        return new_buf

    def _mereg_docseg(doc_seg):  # ngram masking
        ret, span_ctr, ngram_ctr, ngram, last = [], 0, 1, sample_geo(), None
        for s in doc_seg:
            if s != -1 and last is not None and s != last:
                ngram_ctr += 1
                if ngram_ctr > ngram:
                    ngram = sample_geo()
                    ngram_ctr = 1
                    span_ctr += 1
            last = s
            ret.append(span_ctr)
        ret = np.array(ret)
        assert len(doc_seg) == len(ret)
        return ret

    def bb_to_segments(filename):
        ds = Dataset.from_record_file(filename).map(_parse_gz)

        def gen():
            buf, size = [], 0
            for doc, doc_seg in ds:  # plain for-loop: StopIteration must not leak from a generator (PEP 479)
                n_sub_sentence = random_n_sub_sentence()
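                # reserve room for [CLS] plus one [SEP] per sub-sentence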
                max_num_tokens = max_input_seqlen - (n_sub_sentence + 1)
                for line, line_seg in zip(doc, doc_seg):
                    if len(line) == 0:
                        continue
                    line = list(line)
                    line_seg = np.array(line_seg)
                    line_seg = list(_mereg_docseg(line_seg))  # mask span
                    #size += len(line)
                    #buf.append([line, line_seg])

                    if size + len(line) > max_num_tokens:
                        if len(buf) > n_sub_sentence:
                            interval = gen_interval(len(buf), n_sub_sentence)
                            buf = joint_sentences(buf, n_sub_sentence, interval)
                        elif len(buf) < n_sub_sentence:
                            max_num_tokens = max_input_seqlen - (len(buf) + 1)

                        if len(buf) > 0:
                            truncate_seqs(buf, max_num_tokens)
                            yield buf,
                            buf, size = [[line, line_seg]], len(line)
                        n_sub_sentence = random_n_sub_sentence()
                        max_num_tokens = max_input_seqlen - (n_sub_sentence + 1)
                    else:
                        size += len(line)
                        buf.append([line, line_seg])

                if len(buf) != 0:
                    if len(buf) > n_sub_sentence:
                        interval = gen_interval(len(buf), n_sub_sentence)
                        buf = joint_sentences(buf, n_sub_sentence, interval)
                    elif len(buf) < n_sub_sentence:
                        max_num_tokens = max_input_seqlen - (len(buf) + 1)
                    truncate_seqs(buf, max_num_tokens)
                    yield buf,
                    buf, size = [], 0

        return Dataset.from_generator_func(gen)

    def sample_negative(dataset):
        cls_id = vocab["[CLS]"]
        sep_id = vocab["[SEP]"]

        permutation_1_sent = [[0]]
        permutation_2_sent = [[0, 1], [1, 0]]
        permutation_3_sent = [[0, 1, 2], [0, 2, 1], [1, 0, 2], [1, 2, 0],
                              [2, 0, 1], [2, 1, 0]]
        permutation_4_sent = [
            [0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1],
            [0, 3, 1, 2], [0, 3, 2, 1], [1, 0, 2, 3], [1, 0, 3, 2],
            [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],
            [2, 0, 1, 3], [2, 0, 3, 1], [2, 1, 0, 3], [2, 1, 3, 0],
            [2, 3, 0, 1], [2, 3, 1, 0], [3, 0, 1, 2], [3, 0, 2, 1],
            [3, 1, 0, 2], [3, 1, 2, 0], [3, 2, 0, 1], [3, 2, 1, 0]
        ]
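        # 1 + 2 + 6 + 24 = 33 sentence-order labels; label offsets per case:
        # 1 sent -> 0, 2 sents -> 1..2, 3 sents -> 3..8, 4 sents -> 9..32.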

        def gen():
            for chunks, in dataset:  # PEP 479-safe iteration
                sample = [cls_id]
                seg_info = [-1]
                token_type = [0]
                label = 0

                if len(chunks) == 1:  # one sent
                    # label in [0]
                    choice_index = np.random.choice(1)
                    for index, order in enumerate(
                            permutation_1_sent[choice_index]):
                        sample += chunks[order][0] + [sep_id]
                        seg_info += chunks[order][1] + [-1]
                        token_type += [index] * len(chunks[order][0]) + [index]
                    label += choice_index

                elif len(chunks) == 2:  # two sent
                    # label in [1, 2]
                    choice_index = np.random.choice(2)
                    for index, order in enumerate(
                            permutation_2_sent[choice_index]):
                        sample += chunks[order][0] + [sep_id]
                        seg_info += chunks[order][1] + [-1]
                        token_type += [index] * len(chunks[order][0]) + [index]
                    label += choice_index + 1

                elif len(chunks) == 3:  # three sent
                    # label in [3,...,8]
                    choice_index = np.random.choice(6)
                    for index, order in enumerate(
                            permutation_3_sent[choice_index]):
                        sample += chunks[order][0] + [sep_id]
                        seg_info += chunks[order][1] + [-1]
                        token_type += [index] * len(chunks[order][0]) + [index]
                    label += choice_index + 3

                else:  # four sent
                    # label in [9,...,32]
                    choice_index = np.random.choice(24)
                    for index, order in enumerate(
                            permutation_4_sent[choice_index]):
                        sample += chunks[order][0] + [sep_id]
                        seg_info += chunks[order][1] + [-1]
                        token_type += [index] * len(chunks[order][0]) + [index]
                    label += choice_index + 9

                sample = np.array(sample)
                if len(sample) < 128:  # drop overly short samples
                    continue
                seg_info = np.array(seg_info)
                token_type = np.array(token_type)
                label = np.int64(label)
                yield sample, seg_info, token_type, label

        ds = propeller.data.Dataset.from_generator_func(gen)
        return ds

    def after(sentence, seg_info, segments, label):
        batch_size, seqlen = sentence.shape
        sentence, mask_pos, mlm_label = apply_mask(sentence, seg_info, 1., 0.15,
                                                   vocab_size, vocab)
        sentence = sentence.reshape([-1, seqlen, 1])
        segments = segments.reshape([-1, seqlen, 1])
        mlm_label = mlm_label.reshape([-1, 1])
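        # flatten (row, col) mask coordinates into positions of the flattened
        # [batch * seqlen] token tensor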
        mask_pos_reshape = []
        for i, p in zip(mask_pos[0], mask_pos[1]):
            p += i * seqlen
            mask_pos_reshape.append(p)
        mask_pos = np.array(mask_pos_reshape).reshape([-1, 1])
        label = label.reshape([-1, 1])
        return sentence, segments, mlm_label, mask_pos, label

    # pretrain pipeline
    dataset = Dataset.from_list(gz_files)
    log.info('Applying sharding in distributed env %d/%d' %
             (global_rank, world_size))
    dataset = dataset.shard(world_size, global_rank)
    log.info('read from %s' % ','.join(list(iter(dataset))))
    cycle_length = len(range(global_rank, len(gz_files), world_size))
    if is_train:
        dataset = dataset.repeat()
    dataset = dataset.interleave(
        map_fn=bb_to_segments, cycle_length=cycle_length, block_length=1)
    dataset = dataset.shuffle(
        buffer_size=10000)  # must shuffle to ensure negative sample randomness
    dataset = sample_negative(dataset)

    dataset = dataset.padded_batch(batch_size, (0, -1, 0, 0), max_seqlen) \
                     .map(after)
    dataset.name = name
    return dataset