Example #1
def dump_node_feat(config):
    log.info("Dump node feat starting...")
    id2str = [
        line.strip("\n").split("\t")[-1]
        for line in io.open(os.path.join(config.graph_work_path, "terms.txt"),
                            encoding=config.encoding)
    ]
    if "tiny" in config.ernie_name:
        tokenizer = ErnieTinyTokenizer.from_pretrained(config.ernie_name)
        #tokenizer.vocab = tokenizer.sp_model.vocab
        term_ids = [
            term2id(s, tokenizer=tokenizer, max_seqlen=config.max_seqlen)
            for s in id2str
        ]
    else:
        tokenizer = ErnieTokenizer.from_pretrained(config.ernie_name)
        pool = multiprocessing.Pool()
        term_ids = pool.map(
            partial(term2id, tokenizer=tokenizer,
                    max_seqlen=config.max_seqlen), id2str)
        pool.terminate()
    node_feat_path = os.path.join(config.graph_work_path, "node_feat")
    if not os.path.exists(node_feat_path):
        os.makedirs(node_feat_path)
    np.save(os.path.join(node_feat_path, "term_ids.npy"),
            np.array(term_ids, np.uint16))
    log.info("Dump node feat done.")
Example #2
 def _initialize(self):
     """
     initialize with the necessary elements
     """
     self.tokenizer = ErnieTokenizer.from_pretrained(
         "ernie-1.0", mask_token=None)
     self.rev_dict = {v: k for k, v in self.tokenizer.vocab.items()}
     self.rev_lookup = np.vectorize(lambda i: self.rev_dict[i])
     self._model = None
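For context, rev_lookup maps an array of token ids back to token strings element-wise through the reversed vocab. An illustrative use, assuming `predictor` is an instance of the class above after _initialize() has run (the ids below are placeholders, not real model output):

import numpy as np

ids = np.array([[208, 282, 540]])    # placeholder token ids
tokens = predictor.rev_lookup(ids)   # same-shape array of token strings
print(tokens)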
Example #3
def init_lstm_var(args):
    # Different languages use different tokenizers.
    if args.language == "ch":
        tokenizer = ErnieTokenizer.from_pretrained(args.vocab_path)
        padding_idx = tokenizer.vocab.get('[PAD]')
        tokenizer.inverse_vocab = [
            item[0]
            for item in sorted(tokenizer.vocab.items(), key=lambda x: x[1])
        ]
    else:
        vocab = Vocab.load_vocabulary(args.vocab_path,
                                      unk_token='[UNK]',
                                      pad_token='[PAD]')
        tokenizer = CharTokenizer(vocab)
        padding_idx = vocab.token_to_idx.get('[PAD]', 0)

    trans_fn = partial(convert_example,
                       tokenizer=tokenizer,
                       is_test=True,
                       language=args.language)

    # Init the attention layer.
    lstm_hidden_size = 196
    attention = SelfInteractiveAttention(hidden_size=2 * lstm_hidden_size)
    model = BiLSTMAttentionModel(attention_layer=attention,
                                 vocab_size=len(tokenizer.vocab),
                                 lstm_hidden_size=lstm_hidden_size,
                                 num_classes=2,
                                 padding_idx=padding_idx)

    # Reads data and generates mini-batches.
    dev_ds = SentiData().read(os.path.join(args.data_dir, 'dev'),
                              args.language)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=padding_idx),  # input_ids
        Stack(dtype="int64"),  # seq len
    ): [data for data in fn(samples)]

    dev_loader = create_dataloader(dev_ds,
                                   trans_fn=trans_fn,
                                   batch_size=args.batch_size,
                                   mode='validation',
                                   batchify_fn=batchify_fn)

    return model, tokenizer, dev_loader
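The convert_example function passed as trans_fn above is not shown; the batchify_fn pads input_ids and stacks a sequence length, so in test mode each transformed sample must be an (input_ids, seq_len) pair. A hypothetical sketch under that assumption (the field name "text" and the tokenizer methods are illustrative, not the repository's actual helper):

def convert_example(example, tokenizer, is_test=False, language="ch"):
    # Turn raw text into token ids; assumes the tokenizer exposes
    # tokenize() and convert_tokens_to_ids().
    tokens = tokenizer.tokenize(example["text"])
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    seq_len = len(input_ids)
    if is_test:
        return input_ids, seq_len
    return input_ids, seq_len, int(example["label"])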
Example #4

        help='max_train_steps, set this to EPOCH * NUM_SAMPLES / BATCH_SIZE')
    parser.add_argument('--warmup_proportion', type=float, default=0.1)
    parser.add_argument('--lr', type=float, default=5e-5, help='learning rate')
    parser.add_argument('--eval', action='store_true')
    parser.add_argument('--save_dir',
                        type=str,
                        default=None,
                        help='model output directory')
    parser.add_argument('--wd',
                        type=float,
                        default=0.01,
                        help='weight decay, aka L2 regularizer')

    args = parser.parse_args()

    tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained)
    #tokenizer = ErnieTinyTokenizer.from_pretrained(args.from_pretrained)

    place = F.CUDAPlace(0)
    with FD.guard(place):
        model = ErnieModelForSequenceClassification.from_pretrained(
            args.from_pretrained, num_labels=3, name='')
        if not args.eval:
            feature_column = propeller.data.FeatureColumns([
                propeller.data.TextColumn('seg_a',
                                          unk_id=tokenizer.unk_id,
                                          vocab_dict=tokenizer.vocab,
                                          tokenizer=tokenizer.tokenize),
                propeller.data.LabelColumn('label'),
            ])
Example #5
    parser.add_argument('--init_checkpoint',
                        type=str,
                        default=None,
                        help='checkpoint to warm start from')
    parser.add_argument('--debug', action='store_true')
    parser.add_argument('--use_type', action='store_true')
    parser.add_argument('--ohem_ratio', type=float, default=0)
    parser.add_argument('--use_test_data', action='store_true')
    parser.add_argument('--use_nil_as_cand', action='store_true')
    parser.add_argument('--kfold', type=int, default=None)
    parser.add_argument('--save_steps', type=int, default=1000)
    parser.add_argument('--eval_steps', type=int, default=1000)
    parser.add_argument('--use_dev_data', action='store_true')
    args = parser.parse_args()

    tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained,
                                               special_token_list=['###'])
    #tokenizer = ErnieTinyTokenizer.from_pretrained(args.from_pretrained)

    kb = read_json('./data/data60899/kb.json')
    if args.use_type:
        train = read_json('work/result/train_nil_all.json')
        dev = read_json('work/result/dev_nil_all.json')
    else:
        train = read_json('./data/data60899/train.json')
        dev = read_json('./data/data60899/dev.json')

    if args.kfold is not None:
        print('reading fold %s data...' % args.kfold)
        train = read_json('./work/data/train_fold%s.json' % args.kfold)
    if args.use_test_data:
        train = train + read_json('work/result/result.json')
Example #6

                        default=None,
                        help='inference model output directory')
    parser.add_argument('--init_checkpoint', type=str, default=None)
    parser.add_argument('--save_dir',
                        type=str,
                        default=None,
                        help='model output directory')
    parser.add_argument('--wd',
                        type=float,
                        default=0.01,
                        help='weight decay, aka L2 regularizer')

    args = parser.parse_args()

    place = F.CUDAPlace(D.parallel.Env().dev_id)
    D.guard(place).__enter__()

    ernie = ErnieModelForGeneration.from_pretrained(args.from_pretrained)
    tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained,
                                               mask_token=None)
    rev_dict = {v: k for k, v in tokenizer.vocab.items()}
    rev_dict[tokenizer.pad_id] = ''  # replace [PAD]
    rev_dict[tokenizer.unk_id] = ''  # replace [UNK]

    if args.init_checkpoint is not None:
        log.info('loading checkpoint from %s' % args.init_checkpoint)
        sd, _ = D.load_dygraph(args.init_checkpoint)
        ernie.set_dict(sd)

    seq2seq(ernie, tokenizer, args)
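With pad_id and unk_id mapped to empty strings, a generated id sequence can be joined into text directly. An illustrative decoding step (the ids are placeholders, not actual model output):

gen_ids = [845, 1023, 512, tokenizer.pad_id, tokenizer.pad_id]  # placeholder ids
text = ''.join(rev_dict[i] for i in gen_ids)  # [PAD]/[UNK] vanish as ''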
Example #7
from ernie.optimization import AdamW, LinearDecay

# This example uses the chnsenticorp Chinese sentiment classification task as a
# demo; the unsupervised data required for distillation was expanded beforehand
# via data augmentation.
#
# Please download the data from “” and place it under ./chnsenticorp-data/
# The data has 3 columns: original text; space-separated tokens; sentiment label
# The first column is the input for ERNIE; the second is the input for the
# bag-of-words (BoW) model
# The precomputed BoW vocabulary is at ./chnsenticorp-data/vocab.bow.txt

# Hyperparameters for fine-tuning the teacher model
SEQLEN = 256
BATCH = 32
EPOCH = 10
LR = 5e-5

tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')

student_vocab = {
    i.strip(): l
    for l, i in enumerate(
        open('./chnsenticorp-data/vocab.bow.txt').readlines())
}


def space_tokenizer(i):
    return i.decode('utf8').split()


feature_column = propeller.data.FeatureColumns([
    propeller.data.TextColumn('seg_a',
                              unk_id=tokenizer.unk_id,
Example #8
MODEL_DIR = './model/ernie1.0.1'
OUTPUT_PATH = './data/ernie_output.csv'
SUBSTITUTION_NUM = 10

eval_path = EVAL_PATH
model_dir = MODEL_DIR
substitution_num = SUBSTITUTION_NUM
output_path = OUTPUT_PATH

sentences, difficult_words = read_dataset(eval_path)

place = F.CUDAPlace(D.parallel.Env().dev_id)
D.guard(place).__enter__()

# Initialize the tokenizer
tokenizer = ErnieTokenizer.from_pretrained(model_dir)
rev_dict = {v: k for k, v in tokenizer.vocab.items()}
rev_dict[tokenizer.pad_id] = ''  # replace [PAD]
rev_dict[tokenizer.unk_id] = ''  # replace [UNK]


@np.vectorize
def rev_lookup(i):
    return rev_dict[i]


ernie = ErnieGenerate.from_pretrained(model_dir)

for sentence, difficult_word in zip(sentences, difficult_words):
    print(sentence, difficult_word)
    # Word prediction
Example #9
def load_tokenizer(ernie_name):
    if "tiny" in config.ernie_name:
        tokenizer = ErnieTinyTokenizer.from_pretrained(ernie_name)
    else:
        tokenizer = ErnieTokenizer.from_pretrained(ernie_name)
    return tokenizer
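Illustrative usage: the helper picks ErnieTinyTokenizer when the model name contains "tiny" and the standard ErnieTokenizer otherwise (the model names below are illustrative).

tiny_tokenizer = load_tokenizer("ernie-tiny")   # ErnieTinyTokenizer
base_tokenizer = load_tokenizer("ernie-1.0")    # ErnieTokenizer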
Example #10
    else:
        sampler = paddle.io.BatchSampler(dataset=dataset,
                                         batch_size=batch_size,
                                         shuffle=shuffle)
    dataloader = paddle.io.DataLoader(dataset,
                                      batch_sampler=sampler,
                                      collate_fn=batchify_fn)
    return dataloader


if __name__ == "__main__":
    paddle.set_device(args.device)
    set_seed()

    if args.language == 'ch':
        tokenizer = ErnieTokenizer.from_pretrained(args.vocab_path)

        # Loads dataset.
        train_ds, dev_ds = load_dataset(
            "chnsenticorp", splits=["train", "dev"]
        )  # train_ds, dev_ds: <class 'paddlenlp.datasets.dataset.MapDataset'>

        # Constructs the network.
        vocab_size = len(tokenizer.vocab)
        num_classes = len(train_ds.label_list)
        pad_token_id = tokenizer.vocab.get('[PAD]')
        pad_value = tokenizer.vocab.get('[PAD]', 0)
    else:
        # Loads vocab.
        if not os.path.exists(args.vocab_path):
            raise RuntimeError(