Example No. 1
    def __init__(self,
                 ctx=mx.cpu(),
                 dtype='float32',
                 model='bert_12_768_12',
                 dataset_name='book_corpus_wiki_en_uncased',
                 params_path=None,
                 max_seq_length=25,
                 batch_size=256,
                 sentencepiece=None,
                 root=os.path.join(get_home_dir(), 'models')):
        self.ctx = ctx
        self.dtype = dtype
        self.max_seq_length = max_seq_length
        self.batch_size = batch_size
        self.dataset_name = dataset_name

        # use sentencepiece vocab and a checkpoint
        # we need to set dataset_name to None, otherwise it uses the downloaded vocab
        if params_path and sentencepiece:
            dataset_name = None
        else:
            dataset_name = self.dataset_name
        if sentencepiece:
            vocab = gluonnlp.vocab.BERTVocab.from_sentencepiece(sentencepiece)
        else:
            vocab = None

        self.bert, self.vocab = gluonnlp.model.get_model(
            model,
            dataset_name=dataset_name,
            pretrained=params_path is None,
            ctx=self.ctx,
            use_pooler=False,
            use_decoder=False,
            use_classifier=False,
            root=root,
            vocab=vocab)

        self.bert.cast(self.dtype)
        if params_path:
            logger.info('Loading params from %s', params_path)
            self.bert.load_parameters(params_path,
                                      ctx=ctx,
                                      ignore_extra=True,
                                      cast_dtype=True)

        lower = 'uncased' in self.dataset_name
        if sentencepiece:
            self.tokenizer = BERTSPTokenizer(sentencepiece,
                                             self.vocab,
                                             lower=lower)
        else:
            self.tokenizer = BERTTokenizer(self.vocab, lower=lower)
        self.transform = BERTSentenceTransform(
            tokenizer=self.tokenizer,
            max_seq_length=self.max_seq_length,
            pair=False)
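A hedged construction sketch for the two loading modes handled by this __init__ (assuming it belongs to the BertEmbedding class shown in full in Example No. 25; the paths below are hypothetical):

# default: download the pretrained weights and vocabulary for book_corpus_wiki_en_uncased
embedder = BertEmbedding()

# fine-tuned checkpoint plus sentencepiece vocab: dataset_name is set to None internally,
# so the sentencepiece vocabulary is used instead of the downloaded one
embedder = BertEmbedding(params_path='my_bert.params',      # hypothetical checkpoint
                         sentencepiece='my_vocab.model')    # hypothetical sentencepiece model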
Example No. 2
 def __init__(self,
              src_vocab=None,
              tgt_vocab=None,
              max_src_len=None,
              max_tgt_len=None):
     self.src_vocab = src_vocab
     self.tgt_vocab = tgt_vocab
     self.max_src_len = max_src_len
     self.max_tgt_len = max_tgt_len
     self.bert_src_tokenzier = BERTTokenizer(src_vocab)
     self.bert_tgt_tokenzier = BERTTokenizer(tgt_vocab)
Example No. 3
 def __init__(self,
              en_vocab=None,
              ch_vocab=None,
              max_en_len=None,
              max_ch_len=None):
     self.en_vocab = en_vocab
     self.ch_vocab = ch_vocab
     self.max_en_len = max_en_len
     self.max_ch_len = max_ch_len
     self.bert_en_tokenzier = BERTTokenizer(en_vocab)
     self.bert_ch_tokenzier = BERTTokenizer(ch_vocab)
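In Examples No. 2 and 3 the transforms simply hold one BERTTokenizer per vocabulary. A hedged sketch of what such a tokenizer does when called (the word pieces shown are those of the toy vocabulary used in Example No. 10; a real vocabulary may split the word differently):

tokenizer = BERTTokenizer(en_vocab)           # en_vocab: a gluonnlp BERTVocab
pieces = tokenizer('is this jacksonville ?')  # e.g. ['is', 'this', 'jack', '##son', '##ville', '?']
ids = en_vocab[pieces]                        # map the word pieces to integer ids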
Example No. 4
    def __init__(self, ctx=mx.cpu(), dtype='float32', model='bert_12_768_12',
                 dataset_name='book_corpus_wiki_en_uncased', params_path=None,
                 max_seq_length=25, batch_size=256,
                 root=os.path.join(get_home_dir(), 'models')):
        self.ctx = ctx
        self.dtype = dtype
        self.max_seq_length = max_seq_length
        self.batch_size = batch_size
        self.dataset_name = dataset_name

        # Don't download the pretrained models if we have a parameter path
        self.bert, self.vocab = gluonnlp.model.get_model(model,
                                                         dataset_name=self.dataset_name,
                                                         pretrained=params_path is None,
                                                         ctx=self.ctx,
                                                         use_pooler=False,
                                                         use_decoder=False,
                                                         use_classifier=False,
                                                         root=root)
        self.bert.cast(self.dtype)

        if params_path:
            logger.info('Loading params from %s', params_path)
            self.bert.load_parameters(params_path, ctx=ctx, ignore_extra=True)

        lower = 'uncased' in self.dataset_name
        self.tokenizer = BERTTokenizer(self.vocab, lower=lower)
        self.transform = BERTSentenceTransform(tokenizer=self.tokenizer,
                                               max_seq_length=self.max_seq_length,
                                               pair=False)
Example No. 5
 def data_loader(self, sentences, shuffle=False):
     tokenizer = BERTTokenizer(self.vocab)
     transform = BERTSentenceTransform(tokenizer=tokenizer,
                                       max_seq_length=self.max_seq_length,
                                       pair=False)
     dataset = BertEmbeddingDataset(sentences, transform)
     return DataLoader(dataset=dataset, batch_size=self.batch_size, shuffle=shuffle)
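A hedged sketch of consuming the loader returned above, assuming embedder is an instance of the class this method belongs to (each batch mirrors the BERTSentenceTransform output):

for token_ids, valid_length, token_types in embedder.data_loader(sentences):
    # token_ids:    (batch_size, max_seq_length) integer word-piece ids
    # valid_length: (batch_size,) number of non-padding positions per sentence
    # token_types:  (batch_size, max_seq_length) segment ids, all zeros for single sentences
    pass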
Example No. 6
def main():
    """
    main function
    """
    logging.info('loading vocab file')
    vocab_obj = nlp.Vocab.from_json(open(args.vocab_file, 'rt').read())
    tokenizer = BERTTokenizer(
        vocab=vocab_obj, lower=args.do_lower_case)

    input_files = []
    for input_pattern in args.input_file.split(','):
        input_files.extend(glob.glob(input_pattern))

    logging.info('*** Reading from input files ***')
    for input_file in input_files:
        logging.info('  %s', input_file)

    rng = random.Random(args.random_seed)
    instances = create_training_instances(
        input_files, tokenizer, args.max_seq_length, args.dupe_factor,
        args.short_seq_prob, args.masked_lm_prob, args.max_predictions_per_seq,
        rng)

    output_files = args.output_file.split(',')
    logging.info('*** Writing to output files ***')
    for output_file in output_files:
        logging.info('  %s', output_file)

    write_instance_to_example_files(instances, tokenizer, args.max_seq_length,
                                    args.max_predictions_per_seq, output_files)
Example No. 7
def get_bert_datasets(class_labels,
                      vectorizer,
                      train_ds,
                      dev_ds,
                      batch_size,
                      max_len,
                      bert_model_name = 'bert_12_768_12',
                      bert_dataset = 'book_corpus_wiki_en_uncased',
                      pad=False,
                      use_bert_vocab=False,
                      ctx=mx.cpu()):
    bert, bert_vocabulary = get_model(
        name=bert_model_name,
        dataset_name=bert_dataset,
        pretrained=True,
        ctx=ctx,
        use_pooler=True,
        use_decoder=False,
        use_classifier=False)
    do_lower_case = 'uncased' in bert_dataset    
    bert_tokenizer = BERTTokenizer(bert_vocabulary, lower=do_lower_case)
    trans = BERTDatasetTransform(bert_tokenizer, max_len,
                                 class_labels=class_labels,
                                 label_alias=None,
                                 pad=pad, pair=False,
                                 has_label=True,
                                 vectorizer=vectorizer,
                                 bert_vocab_size = len(bert_vocabulary) if use_bert_vocab else 0)
    train_data, dev_data, test_data, num_train_examples = preprocess_data(
        trans, class_labels, train_ds, dev_ds, batch_size, max_len, pad)
    return train_data, dev_data, num_train_examples, bert, bert_vocabulary
Example No. 8
def main():
    """Main function."""
    time_start = time.time()
    logging.info('loading vocab file from dataset: %s', args.vocab)
    vocab_obj = nlp.data.utils._load_pretrained_vocab(args.vocab)
    tokenizer = BERTTokenizer(vocab=vocab_obj, lower='uncased' in args.vocab)

    input_files = []
    for input_pattern in args.input_file.split(','):
        input_files.extend(glob.glob(os.path.expanduser(input_pattern)))

    logging.info('*** Reading from %d input files ***', len(input_files))
    for input_file in input_files:
        logging.info('  %s', input_file)

    num_outputs = args.num_outputs
    assert len(input_files) >= num_outputs, \
        'Number of outputs must not exceed the number of inputs'

    output_dir = os.path.expanduser(args.output_dir)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    rng = random.Random(args.random_seed)
    nworker = args.num_workers

    # calculate the number of splits
    file_splits = []
    split_size = (len(input_files) + num_outputs - 1) // num_outputs
    for i in range(num_outputs - 1):
        file_splits.append(input_files[i * split_size:(i + 1) * split_size])
    file_splits.append(input_files[(num_outputs - 1) * split_size:])

    # prepare workload
    suffix = 'npz' if args.format == 'numpy' else 'rec'
    count = 0
    map_args = []
    pool_args = (tokenizer, args.max_seq_length, args.dupe_factor,
                 args.short_seq_prob, args.masked_lm_prob,
                 args.max_predictions_per_seq, rng)
    for i, file_split in enumerate(file_splits):
        out = os.path.join(output_dir,
                           'part-{}.{}'.format(str(i).zfill(3), suffix))
        count += len(file_split)
        map_args.append((file_split, out) + pool_args)

    # sanity check
    assert count == len(input_files)

    # dispatch to workers
    if nworker > 0:
        pool = Pool(nworker)
        pool.map(create_training_instances, map_args)
    else:
        for map_arg in map_args:
            create_training_instances(map_arg)

    time_end = time.time()
    logging.info('Time cost=%.1f', time_end - time_start)
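The file-split arithmetic above can be checked with a quick sketch (file names and counts are hypothetical):

input_files = ['a.txt', 'b.txt', 'c.txt', 'd.txt', 'e.txt']          # 5 inputs
num_outputs = 2
split_size = (len(input_files) + num_outputs - 1) // num_outputs     # ceil(5 / 2) = 3
# the first num_outputs - 1 splits take split_size files each, the last takes the rest:
# [['a.txt', 'b.txt', 'c.txt'], ['d.txt', 'e.txt']]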
Example No. 9
    def __init__(self,
                 ctx=mx.cpu(),
                 dtype='float32',
                 model='bert_12_768_12',
                 dataset_name='book_corpus_wiki_en_uncased',
                 params_path=None,
                 max_seq_length=25,
                 batch_size=256):
        """
        Encoding from BERT model.

        Parameters
        ----------
        ctx : Context
            device context on which BertEmbedding is run.
        dtype : str
            data type to use for the model.
        model : str, default bert_12_768_12.
            pre-trained BERT model
        dataset_name : str, default book_corpus_wiki_en_uncased.
            pre-trained model dataset
        params_path: str, default None
            path to a parameters file to load instead of the pretrained model.
        max_seq_length : int, default 25
            max length of each sequence
        batch_size : int, default 256
            batch size
        """
        self.ctx = ctx
        self.dtype = dtype
        self.max_seq_length = max_seq_length
        self.batch_size = batch_size
        self.dataset_name = dataset_name
        if params_path is not None:
            # Don't download the pretrained models if we have a parameter path
            pretrained = False
        else:
            pretrained = True
        self.bert, self.vocab = gluonnlp.model.get_model(
            model,
            dataset_name=self.dataset_name,
            pretrained=pretrained,
            ctx=self.ctx,
            use_pooler=False,
            use_decoder=False,
            use_classifier=False)
        self.bert.cast(self.dtype)

        if params_path:
            logger.info('Loading params from %s', params_path)
            self.bert.load_parameters(params_path, ctx=ctx, ignore_extra=True)

        lower = 'uncased' in self.dataset_name

        self.tokenizer = BERTTokenizer(self.vocab, lower=lower)
        self.transform = BERTSentenceTransform(
            tokenizer=self.tokenizer,
            max_seq_length=self.max_seq_length,
            pair=False)
Example No. 10
def test_bert_dataset_transform():
    text_a = u'is this jacksonville ?'
    text_b = u'no it is not'
    label_cls = 0
    vocab_tokens = [
        'is', 'this', 'jack', '##son', '##ville', '?', 'no', 'it', 'is', 'not'
    ]

    bert_vocab = BERTVocab(count_tokens(vocab_tokens))
    tokenizer = BERTTokenizer(vocab=bert_vocab)

    # test BERTDatasetTransform for classification task
    bert_cls_dataset_t = BERTDatasetTransform(tokenizer,
                                              15,
                                              labels=[label_cls],
                                              pad=True,
                                              pair=True,
                                              label_dtype='int32')
    token_ids, length, type_ids, label_ids = bert_cls_dataset_t(
        (text_a, text_b, label_cls))

    text_a_tokens = ['is', 'this', 'jack', '##son', '##ville', '?']
    text_b_tokens = ['no', 'it', 'is', 'not']
    text_a_ids = bert_vocab[text_a_tokens]
    text_b_ids = bert_vocab[text_b_tokens]

    cls_ids = bert_vocab[[bert_vocab.cls_token]]
    sep_ids = bert_vocab[[bert_vocab.sep_token]]
    pad_ids = bert_vocab[[bert_vocab.padding_token]]

    concated_ids = cls_ids + text_a_ids + sep_ids + text_b_ids + sep_ids + pad_ids
    valid_token_ids = np.array([pad_ids[0]] * 15, dtype=np.int32)
    for i, x in enumerate(concated_ids):
        valid_token_ids[i] = x
    valid_type_ids = np.zeros((15, ), dtype=np.int32)
    start = len(text_a_tokens) + 2
    end = len(text_a_tokens) + 2 + len(text_b_tokens) + 1
    valid_type_ids[start:end] = 1

    assert all(token_ids == valid_token_ids)
    assert length == len(vocab_tokens) + 3
    assert all(type_ids == valid_type_ids)
    assert all(label_ids == np.array([label_cls], dtype=np.int32))

    # test BERTDatasetTransform for regression task
    label_reg = 0.2
    bert_reg_dataset_t = BERTDatasetTransform(tokenizer,
                                              15,
                                              pad=True,
                                              pair=True,
                                              label_dtype='float32')
    token_ids, length, type_ids, label_reg_val = bert_reg_dataset_t(
        (text_a, text_b, label_reg))
    assert all(token_ids == valid_token_ids)
    assert length == len(vocab_tokens) + 3
    assert all(type_ids == valid_type_ids)
    assert all(label_reg_val == np.array([label_reg], dtype=np.float32))
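For reference, the layout the assertions above check, written out for the padded pair input at max length 15 (a sketch derived directly from the test):

# tokens   : [CLS] is this jack ##son ##ville ?  [SEP] no it is not [SEP] [PAD] [PAD]
# type ids :   0    0   0    0    0      0    0    0   1  1  1   1    1     0     0
# length   : 13  (6 text_a pieces + 4 text_b pieces + [CLS] + two [SEP]s)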
Example No. 11
def summarize(sentences, transformer, src_vocab, tgt_vocab):
    tokenzier = BERTTokenizer(src_vocab)
    sentences = tokenzier(sentences)
    sent_idx = src_vocab.to_indices(sentences)
    sent_idx = nd.array([sent_idx])
    Y_h = _summarize(transformer, sent_idx, tgt_vocab)
    Y_h = Y_h[0].asnumpy().tolist()
    Y_h = list(map(int, Y_h))
    predict = tgt_vocab.to_tokens(Y_h)
    return predict
Example No. 12
def load_dataset_bert(json_file,
                      voc_size,
                      json_text_key="text",
                      json_sp_key="sp_vec",
                      max_len=64,
                      ctx=mx.cpu()):
    indices = []
    values = []
    indptrs = [0]
    cumulative = 0
    total_num_words = 0
    ndocs = 0
    bert_model = 'bert_12_768_12'
    dname = 'book_corpus_wiki_en_uncased'
    bert_base, vocab = nlp.model.get_model(bert_model,
                                           dataset_name=dname,
                                           pretrained=True,
                                           ctx=ctx,
                                           use_pooler=True,
                                           use_decoder=False,
                                           use_classifier=False)
    tokenizer = BERTTokenizer(vocab)
    transform = BERTSentenceTransform(tokenizer, max_len, pair=False)
    x_ids = []
    x_val_lens = []
    x_segs = []
    with io.open(json_file, 'r', encoding='utf-8') as fp:
        for line in fp:
            if json_text_key:
                js = json.loads(line)
                line = js[json_text_key]
            if len(line.split(' ')) > 4:
                ids, lens, segs = transform(
                    (line, ))  # create BERT-ready inputs
                x_ids.append(ids)
                x_val_lens.append(lens)
                x_segs.append(segs)
            ## Now, get the sparse vector
            ndocs += 1
            sp_vec_els = js[json_sp_key]
            n_pairs, inds, vs = get_single_vec(sp_vec_els)
            cumulative += n_pairs
            total_num_words += sum(vs)
            indptrs.append(cumulative)
            values.extend(vs)
            indices.extend(inds)
    csr_mat = mx.nd.sparse.csr_matrix((values, indices, indptrs),
                                      shape=(ndocs, voc_size))
    data_train = gluon.data.ArrayDataset(
        mx.nd.array(x_ids, dtype='int32'),
        mx.nd.array(x_val_lens, dtype='int32'),
        mx.nd.array(x_segs, dtype='int32'), csr_mat.tostype('default'))
    return data_train, bert_base, vocab, csr_mat
Example No. 13
def get_bert_datasets(class_labels,
                      vectorizer,
                      train_ds,
                      dev_ds,
                      batch_size,
                      max_len,
                      aux_ds=None,
                      bert_model_name='bert_12_768_12',
                      bert_dataset='book_corpus_wiki_en_uncased',
                      pad=False,
                      use_bert_vocab=False,
                      label_alias=None,
                      num_classes=None,
                      ctx=mx.cpu()):
    if class_labels is None and num_classes is None:
        raise Exception("Must provide class_labels or num_classes")
    bert, bert_vocabulary = get_model(name=bert_model_name,
                                      dataset_name=bert_dataset,
                                      pretrained=True,
                                      ctx=ctx,
                                      use_pooler=True,
                                      use_decoder=False,
                                      use_classifier=False)
    do_lower_case = 'uncased' in bert_dataset
    bert_tokenizer = BERTTokenizer(bert_vocabulary, lower=do_lower_case)
    trans = BERTDatasetTransform(
        bert_tokenizer,
        max_len,
        class_labels=class_labels,
        label_alias=label_alias,
        pad=pad,
        pair=False,
        has_label=True,
        vectorizer=vectorizer,
        bert_vocab_size=len(bert_vocabulary) if use_bert_vocab else 0,
        num_classes=num_classes)
    train_data, num_train_examples = preprocess_seq_data(trans,
                                                         class_labels,
                                                         train_ds,
                                                         batch_size,
                                                         max_len,
                                                         train_mode=True,
                                                         pad=pad,
                                                         aux_dataset=aux_ds)
    dev_data, _ = preprocess_seq_data(trans,
                                      class_labels,
                                      dev_ds,
                                      batch_size,
                                      max_len,
                                      train_mode=False,
                                      pad=pad)
    return train_data, dev_data, num_train_examples, bert, bert_vocabulary
Example No. 14
def _load_dataset_bert(line_gen, voc_size, max_len=64, ctx=mx.cpu()):
    indices = []
    values = []
    indptrs = [0]
    cumulative = 0
    total_num_words = 0
    ndocs = 0
    bert_model = 'bert_12_768_12'
    dname = 'book_corpus_wiki_en_uncased'
    ## This is really only needed here to get the vocab
    ## GluonNLP API doesn't enable that
    bert_base, vocab = nlp.model.get_model(bert_model,
                                           dataset_name=dname,
                                           pretrained=True,
                                           ctx=ctx,
                                           use_pooler=True,
                                           use_decoder=False,
                                           use_classifier=False)
    tokenizer = BERTTokenizer(vocab)
    transform = BERTSentenceTransform(tokenizer, max_len, pair=False)
    x_ids = []
    x_val_lens = []
    x_segs = []
    for t in line_gen:
        if isinstance(t, tuple):
            line = t[0]
            sp_vec_els = t[1]
        else:
            line = t
            sp_vec_els = None
        ids, lens, segs = transform((line, ))  # create BERT-ready inputs
        x_ids.append(ids)
        x_val_lens.append(lens)
        x_segs.append(segs)
        ## Now, get the sparse vector
        ndocs += 1
        if sp_vec_els:
            pairs, inds, vs = get_single_vec(sp_vec_els)
            cumulative += len(pairs)
            total_num_words += sum(vs)
            indptrs.append(cumulative)
            values.extend(vs)
            indices.extend(inds)
    if len(indices) > 0:
        csr_mat = mx.nd.sparse.csr_matrix(
            (values, indices, indptrs),
            shape=(ndocs, voc_size)).tostype('default')
    else:
        csr_mat = None
    return x_ids, x_val_lens, x_segs, bert_base, vocab, csr_mat
Example No. 15
 def __init__(self,
              model,
              bert_vocab,
              max_length,
              bow_vocab=None,
              ctx=mx.cpu()):
     super().__init__(ctx)
     self.model = model
     self.bert_base = model.bert
     self.tokenizer = BERTTokenizer(bert_vocab)
     self.transform = BERTSentenceTransform(self.tokenizer,
                                            max_length,
                                            pair=False)
     self.bow_vocab = bow_vocab
Example No. 16
def get_dual_bert_datasets(class_labels,
                           vectorizer,
                           train_ds1,
                           train_ds2,
                           model_name,
                           dataset,
                           batch_size,
                           dev_bs,
                           max_len1,
                           max_len2,
                           pad,
                           use_bert_vocab=False,
                           shuffle=True,
                           ctx=mx.cpu()):
    bert, bert_vocabulary = get_model(
        name=model_name,
        dataset_name=dataset,
        pretrained=True,
        ctx=ctx,
        use_pooler=True,
        use_decoder=False,
        use_classifier=False)
    do_lower_case = 'uncased' in dataset    
    bert_tokenizer = BERTTokenizer(bert_vocabulary, lower=do_lower_case)

    # transformation for data train and dev
    trans1 = BERTDatasetTransform(bert_tokenizer, max_len1,
                                  class_labels=class_labels,
                                  label_alias=None,
                                  pad=pad, pair=False,
                                  has_label=True,
                                  vectorizer=vectorizer,
                                  bert_vocab_size=len(bert_vocabulary) if use_bert_vocab else 0)

    trans2 = BERTDatasetTransform(bert_tokenizer, max_len2,
                                  class_labels=class_labels,
                                  label_alias=None,
                                  pad=pad, pair=False,
                                  has_label=True,
                                  vectorizer=vectorizer,
                                  bert_vocab_size=len(bert_vocabulary) if use_bert_vocab else 0)
    
    #train_data, num_train_examples = preprocess_data_metriclearn(
    #   trans, class_labels, train_ds1, train_ds2, batch_size, max_len, pad)
    batch_size = len(train_ds2)
    a_train_data, num_train_examples, b_train_data = preprocess_data_metriclearn_separate(
        trans1, trans2, class_labels, train_ds1, train_ds2, batch_size, shuffle=shuffle)
    return a_train_data, num_train_examples, bert, b_train_data, bert_vocabulary
Example No. 17
 def __init__(self,
              model,
              bert_vocab,
              max_length,
              bow_vocab=None,
              pre_vectorizer=None,
              ctx=mx.cpu()):
     super().__init__(ctx)
     self.model = model
     self.bert_base = model.bert
     self.tokenizer = BERTTokenizer(bert_vocab)
     self.transform = BERTSentenceTransform(self.tokenizer,
                                            max_length,
                                            pair=False)
     self.bow_vocab = bow_vocab
     self.vectorizer = pre_vectorizer or TMNTVectorizer(
         initial_vocabulary=bow_vocab)
Example No. 18
def word_piece_tokenizer(sentences):
    ctx = ghp.ctx
    model = 'bert_12_768_12'
    dataset_name = 'book_corpus_wiki_en_uncased'
    max_seq_length = ghp.max_seq_len
    batch_size = 256
    _, vocab = gluonnlp.model.get_model(model,
                                        dataset_name=dataset_name,
                                        pretrained=True,
                                        ctx=ctx,
                                        use_pooler=False,
                                        use_decoder=False,
                                        use_classifier=False)
    tokenizer = BERTTokenizer(vocab)

    transform = BERTSentenceTransform(tokenizer=tokenizer,
                                      max_seq_length=max_seq_length,
                                      pair=False)
    dataset = BertEmbeddingDataset(sentences, transform)
    data_loader = DataLoader(dataset=dataset,
                             batch_size=batch_size,
                             shuffle=False)
    batches = []
    for token_ids, _, _ in data_loader:
        token_ids = token_ids.as_in_context(ctx)

        for token_id in token_ids.asnumpy():
            batches.append(token_id)

    cut_results = []
    for token_ids in batches:
        tokens = []
        for token_id in token_ids:
            if token_id == 1:  # 1 is [PAD] in the BERT vocab: the rest is padding
                break
            if token_id in (2, 3):  # 2 and 3 are [CLS] and [SEP]: skip the special tokens
                continue
            token = vocab.idx_to_token[token_id]
            if token.startswith('##'):
                # merge a word-piece continuation back onto the previous token
                token = token[2:]
                tokens[-1] += token
            else:
                tokens.append(token)
        cut_results.append(tokens)
    return cut_results
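A hedged usage sketch, assuming the global ghp object provides a valid MXNet context and max_seq_len:

sentences = ['is this jacksonville ?']
words = word_piece_tokenizer(sentences)
# each entry of words is the corresponding sentence rebuilt as whole words:
# '##' continuation pieces are merged onto the previous piece, and the
# [PAD]/[CLS]/[SEP] positions are dropped before the merge
print(words[0])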
Example No. 19
    def build_model(self, args, model_args, ctx, dataset=None, vocab=None):
        dataset = model_args.model_name
        if model_args.model_type == 'bert':
            model_name = 'bert_12_768_12'
        elif model_args.model_type == 'bertl':
            model_name = 'bert_24_1024_16'
        elif model_args.model_type == 'roberta':
            model_name = 'roberta_12_768_12'
        elif model_args.model_type == 'robertal':
            model_name = 'roberta_24_1024_16'
        else:
            raise NotImplementedError
        self.is_roberta = model_args.model_type.startswith('roberta')

        if args.model_params is None:
            pretrained = True
        else:
            pretrained = False
        bert, vocabulary = nlp.model.get_model(
            name=model_name,
            dataset_name=dataset,
            pretrained=pretrained,
            ctx=ctx,
            use_pooler=False if self.is_roberta else True,
            use_decoder=False,
            use_classifier=False)
        if args.model_params:
            bert.load_parameters(args.model_params, ctx=ctx, cast_dtype=True, ignore_extra=True)
        if args.fix_bert_weights:
            bert.collect_params('.*weight|.*bias').setattr('grad_req', 'null')

        if vocab:
            vocabulary = vocab
        do_lower_case = 'uncased' in dataset
        task_name = args.task_name
        num_classes = self.task.num_classes()
        if self.is_roberta:
            model = RoBERTaClassifier(bert, dropout=0.0, num_classes=num_classes)
            self.tokenizer = nlp.data.GPT2BPETokenizer()
        else:
            model = BERTClassifier(bert, num_classes=num_classes, dropout=model_args.dropout)
            self.tokenizer = BERTTokenizer(vocabulary, lower=do_lower_case)

        return model, vocabulary
Example No. 20
 def __init__(self,
              param_file=None,
              config_file=None,
              vocab_file=None,
              model_dir=None,
              ctx=mx.cpu()):
     super().__init__(ctx)
     if model_dir is not None:
         param_file = os.path.join(model_dir, 'model.params')
         vocab_file = os.path.join(model_dir, 'vocab.json')
         config_file = os.path.join(model_dir, 'model.config')
     with open(config_file) as f:
         config = json.loads(f.read())
     with open(vocab_file) as f:
         voc_js = f.read()
     self.bow_vocab = nlp.Vocab.from_json(voc_js)
     self.ctx = ctx
     self.bert_base, self.vocab = nlp.model.get_model(
         'bert_12_768_12',
         dataset_name='book_corpus_wiki_en_uncased',
         pretrained=True,
         ctx=ctx,
         use_pooler=True,
         use_decoder=False,
         use_classifier=False)  #, output_attention=True)
     self.latent_dist = config['latent_distribution']['dist_type']
     self.n_latent = config['n_latent']
     self.kappa = config['latent_distribution']['kappa']
     self.pad_id = self.vocab[self.vocab.padding_token]
     self.max_sent_len = config['sent_size']
     self.model = BertBowVED(self.bert_base,
                             self.bow_vocab,
                             latent_distrib=self.latent_dist,
                             n_latent=self.n_latent,
                             kappa=self.kappa,
                             batch_size=1)
     self.tokenizer = BERTTokenizer(self.vocab)
     self.transform = BERTSentenceTransform(self.tokenizer,
                                            self.max_sent_len,
                                            pair=False)
     self.model.load_parameters(str(param_file),
                                allow_missing=False,
                                ignore_extra=True)
Example No. 21
def main():
    """Main function."""
    time_start = time.time()

    # random seed
    random.seed(args.random_seed)

    # create output dir
    output_dir = os.path.expanduser(args.output_dir)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # vocabulary
    logging.info('loading vocab file from dataset: %s', args.vocab)
    vocab = nlp.data.utils._load_pretrained_vocab(args.vocab,
                                                  root=output_dir,
                                                  cls=nlp.vocab.BERTVocab)
    tokenizer = BERTTokenizer(vocab=vocab, lower='uncased' in args.vocab)

    # count the number of input files
    input_files = []
    for input_pattern in args.input_file.split(','):
        input_files.extend(glob.glob(os.path.expanduser(input_pattern)))
    logging.info('*** Reading from %d input files ***', len(input_files))
    for input_file in input_files:
        logging.info('\t%s', input_file)
    num_outputs = min(args.num_outputs, len(input_files))

    create_training_instances(input_files,
                              tokenizer,
                              args.max_seq_length,
                              args.short_seq_prob,
                              args.masked_lm_prob,
                              args.max_predictions_per_seq,
                              vocab,
                              args.dupe_factor,
                              args.num_workers,
                              num_outputs=num_outputs,
                              output_dir=output_dir)

    time_end = time.time()
    logging.info('Time cost=%.1f', time_end - time_start)
Example No. 22
def main():
    """Main function."""
    time_start = time.time()

    # random seed
    random.seed(args.random_seed)

    # create output dir
    output_dir = os.path.expanduser(args.output_dir)
    nlp.utils.mkdir(output_dir)

    # vocabulary and tokenizer
    if args.sentencepiece:
        logging.info('loading vocab file from sentence piece model: %s',
                     args.sentencepiece)
        if args.dataset_name:
            warnings.warn(
                'Both --dataset_name and --sentencepiece are provided. '
                'The vocabulary will be loaded based on --sentencepiece.')
        vocab = nlp.vocab.BERTVocab.from_sentencepiece(args.sentencepiece)
        tokenizer = nlp.data.BERTSPTokenizer(args.sentencepiece,
                                             vocab,
                                             num_best=args.sp_nbest,
                                             alpha=args.sp_alpha,
                                             lower=not args.cased)
    else:
        logging.info('loading vocab file from pre-defined dataset: %s',
                     args.dataset_name)
        vocab = nlp.data.utils._load_pretrained_vocab(args.dataset_name,
                                                      root=output_dir,
                                                      cls=nlp.vocab.BERTVocab)
        tokenizer = BERTTokenizer(vocab=vocab,
                                  lower='uncased' in args.dataset_name)

    # count the number of input files
    input_files = []
    for input_pattern in args.input_file.split(','):
        input_files.extend(glob.glob(os.path.expanduser(input_pattern)))
    for input_file in input_files:
        logging.info('\t%s', input_file)
    num_inputs = len(input_files)
    num_outputs = min(args.num_outputs, len(input_files))
    logging.info('*** Reading from %d input files ***', num_inputs)

    # calculate the number of splits
    file_splits = []
    split_size = (num_inputs + num_outputs - 1) // num_outputs
    for i in range(num_outputs):
        split_start = i * split_size
        split_end = min(num_inputs, (i + 1) * split_size)
        file_splits.append(input_files[split_start:split_end])

    # prepare workload
    count = 0
    process_args = []

    for i, file_split in enumerate(file_splits):
        output_file = os.path.join(output_dir,
                                   'part-{}.npz'.format(str(i).zfill(3)))
        count += len(file_split)
        process_args.append(
            (file_split, tokenizer, args.max_seq_length, args.short_seq_prob,
             args.masked_lm_prob, args.max_predictions_per_seq,
             args.whole_word_mask, vocab, args.dupe_factor, 1, None,
             output_file))

    # sanity check
    assert count == len(input_files)

    # dispatch to workers
    nworker = args.num_workers
    if nworker > 1:
        pool = Pool(nworker)
        pool.map(create_training_instances, process_args)
    else:
        for process_arg in process_args:
            create_training_instances(process_arg)

    time_end = time.time()
    logging.info('Time cost=%.1f', time_end - time_start)
Example No. 23
    logging.info('loading bert params from {0}'.format(pretrained_bert_parameters))
    model.bert.load_parameters(pretrained_bert_parameters, ctx=ctx,
                               ignore_extra=True)
if model_parameters:
    logging.info('loading model params from {0}'.format(model_parameters))
    model.load_parameters(model_parameters, ctx=ctx)
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

logging.info(model)
model.hybridize(static_alloc=True)
loss_function.hybridize(static_alloc=True)

# data processing
do_lower_case = 'uncased' in dataset
bert_tokenizer = BERTTokenizer(vocabulary, lower=do_lower_case)


def preprocess_data(tokenizer, task, batch_size, dev_batch_size, max_len):
    """Data preparation function."""
    # transformation
    trans = BERTDatasetTransform(
        tokenizer,
        max_len,
        labels=task.get_labels(),
        pad=False,
        pair=task.is_pair,
        label_dtype='float32' if not task.get_labels() else 'int32')

    data_train = task('train').transform(trans, lazy=False)
    data_train_len = data_train.transform(
Example No. 24
np.random.seed(123)
mx.random.seed(123)

dropout_prob = 0.1
ctx = mx.gpu(args.id)
bert_model, bert_vocab = nlp.model.get_model(
    name='bert_12_768_12',
    dataset_name='book_corpus_wiki_en_uncased',
    pretrained=True,
    ctx=ctx,
    use_pooler=True,
    use_decoder=False,
    use_classifier=False,
    dropout=dropout_prob,
    embed_dropout=dropout_prob)
tokenizer = BERTTokenizer(bert_vocab, lower=True)

abstract_emb = np.zeros((max_embs, 768), dtype=np.float32)

paper_map = []
fp = open(args.file, 'r')
start = time.time()
for i, line in enumerate(fp):
    paper_id, abstract = line.split('\t')
    paper_id = int(paper_id)
    tokens = tokenizer(abstract)
    if len(tokens) > 512:
        print('paper {} has strings with {} tokens'.format(
            paper_id, len(tokens)))
        tokens = tokens[0:512]
Example No. 25
class BertEmbedding:
    """
    Encoding from BERT model.

    Parameters
    ----------
    ctx : Context
        device context on which BertEmbedding is run.
    dtype : str
        data type to use for the model.
    model : str, default bert_12_768_12.
        pre-trained BERT model
    dataset_name : str, default book_corpus_wiki_en_uncased.
        pre-trained model dataset
    params_path: str, default None
        path to a parameters file to load instead of the pretrained model.
    max_seq_length : int, default 25
        max length of each sequence
    batch_size : int, default 256
        batch size
    sentencepiece : str, default None
        Path to the sentencepiece .model file for both tokenization and vocab
    root : str, default '$MXNET_HOME/models' with MXNET_HOME defaults to '~/.mxnet'
        Location for keeping the model parameters.
    """
    def __init__(self,
                 ctx=mx.cpu(),
                 dtype='float32',
                 model='bert_12_768_12',
                 dataset_name='book_corpus_wiki_en_uncased',
                 params_path=None,
                 max_seq_length=25,
                 batch_size=256,
                 sentencepiece=None,
                 root=os.path.join(get_home_dir(), 'models')):
        self.ctx = ctx
        self.dtype = dtype
        self.max_seq_length = max_seq_length
        self.batch_size = batch_size
        self.dataset_name = dataset_name

        # use sentencepiece vocab and a checkpoint
        # we need to set dataset_name to None, otherwise it uses the downloaded vocab
        if params_path and sentencepiece:
            dataset_name = None
        else:
            dataset_name = self.dataset_name
        if sentencepiece:
            vocab = gluonnlp.vocab.BERTVocab.from_sentencepiece(sentencepiece)
        else:
            vocab = None

        self.bert, self.vocab = gluonnlp.model.get_model(
            model,
            dataset_name=dataset_name,
            pretrained=params_path is None,
            ctx=self.ctx,
            use_pooler=False,
            use_decoder=False,
            use_classifier=False,
            root=root,
            vocab=vocab)

        self.bert.cast(self.dtype)
        if params_path:
            logger.info('Loading params from %s', params_path)
            self.bert.load_parameters(params_path,
                                      ctx=ctx,
                                      ignore_extra=True,
                                      cast_dtype=True)

        lower = 'uncased' in self.dataset_name
        if sentencepiece:
            self.tokenizer = BERTSPTokenizer(sentencepiece,
                                             self.vocab,
                                             lower=lower)
        else:
            self.tokenizer = BERTTokenizer(self.vocab, lower=lower)
        self.transform = BERTSentenceTransform(
            tokenizer=self.tokenizer,
            max_seq_length=self.max_seq_length,
            pair=False)

    def __call__(self, sentences, oov_way='avg'):
        return self.embedding(sentences, oov_way=oov_way)

    def embedding(self, sentences, oov_way='avg'):
        """
        Get tokens, tokens embedding

        Parameters
        ----------
        sentences : List[str]
            sentences for encoding.
        oov_way : str, default avg.
            use **avg**, **sum** or **last** to get token embedding for those out of
            vocabulary words

        Returns
        -------
        List[(List[str], List[ndarray])]
            List of tokens, and tokens embedding
        """
        data_iter = self.data_loader(sentences=sentences)
        batches = []
        for token_ids, valid_length, token_types in data_iter:
            token_ids = token_ids.as_in_context(self.ctx)
            valid_length = valid_length.as_in_context(self.ctx)
            token_types = token_types.as_in_context(self.ctx)
            sequence_outputs = self.bert(token_ids, token_types,
                                         valid_length.astype(self.dtype))
            for token_id, sequence_output in zip(token_ids.asnumpy(),
                                                 sequence_outputs.asnumpy()):
                batches.append((token_id, sequence_output))
        return self.oov(batches, oov_way)

    def data_loader(self, sentences, shuffle=False):
        """Load, tokenize and prepare the input sentences."""
        dataset = BertEmbeddingDataset(sentences, self.transform)
        return DataLoader(dataset=dataset,
                          batch_size=self.batch_size,
                          shuffle=shuffle)

    def oov(self, batches, oov_way='avg'):
        """
        How to handle oov. Also filter out [CLS], [SEP] tokens.

        Parameters
        ----------
        batches : List[(tokens_id, sequence_outputs)].
            batch   token_ids shape is (max_seq_length,),
                    sequence_outputs shape is (max_seq_length, dim)
        oov_way : str
            use **avg**, **sum** or **last** to get token embedding for those out of
            vocabulary words

        Returns
        -------
        List[(List[str], List[ndarray])]
            List of tokens, and tokens embedding
        """
        sentences = []
        padding_idx, cls_idx, sep_idx = None, None, None
        if self.vocab.padding_token:
            padding_idx = self.vocab[self.vocab.padding_token]
        if self.vocab.cls_token:
            cls_idx = self.vocab[self.vocab.cls_token]
        if self.vocab.sep_token:
            sep_idx = self.vocab[self.vocab.sep_token]
        for token_ids, sequence_outputs in batches:
            tokens = []
            tensors = []
            oov_len = 1
            for token_id, sequence_output in zip(token_ids, sequence_outputs):
                # [PAD] token, sequence is finished.
                if padding_idx and token_id == padding_idx:
                    break
                # [CLS], [SEP]
                if cls_idx and token_id == cls_idx:
                    continue
                if sep_idx and token_id == sep_idx:
                    continue
                token = self.vocab.idx_to_token[token_id]
                if not self.tokenizer.is_first_subword(token):
                    tokens.append(token)
                    if oov_way == 'last':
                        tensors[-1] = sequence_output
                    else:
                        tensors[-1] += sequence_output
                    if oov_way == 'avg':
                        oov_len += 1
                else:  # iv, avg last oov
                    if oov_len > 1:
                        tensors[-1] /= oov_len
                        oov_len = 1
                    tokens.append(token)
                    tensors.append(sequence_output)
            if oov_len > 1:  # if the whole sentence is one oov, handle this special case
                tensors[-1] /= oov_len
            sentences.append((tokens, tensors))
        return sentences
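A hedged sketch of how oov_way shapes the output of the class above: for a word that the tokenizer split into '##' continuation pieces, 'sum' adds each piece vector onto the first piece's vector, 'avg' additionally divides by the number of pieces, and 'last' keeps only the last piece's vector. The settings below are hypothetical:

embedder = BertEmbedding(max_seq_length=32)
tokens, vectors = embedder(['some input sentence'])[0]
# tokens  : the word pieces with [CLS]/[SEP]/[PAD] filtered out
# vectors : one (768,)-dim array per first sub-word for bert_12_768_12,
#           with continuation pieces folded in according to oov_way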
Example No. 26
def train(args):
    ctx = mx.cpu() if args.gpu is None else mx.gpu(args.gpu)
    dataset_name = 'book_corpus_wiki_en_cased' if args.cased else 'book_corpus_wiki_en_uncased'
    bert_model, bert_vocab = nlp.model.get_model(name=args.bert_model,
                                                 dataset_name=dataset_name,
                                                 pretrained=True,
                                                 ctx=ctx,
                                                 use_pooler=True,
                                                 use_decoder=False,
                                                 use_classifier=False,
                                                 dropout=args.dropout_prob,
                                                 embed_dropout=args.dropout_prob)
    tokenizer = BERTTokenizer(bert_vocab, lower=not args.cased)
    if args.dataset == 'atis':
        train_data = ATISDataset('train')
        dev_data = ATISDataset('dev')
        test_data = ATISDataset('test')
        intent_vocab = train_data.intent_vocab
        slot_vocab = train_data.slot_vocab
    elif args.dataset == 'snips':
        train_data = SNIPSDataset('train')
        dev_data = SNIPSDataset('dev')
        test_data = SNIPSDataset('test')
        intent_vocab = train_data.intent_vocab
        slot_vocab = train_data.slot_vocab
    else:
        raise NotImplementedError
    print('Dataset {}'.format(args.dataset))
    print('   #Train/Dev/Test = {}/{}/{}'.format(len(train_data), len(dev_data), len(test_data)))
    print('   #Intent         = {}'.format(len(intent_vocab)))
    print('   #Slot           = {}'.format(len(slot_vocab)))
    # Display An Example
    print('Display a sample')
    print_sample(test_data, 1)
    print('-' * 80)

    idsl_transform = IDSLSubwordTransform(subword_vocab=bert_vocab,
                                          subword_tokenizer=tokenizer,
                                          slot_vocab=slot_vocab,
                                          cased=args.cased)
    train_data_bert = train_data.transform(idsl_transform, lazy=False)
    dev_data_bert = dev_data.transform(idsl_transform, lazy=False)
    test_data_bert = test_data.transform(idsl_transform, lazy=False)
    # Construct the DataLoader
    batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(),    # Subword ID
                                          nlp.data.batchify.Pad(),    # Subword Mask
                                          nlp.data.batchify.Pad(),    # Beginning of subword
                                          nlp.data.batchify.Pad(),    # Tag IDs
                                          nlp.data.batchify.Stack(),  # Intent Label
                                          nlp.data.batchify.Stack())  # Valid Length
    train_batch_sampler = nlp.data.sampler.SortedBucketSampler(
        [len(ele) for ele in train_data_bert],
        batch_size=args.batch_size,
        mult=20,
        shuffle=True)
    train_loader = gluon.data.DataLoader(dataset=train_data_bert,
                                         num_workers=4,
                                         batch_sampler=train_batch_sampler,
                                         batchify_fn=batchify_fn)
    dev_loader = gluon.data.DataLoader(dataset=dev_data_bert,
                                       num_workers=4,
                                       batch_size=args.batch_size,
                                       batchify_fn=batchify_fn,
                                       shuffle=False)
    test_loader = gluon.data.DataLoader(dataset=test_data_bert,
                                        num_workers=4,
                                        batch_size=args.batch_size,
                                        batchify_fn=batchify_fn,
                                        shuffle=False)

    # Build the network and loss functions
    intent_pred_loss = gluon.loss.SoftmaxCELoss()
    if args.use_focal:
        slot_pred_loss = SoftmaxFocalLoss(batch_axis=[0, 1],
                                          alpha=args.focal_alpha, gamma=args.focal_gamma)
    else:
        slot_pred_loss = gluon.loss.SoftmaxCELoss(batch_axis=[0, 1])

    net = BERTForICSL(bert_model, num_intent_classes=len(intent_vocab),
                      num_slot_classes=len(slot_vocab), dropout_prob=args.dropout_prob)
    net.slot_tagger.initialize(ctx=ctx, init=mx.init.Normal(0.02))
    net.intent_classifier.initialize(ctx=ctx, init=mx.init.Normal(0.02))
    net.hybridize()
    intent_pred_loss.hybridize()
    slot_pred_loss.hybridize()

    # Build the trainer
    trainer = gluon.Trainer(net.collect_params(), args.optimizer,
                            {'learning_rate': args.learning_rate, 'wd': args.wd},
                            update_on_kvstore=False)

    params = [p for p in net.collect_params().values() if p.grad_req != 'null']
    step_num = 0
    num_train_steps = int(len(train_batch_sampler) * args.epochs)
    num_warmup_steps = int(num_train_steps * args.warmup_ratio)
    best_dev_sf1 = -1
    for epoch_id in range(args.epochs):
        avg_train_intent_loss = 0.0
        avg_train_slot_loss = 0.0
        nsample = 0
        nslot = 0
        ntoken = 0
        train_epoch_start = time.time()
        for token_ids, mask, selected, slot_ids, intent_label, valid_length in train_loader:
            ntoken += valid_length.sum().asscalar()
            token_ids = mx.nd.array(token_ids, ctx=ctx).astype(np.int32)
            mask = mx.nd.array(mask, ctx=ctx).astype(np.float32)
            slot_ids = mx.nd.array(slot_ids, ctx=ctx).astype(np.int32)
            intent_label = mx.nd.array(intent_label, ctx=ctx).astype(np.int32)
            valid_length = mx.nd.array(valid_length, ctx=ctx).astype(np.float32)
            batch_nslots = mask.sum().asscalar()
            batch_nsample = token_ids.shape[0]

            # Set learning rate warm-up
            step_num += 1
            if step_num < num_warmup_steps:
                new_lr = args.learning_rate * step_num / num_warmup_steps
            else:
                offset = ((step_num - num_warmup_steps) * args.learning_rate /
                          (num_train_steps - num_warmup_steps))
                new_lr = args.learning_rate - offset
            trainer.set_learning_rate(new_lr)

            with mx.autograd.record():
                intent_scores, slot_scores = net(token_ids, valid_length)
                intent_loss = intent_pred_loss(intent_scores, intent_label)
                slot_loss = slot_pred_loss(slot_scores, slot_ids, mask.expand_dims(axis=-1))
                intent_loss = intent_loss.mean()
                slot_loss = slot_loss.sum() / batch_nslots
                loss = intent_loss + args.slot_loss_mult * slot_loss
                loss.backward()
            trainer.update(1.0)
            avg_train_intent_loss += intent_loss.asscalar() * batch_nsample
            avg_train_slot_loss += slot_loss.asscalar() * batch_nslots
            nsample += batch_nsample
            nslot += batch_nslots
        train_epoch_end = time.time()
        avg_train_intent_loss /= nsample
        avg_train_slot_loss /= nslot
        print('[Epoch {}] train intent/slot = {:.3f}/{:.3f}, #token per second={:.0f}'.format(
            epoch_id, avg_train_intent_loss, avg_train_slot_loss,
            ntoken / (train_epoch_end - train_epoch_start)))
        avg_dev_intent_loss, avg_dev_slot_loss, dev_intent_acc,\
        dev_slot_f1, dev_pred_slots, dev_gt_slots\
            = evaluation(ctx, dev_loader, net, intent_pred_loss, slot_pred_loss, slot_vocab)
        print('[Epoch {}]    dev intent/slot = {:.3f}/{:.3f}, slot f1 = {:.2f}, intent acc = {:.2f}'.format(
            epoch_id, avg_dev_intent_loss, avg_dev_slot_loss, dev_slot_f1 * 100, dev_intent_acc * 100))
        if dev_slot_f1 > best_dev_sf1:
            best_dev_sf1 = dev_slot_f1
            avg_test_intent_loss, avg_test_slot_loss, test_intent_acc, \
            test_slot_f1, test_pred_slots, test_gt_slots \
                = evaluation(ctx, test_loader, net, intent_pred_loss, slot_pred_loss, slot_vocab)
            print('[Epoch {}]    test intent/slot = {:.3f}/{:.3f}, slot f1 = {:.2f}, intent acc = {:.2f}'.format(
                epoch_id, avg_test_intent_loss, avg_test_slot_loss, test_slot_f1 * 100, test_intent_acc * 100))
            if not os.path.exists(args.save_dir):
                os.makedirs(args.save_dir)
            net.save_parameters(os.path.join(args.save_dir, 'best_valid.params'))
    print('Evaluate the best model:')
    net.load_parameters(os.path.join(args.save_dir, 'best_valid.params'))
    avg_test_intent_loss, avg_test_slot_loss, test_intent_acc, \
    test_slot_f1, test_pred_slots, test_gt_slots \
        = evaluation(ctx, test_loader, net, intent_pred_loss, slot_pred_loss, slot_vocab)
    print('Best validation model --> Slot F1={:.2f}, Intent acc={:.2f}'.format(test_slot_f1 * 100, test_intent_acc * 100))
    with open(os.path.join(args.save_dir, 'test_error.txt'), 'w') as of:
        of.write('{} {}\n'.format(test_slot_f1, test_intent_acc ))
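The learning-rate warm-up in the training loop above, worked through with hypothetical numbers:

learning_rate = 3e-5
num_train_steps = 1000
num_warmup_steps = 100          # warmup_ratio = 0.1
# step 50   (warm-up): lr = 3e-5 * 50 / 100                            = 1.5e-5
# step 550  (decay)  : lr = 3e-5 - (550 - 100) * 3e-5 / (1000 - 100)   = 1.5e-5
# step 1000 (end)    : lr = 3e-5 - (1000 - 100) * 3e-5 / (1000 - 100)  = 0.0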
Example No. 27
def translate(args):
    gpu_idx = args.gpu
    if not gpu_idx:
        ctx = mx.cpu()
    else:
        ctx = mx.gpu(gpu_idx - 1)
    en_bert, en_vocab = gluonnlp.model.get_model(
        args.bert_model,
        dataset_name=args.en_bert_dataset,
        pretrained=True,
        ctx=ctx,
        use_pooler=False,
        use_decoder=False,
        use_classifier=False)
    _, ch_vocab = gluonnlp.model.get_model(args.bert_model,
                                           dataset_name=args.ch_bert_dataset,
                                           pretrained=True,
                                           ctx=ctx,
                                           use_pooler=False,
                                           use_decoder=False,
                                           use_classifier=False)

    mt_model = MTModel_Hybird(en_vocab=en_vocab,
                              ch_vocab=ch_vocab,
                              embedding_dim=args.mt_emb_dim,
                              model_dim=args.mt_model_dim,
                              head_num=args.mt_head_num,
                              layer_num=args.mt_layer_num,
                              ffn_dim=args.mt_ffn_dim,
                              dropout=args.mt_dropout,
                              att_dropout=args.mt_att_dropout,
                              ffn_dropout=args.mt_ffn_dropout,
                              ctx=ctx)

    en_bert.load_parameters(args.en_bert_model_params_path, ctx=ctx)
    mt_model.load_parameters(args.mt_model_params_path, ctx=ctx)

    en_bert_tokenzier = BERTTokenizer(en_vocab)
    ch_bert_tokenzier = BERTTokenizer(ch_vocab)

    while True:
        trans = input("input:")

        trans = en_bert_tokenzier(trans)
        trans = [en_vocab.cls_token] + \
            trans + [en_vocab.sep_token]

        trans_valid_len = len(trans)

        if args.max_en_len and len(trans) > args.max_en_len:
            trans = trans[0:args.max_en_len]

        aim = [BOS]

        trans = en_vocab[trans]
        aim = ch_vocab[aim]

        aim = nd.array([aim], ctx=ctx)

        trans = nd.array([trans], ctx=ctx)
        trans_valid_len = nd.array([trans_valid_len], ctx=ctx)
        trans_token_types = nd.zeros_like(trans)

        batch_size = 1
        beam_size = 6

        en_bert_outputs = en_bert(trans, trans_token_types, trans_valid_len)
        mt_outputs = mt_model(en_bert_outputs, trans, aim)

        en_bert_outputs = nd.broadcast_axes(en_bert_outputs,
                                            axis=0,
                                            size=beam_size)
        trans = nd.broadcast_axes(trans, axis=0, size=beam_size)
        targets = None
        for n in range(0, args.max_ch_len):
            aim, targets = beam_search(mt_outputs[:, n, :],
                                       targets=targets,
                                       max_seq_len=args.max_ch_len,
                                       ctx=ctx,
                                       beam_width=beam_size)
            mt_outputs = mt_model(en_bert_outputs, trans, aim)

        predict = aim.asnumpy().tolist()
        predict_strs = []
        for pred in predict:
            predict_token = [ch_vocab.idx_to_token[int(idx)] for idx in pred]
            predict_str = ""
            sub_token = []
            for token in predict_token:
                # if token in ["[CLS]", EOS, "[SEP]"]:
                #     continue
                if len(sub_token) == 0:
                    sub_token.append(token)
                elif token[:2] != "##" and len(sub_token) != 0:
                    predict_str += "".join(sub_token) + " "
                    sub_token = []
                    sub_token.append(token)
                else:
                    if token[:2] == "##":
                        token = token.replace("##", "")
                    sub_token.append(token)
                if token == EOS:
                    if len(sub_token) != 0:
                        predict_str += "".join(sub_token) + " "
                    break
            predict_strs.append(
                predict_str.replace("[SEP]", "").replace("[CLS]",
                                                         "").replace(EOS, ""))
        for predict_str in predict_strs:
            print(predict_str)
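A worked sketch of the sub-token merge loop above (the token list is illustrative):

predict_token = ['[CLS]', 'jack', '##son', '##ville', 'is', 'nice', '[SEP]']
# '##' continuations are glued onto the pending word; a completed word is flushed
# with a trailing space when the next non-'##' token arrives, so the loop builds
# predict_str == '[CLS] jacksonville is nice '
# the final replace() calls then strip '[CLS]' and '[SEP]', leaving ' jacksonville is nice '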
Example No. 28
def translate(args):
    gpu_idx = args.gpu
    if not gpu_idx:
        ctx = mx.cpu()
    else:
        ctx = mx.gpu(gpu_idx - 1)
    src_bert, src_vocab = gluonnlp.model.get_model(args.bert_model,
                                                   dataset_name=args.src_bert_dataset,
                                                   pretrained=True,
                                                   ctx=ctx,
                                                   use_pooler=False,
                                                   use_decoder=False,
                                                   use_classifier=False)
    _, tgt_vocab = gluonnlp.model.get_model(args.bert_model,
                                            dataset_name=args.tgt_bert_dataset,
                                            pretrained=True,
                                            ctx=ctx,
                                            use_pooler=False,
                                            use_decoder=False,
                                            use_classifier=False)

    mt_model = MTModel_Hybird(src_vocab=src_vocab,
                              tgt_vocab=tgt_vocab,
                              embedding_dim=args.mt_emb_dim,
                              model_dim=args.mt_model_dim,
                              head_num=args.mt_head_num,
                              layer_num=args.mt_layer_num,
                              ffn_dim=args.mt_ffn_dim,
                              dropout=args.mt_dropout,
                              att_dropout=args.mt_att_dropout,
                              ffn_dropout=args.mt_ffn_dropout,
                              ctx=ctx)

    src_bert.load_parameters(args.bert_model_params_path, ctx=ctx)
    mt_model.load_parameters(args.mt_model_params_path, ctx=ctx)

    src_bert_tokenzier = BERTTokenizer(src_vocab)
    tgt_bert_tokenzier = BERTTokenizer(tgt_vocab)

    while True:
        src = input("input:")

        src = src_bert_tokenzier(src)
        src = [src_vocab.cls_token] + \
            src + [src_vocab.sep_token]

        src_valid_len = len(src)

        if args.max_src_len and len(src) > args.max_src_len:
            src = src[0:args.max_src_len]

        tgt = [BOS]

        src = src_vocab[src]
        tgt = tgt_vocab[tgt]

        tgt = nd.array([tgt], ctx=ctx)

        src = nd.array([src], ctx=ctx)
        src_valid_len = nd.array([src_valid_len], ctx=ctx)
        src_token_types = nd.zeros_like(src)

        beam_size = 6

        src_bert_outputs = src_bert(src, src_token_types, src_valid_len)
        mt_outputs = mt_model(src_bert_outputs, src, tgt)

        src_bert_outputs = nd.broadcast_axes(
            src_bert_outputs, axis=0, size=beam_size)
        src = nd.broadcast_axes(src, axis=0, size=beam_size)
        targets = None
        for n in range(0, args.max_tgt_len):
            tgt, targets = beam_search(
                mt_outputs[:, n, :], targets=targets, max_seq_len=args.max_tgt_len, ctx=ctx, beam_width=beam_size)
            mt_outputs = mt_model(src_bert_outputs, src, tgt)

        predict = tgt.asnumpy().tolist()
        predict_strs = []
        for pred in predict:
            predict_token = [tgt_vocab.idx_to_token[int(idx)] for idx in pred]
            predict_str = ""
            sub_token = []
            for token in predict_token:
                # if token in ["[CLS]", EOS, "[SEP]"]:
                #     continue
                if len(sub_token) == 0:
                    sub_token.append(token)
                elif token[:2] != "##" and len(sub_token) != 0:
                    predict_str += "".join(sub_token) + " "
                    sub_token = []
                    sub_token.append(token)
                else:
                    if token[:2] == "##":
                        token = token.replace("##", "")
                    sub_token.append(token)
                if token == EOS:
                    if len(sub_token) != 0:
                        predict_str += "".join(sub_token) + " "
                    break
            predict_strs.append(predict_str.replace(
                "[SEP]", "").replace("[CLS]", "").replace(EOS, ""))
        for predict_str in predict_strs:
            print(predict_str)
Example No. 29

if __name__ == '__main__':
    # random seed
    seed = args.seed
    np.random.seed(seed)
    random.seed(seed)
    mx.random.seed(seed)

    ctx = [mx.cpu()] if args.gpus is None or args.gpus == '' else \
          [mx.gpu(int(x)) for x in args.gpus.split(',')]

    model, nsp_loss, mlm_loss, vocabulary = get_model(ctx)

    lower = 'uncased' in args.dataset_name
    tokenizer = BERTTokenizer(vocabulary, lower=lower)
    store = mx.kv.create(args.kvstore)

    if args.ckpt_dir:
        ckpt_dir = os.path.expanduser(args.ckpt_dir)
        if not os.path.exists(ckpt_dir):
            os.makedirs(ckpt_dir)

    if args.data:
        data_train = get_dataset(args.data, args.batch_size, len(ctx), True,
                                 store)
        train(data_train, model, nsp_loss, mlm_loss, len(tokenizer.vocab), ctx,
              store)
    if args.data_eval:
        data_eval = get_dataset(args.data_eval, args.batch_size_eval, len(ctx),
                                False, store)
Example No. 30
 def __init__(self, ch_vocab=None, max_seq_len=None, istrain=True):
     self.ch_vocab = ch_vocab
     self.max_seq_len = max_seq_len
     self.istrain = istrain
     self.tokenizer = BERTTokenizer(
         ch_vocab)  # Note: the BERT tokenizer is not actually used later on; results seemed better without it.