Example #1
    def __init__(self,
                 sentencepiece_model_file,
                 do_lower_case=True,
                 encoding="utf8",
                 unk_token="<unk>",
                 sep_token="[SEP]",
                 pad_token="[PAD]",
                 cls_token="[CLS]",
                 mask_token="[MASK]"):

        if not os.path.isfile(sentencepiece_model_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the "
                "vocabulary from a pretrained model please use "
                "`tokenizer = BigBirdTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
                .format(sentencepiece_model_file))
        self.encoding = encoding
        mod = try_import('sentencepiece')
        self.sp_model = mod.SentencePieceProcessor()
        if os.path.isfile(sentencepiece_model_file):
            self.sp_model.Load(sentencepiece_model_file)
        vocab_dict = {}
        for id in range(self.sp_model.get_piece_size()):
            vocab_dict[self.sp_model.id_to_piece(id)] = id
        self.vocab = Vocab.from_dict(vocab_dict, unk_token=unk_token)
        self.start_word_tokens = np.array([
            self.vocab._idx_to_token[i][0] == "▁"
            for i in range(0, len(self.vocab))
        ])
        self.unk_token = unk_token
        self.mask_id = vocab_dict[mask_token]
        self.unk_id = vocab_dict[unk_token]
        self.cls_id = vocab_dict[cls_token]
        self.sep_id = vocab_dict[sep_token]
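
The constructor above turns a SentencePiece model straight into a paddlenlp Vocab via Vocab.from_dict. A minimal sketch of that pattern, assuming paddlenlp is installed; the tiny token_to_idx mapping below is made up for illustration (a real one comes from the SentencePiece pieces):

from paddlenlp.data import Vocab

# Hypothetical token-to-id mapping standing in for the SentencePiece pieces.
token_to_idx = {"[PAD]": 0, "[UNK]": 1, "hello": 2, "world": 3}
vocab = Vocab.from_dict(token_to_idx, unk_token="[UNK]", pad_token="[PAD]")

print(len(vocab))                          # 4
print(vocab.to_indices(["hello", "oov"]))  # [2, 1]; out-of-vocabulary words map to [UNK]
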
Example #2
    def __init__(self,
                 sentencepiece_model_file,
                 do_lower_case=True,
                 encoding="utf8",
                 unk_token="<unk>",
                 sep_token="[SEP]",
                 pad_token="[PAD]",
                 cls_token="[CLS]",
                 mask_token="[MASK]",
                 **kwargs):

        if not os.path.isfile(sentencepiece_model_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the "
                "vocabulary from a pretrained model please use "
                "`tokenizer = BigBirdTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
                .format(sentencepiece_model_file))
        self.encoding = encoding
        mod = try_import('sentencepiece')
        self.sp_model = mod.SentencePieceProcessor()
        if os.path.isfile(sentencepiece_model_file):
            self.sp_model.Load(sentencepiece_model_file)
        vocab_dict = {}
        for id in range(self.sp_model.get_piece_size()):
            vocab_dict[self.sp_model.id_to_piece(id)] = id
        self.vocab = Vocab.from_dict(vocab_dict, unk_token=unk_token)
        self.start_word_tokens = np.array([
            self.vocab._idx_to_token[i][0] == "▁"
            for i in range(0, len(self.vocab))
        ])
        self.unk_token = unk_token
        self.mask_id = vocab_dict[mask_token]
        self.unk_id = vocab_dict[unk_token]
        self.cls_id = vocab_dict[cls_token]
        self.sep_id = vocab_dict[sep_token]
        self.pad_id = vocab_dict[pad_token] if pad_token in vocab_dict else 0

        unk_token = AddedToken(unk_token,
                               lstrip=False, rstrip=False) if isinstance(
                                   unk_token, str) else unk_token
        pad_token = AddedToken(pad_token,
                               lstrip=False, rstrip=False) if isinstance(
                                   pad_token, str) else pad_token
        cls_token = AddedToken(cls_token,
                               lstrip=False, rstrip=False) if isinstance(
                                   cls_token, str) else cls_token
        sep_token = AddedToken(sep_token,
                               lstrip=False, rstrip=False) if isinstance(
                                   sep_token, str) else sep_token

        # The mask token behaves like a normal word, i.e. it includes the space before it
        mask_token = AddedToken(mask_token,
                                lstrip=True, rstrip=False) if isinstance(
                                    mask_token, str) else mask_token

        self._build_special_tokens_map_extended(sep_token=sep_token,
                                                cls_token=cls_token,
                                                unk_token=unk_token,
                                                pad_token=pad_token,
                                                mask_token=mask_token)
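
In practice the tokenizer is rarely constructed by hand. A hedged usage sketch, assuming paddlenlp is installed and that "bigbird-base-uncased" is an available pretrained name (the SentencePiece file is downloaded on first use):

from paddlenlp.transformers import BigBirdTokenizer

tokenizer = BigBirdTokenizer.from_pretrained("bigbird-base-uncased")
encoded = tokenizer("BigBird handles long sequences.")
print(encoded["input_ids"])  # integer token ids produced with the vocab built above
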
Example #3
    def __init__(self,
                 embedding_name=EMBEDDING_NAME_LIST[0],
                 unknown_token=UNK_TOKEN,
                 unknown_token_vector=None,
                 extended_vocab_path=None,
                 trainable=True,
                 keep_extended_vocab_only=False):
        vector_path = osp.join(EMBEDDING_HOME, embedding_name + ".npz")
        if not osp.exists(vector_path):
            # Download the embedding file if it is not cached locally.
            url = EMBEDDING_URL_ROOT + "/" + embedding_name + ".tar.gz"
            get_path_from_url(url, EMBEDDING_HOME)

        logger.info("Loading token embedding...")
        vector_np = np.load(vector_path)
        self.embedding_dim = vector_np['embedding'].shape[1]
        self.unknown_token = unknown_token
        if unknown_token_vector is not None:
            unk_vector = np.array(unknown_token_vector).astype(
                paddle.get_default_dtype())
        else:
            unk_vector = np.random.normal(scale=0.02,
                                          size=self.embedding_dim).astype(
                                              paddle.get_default_dtype())
        pad_vector = np.array([0] * self.embedding_dim).astype(
            paddle.get_default_dtype())
        if extended_vocab_path is not None:
            embedding_table = self._extend_vocab(extended_vocab_path,
                                                 vector_np, pad_vector,
                                                 unk_vector,
                                                 keep_extended_vocab_only)
            trainable = True
        else:
            embedding_table = self._init_without_extend_vocab(
                vector_np, pad_vector, unk_vector)

        self.vocab = Vocab.from_dict(self._word_to_idx,
                                     unk_token=unknown_token,
                                     pad_token=PAD_TOKEN)
        self.num_embeddings = embedding_table.shape[0]
        # Initialize the underlying embedding layer with the loaded table.
        super(TokenEmbedding,
              self).__init__(self.num_embeddings,
                             self.embedding_dim,
                             padding_idx=self._word_to_idx[PAD_TOKEN])
        self.weight.set_value(embedding_table)
        self.set_trainable(trainable)
        logger.info("Finish loading embedding vector.")
        s = "Token Embedding info:\
             \nUnknown index: {}\
             \nUnknown token: {}\
             \nPadding index: {}\
             \nPadding token: {}\
             \nShape :{}".format(self._word_to_idx[self.unknown_token],
                                 self.unknown_token,
                                 self._word_to_idx[PAD_TOKEN], PAD_TOKEN,
                                 self.weight.shape)
        logger.info(s)
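
A hedged usage sketch for the layer above, assuming paddlenlp is installed; "w2v.baidu_encyclopedia.target.word-word.dim300" is the default entry of EMBEDDING_NAME_LIST and is downloaded on first use:

from paddlenlp.embeddings import TokenEmbedding

token_embedding = TokenEmbedding(
    embedding_name="w2v.baidu_encyclopedia.target.word-word.dim300")

# search() looks up the pretrained vectors for a list of words.
vectors = token_embedding.search(["中国", "旅行"])
print(vectors.shape)  # (2, 300)
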
Example #4
    if '[PAD]' not in vocab:
        vocab['[PAD]'] = len(vocab)
    # Loads dataset.
    train_ds, dev_ds = load_dataset("chnsenticorp", splits=["train", "dev"])

    # Constructs the network.
    model = BoWModel(vocab_size=len(vocab),
                     num_classes=len(train_ds.label_list),
                     vocab_path=vocab_path,
                     use_token_embedding=args.use_token_embedding)
    if args.use_token_embedding:
        vocab = model.embedder.vocab
        data.set_tokenizer(vocab)
        vocab = vocab.token_to_idx
    else:
        v = Vocab.from_dict(vocab, unk_token="[UNK]", pad_token="[PAD]")
        data.set_tokenizer(v)
    model = paddle.Model(model)

    # Reads data and generates mini-batches.
    trans_fn = partial(data.convert_example,
                       vocab=vocab,
                       unk_token_id=vocab['[UNK]'],
                       is_test=False)
    train_loader = create_dataloader(train_ds,
                                     trans_fn=trans_fn,
                                     batch_size=args.batch_size,
                                     mode='train',
                                     pad_token_id=vocab['[PAD]'])
    dev_loader = create_dataloader(dev_ds,
                                   trans_fn=trans_fn,
Example #5
    # Loads dataset.
    train_ds, dev_ds = load_dataset("chnsenticorp", splits=["train", "dev"])
    texts = []
    for data in train_ds:
        texts.append(data["text"])
    for data in dev_ds:
        texts.append(data["text"])

    # Reads stop words.
    # The stop words here are only an example; they should be
    # updated according to the corpus.
    stopwords = set(["的", "吗", "吧", "呀", "呜", "呢", "呗"])
    # Builds vocab.
    word2idx = build_vocab(
        texts, stopwords, min_freq=5, unk_token="[UNK]", pad_token="[PAD]")
    vocab = Vocab.from_dict(word2idx, unk_token="[UNK]", pad_token="[PAD]")
    # Saves vocab.
    vocab.to_json(args.vocab_path)

    # Constructs the network.
    network = args.network.lower()
    vocab_size = len(vocab)
    num_classes = len(train_ds.label_list)
    pad_token_id = vocab.to_indices('[PAD]')
    if network == 'bow':
        model = BoWModel(vocab_size, num_classes, padding_idx=pad_token_id)
    elif network == 'bigru':
        model = GRUModel(
            vocab_size,
            num_classes,
            direction='bidirect',