def __init__(self,
             sentencepiece_model_file,
             do_lower_case=True,
             encoding="utf8",
             unk_token="<unk>",
             sep_token="[SEP]",
             pad_token="[PAD]",
             cls_token="[CLS]",
             mask_token="[MASK]"):
    if not os.path.isfile(sentencepiece_model_file):
        raise ValueError(
            "Can't find a vocabulary file at path '{}'. To load the "
            "vocabulary from a pretrained model please use "
            "`tokenizer = BigBirdTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            .format(sentencepiece_model_file))
    self.encoding = encoding
    mod = try_import('sentencepiece')
    self.sp_model = mod.SentencePieceProcessor()
    if os.path.isfile(sentencepiece_model_file):
        self.sp_model.Load(sentencepiece_model_file)
    # Build the token-to-id vocabulary from the sentencepiece model.
    vocab_dict = {}
    for piece_id in range(self.sp_model.get_piece_size()):
        vocab_dict[self.sp_model.id_to_piece(piece_id)] = piece_id
    self.vocab = Vocab.from_dict(vocab_dict, unk_token=unk_token)
    # Mark tokens that begin a word (sentencepiece prefixes them with "▁").
    self.start_word_tokens = np.array([
        self.vocab._idx_to_token[i][0] == "▁"
        for i in range(0, len(self.vocab))
    ])
    self.unk_token = unk_token
    self.mask_id = vocab_dict[mask_token]
    self.unk_id = vocab_dict[unk_token]
    self.cls_id = vocab_dict[cls_token]
    self.sep_id = vocab_dict[sep_token]
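# A minimal standalone sketch (not part of the class) of how the piece-to-id
# vocabulary above is derived from a SentencePiece model. The model filename
# is a placeholder; any trained .model file works.
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("spiece.model")  # hypothetical local SentencePiece model file
vocab_dict = {sp.id_to_piece(i): i for i in range(sp.get_piece_size())}
print(len(vocab_dict))  # vocabulary size reported by the model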
def __init__(self,
             sentencepiece_model_file,
             do_lower_case=True,
             encoding="utf8",
             unk_token="<unk>",
             sep_token="[SEP]",
             pad_token="[PAD]",
             cls_token="[CLS]",
             mask_token="[MASK]",
             **kwargs):
    if not os.path.isfile(sentencepiece_model_file):
        raise ValueError(
            "Can't find a vocabulary file at path '{}'. To load the "
            "vocabulary from a pretrained model please use "
            "`tokenizer = BigBirdTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            .format(sentencepiece_model_file))
    self.encoding = encoding
    mod = try_import('sentencepiece')
    self.sp_model = mod.SentencePieceProcessor()
    if os.path.isfile(sentencepiece_model_file):
        self.sp_model.Load(sentencepiece_model_file)
    # Build the token-to-id vocabulary from the sentencepiece model.
    vocab_dict = {}
    for piece_id in range(self.sp_model.get_piece_size()):
        vocab_dict[self.sp_model.id_to_piece(piece_id)] = piece_id
    self.vocab = Vocab.from_dict(vocab_dict, unk_token=unk_token)
    # Mark tokens that begin a word (sentencepiece prefixes them with "▁").
    self.start_word_tokens = np.array([
        self.vocab._idx_to_token[i][0] == "▁"
        for i in range(0, len(self.vocab))
    ])
    self.unk_token = unk_token
    self.mask_id = vocab_dict[mask_token]
    self.unk_id = vocab_dict[unk_token]
    self.cls_id = vocab_dict[cls_token]
    self.sep_id = vocab_dict[sep_token]
    # Fall back to id 0 if the pad token is absent from the model's vocab.
    self.pad_id = vocab_dict[pad_token] if pad_token in vocab_dict else 0

    # Wrap plain-string special tokens as AddedToken so their stripping
    # behavior is explicit.
    unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(
        unk_token, str) else unk_token
    pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(
        pad_token, str) else pad_token
    cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(
        cls_token, str) else cls_token
    sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(
        sep_token, str) else sep_token

    # The mask token behaves like a normal word, i.e. it includes the space
    # before it.
    mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(
        mask_token, str) else mask_token

    self._build_special_tokens_map_extended(sep_token=sep_token,
                                            cls_token=cls_token,
                                            unk_token=unk_token,
                                            pad_token=pad_token,
                                            mask_token=mask_token)
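# A hedged usage sketch for the constructor above. The file path is an
# assumption (a locally available BigBird SentencePiece model); the documented
# entry point is BigBirdTokenizer.from_pretrained(...), which resolves the
# model file automatically.
tokenizer = BigBirdTokenizer(
    sentencepiece_model_file="./spiece.model")  # hypothetical path
print(tokenizer.cls_id, tokenizer.sep_id, tokenizer.mask_id, tokenizer.pad_id)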
def __init__(self,
             embedding_name=EMBEDDING_NAME_LIST[0],
             unknown_token=UNK_TOKEN,
             unknown_token_vector=None,
             extended_vocab_path=None,
             trainable=True,
             keep_extended_vocab_only=False):
    vector_path = osp.join(EMBEDDING_HOME, embedding_name + ".npz")
    if not osp.exists(vector_path):
        # Download the pretrained embedding if it is not cached locally.
        url = EMBEDDING_URL_ROOT + "/" + embedding_name + ".tar.gz"
        get_path_from_url(url, EMBEDDING_HOME)

    logger.info("Loading token embedding...")
    vector_np = np.load(vector_path)
    self.embedding_dim = vector_np['embedding'].shape[1]
    self.unknown_token = unknown_token
    if unknown_token_vector is not None:
        unk_vector = np.array(unknown_token_vector).astype(
            paddle.get_default_dtype())
    else:
        unk_vector = np.random.normal(
            scale=0.02,
            size=self.embedding_dim).astype(paddle.get_default_dtype())
    pad_vector = np.array(
        [0] * self.embedding_dim).astype(paddle.get_default_dtype())

    if extended_vocab_path is not None:
        embedding_table = self._extend_vocab(extended_vocab_path, vector_np,
                                             pad_vector, unk_vector,
                                             keep_extended_vocab_only)
        trainable = True
    else:
        embedding_table = self._init_without_extend_vocab(
            vector_np, pad_vector, unk_vector)

    self.vocab = Vocab.from_dict(self._word_to_idx,
                                 unk_token=unknown_token,
                                 pad_token=PAD_TOKEN)
    self.num_embeddings = embedding_table.shape[0]
    # Initialize the underlying embedding layer and load the pretrained table.
    super(TokenEmbedding, self).__init__(
        self.num_embeddings,
        self.embedding_dim,
        padding_idx=self._word_to_idx[PAD_TOKEN])
    self.weight.set_value(embedding_table)
    self.set_trainable(trainable)
    logger.info("Finish loading embedding vector.")
    s = ("Token Embedding info:"
         "\nUnknown index: {}"
         "\nUnknown token: {}"
         "\nPadding index: {}"
         "\nPadding token: {}"
         "\nShape: {}".format(self._word_to_idx[self.unknown_token],
                              self.unknown_token,
                              self._word_to_idx[PAD_TOKEN], PAD_TOKEN,
                              self.weight.shape))
    logger.info(s)
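# A hedged usage sketch for TokenEmbedding. The embedding_name is assumed to
# be one of the names published in EMBEDDING_NAME_LIST; search() looks up the
# vectors for the given words.
embedding = TokenEmbedding(
    embedding_name="w2v.baidu_encyclopedia.target.word-word.dim300")
vec = embedding.search("中国")  # look up one token's vector
print(vec.shape)  # (1, embedding_dim)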
if '[PAD]' not in vocab:
    vocab['[PAD]'] = len(vocab)

# Loads dataset.
train_ds, dev_ds = load_dataset("chnsenticorp", splits=["train", "dev"])

# Constructs the network.
model = BoWModel(vocab_size=len(vocab),
                 num_classes=len(train_ds.label_list),
                 vocab_path=vocab_path,
                 use_token_embedding=args.use_token_embedding)

if args.use_token_embedding:
    vocab = model.embedder.vocab
    data.set_tokenizer(vocab)
    vocab = vocab.token_to_idx
else:
    v = Vocab.from_dict(vocab, unk_token="[UNK]", pad_token="[PAD]")
    data.set_tokenizer(v)

model = paddle.Model(model)

# Reads data and generates mini-batches.
trans_fn = partial(data.convert_example,
                   vocab=vocab,
                   unk_token_id=vocab['[UNK]'],
                   is_test=False)
train_loader = create_dataloader(train_ds,
                                 trans_fn=trans_fn,
                                 batch_size=args.batch_size,
                                 mode='train',
                                 pad_token_id=vocab['[PAD]'])
# The dev loader mirrors the train loader settings.
dev_loader = create_dataloader(dev_ds,
                               trans_fn=trans_fn,
                               batch_size=args.batch_size,
                               mode='validation',
                               pad_token_id=vocab['[PAD]'])
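# A small sketch of the Vocab round trip used in the else-branch above.
# The token-to-index mapping here is illustrative only.
v = Vocab.from_dict({"[PAD]": 0, "[UNK]": 1, "hello": 2},
                    unk_token="[UNK]", pad_token="[PAD]")
print(v.to_indices(["hello", "world"]))  # "world" falls back to unk -> [2, 1]
print(v.to_tokens([2, 1]))  # ['hello', '[UNK]']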
# Loads dataset.
train_ds, dev_ds = load_dataset("chnsenticorp", splits=["train", "dev"])
texts = []
for data in train_ds:
    texts.append(data["text"])
for data in dev_ds:
    texts.append(data["text"])

# Reads stop words. The stopwords below are only an example and should be
# updated according to the corpus.
stopwords = set(["的", "吗", "吧", "呀", "呜", "呢", "呗"])

# Builds vocab.
word2idx = build_vocab(
    texts, stopwords, min_freq=5, unk_token="[UNK]", pad_token="[PAD]")
vocab = Vocab.from_dict(word2idx, unk_token="[UNK]", pad_token="[PAD]")

# Saves vocab.
vocab.to_json(args.vocab_path)

# Constructs the network.
network = args.network.lower()
vocab_size = len(vocab)
num_classes = len(train_ds.label_list)
pad_token_id = vocab.to_indices('[PAD]')

if network == 'bow':
    model = BoWModel(vocab_size, num_classes, padding_idx=pad_token_id)
elif network == 'bigru':
    model = GRUModel(vocab_size,
                     num_classes,
                     direction='bidirect',