def calculate_nll(model, test_loader, sp: spm.SentencePieceProcessor, use_cuda=True, logger_fn=None):
    pad_id = sp.PieceToId("[PAD]")
    n_examples = 0
    test_nll = 0.0
    with tqdm.tqdm(test_loader, desc="Test (NLL)") as pbar:
        for X, Y, X_lengths, Y_lengths in pbar:
            B, L = X.shape
            if use_cuda:
                X, Y = X.cuda(), Y.cuda()  # B, L
                X_lengths, Y_lengths = X_lengths.cuda(), Y_lengths.cuda()
            pred_y = model(X, Y[:, :-1].to(X.device), X_lengths, Y_lengths)
            B, X, D = pred_y.shape
            loss = F.cross_entropy(pred_y.reshape(B * X, D), Y[:, 1:].reshape(B * X),
                                   ignore_index=pad_id, reduction="sum")
            n_examples += B
            test_nll += loss.item()
            metric_dict = {
                "test_nll": loss.item() / B,
                "test_nll_avg": test_nll / n_examples
            }
            if logger_fn is not None:
                logger_fn(metric_dict)
            pbar.set_postfix(metric_dict)
    return test_nll / n_examples
def realign_answer_span(features: Features, answer_set: Optional[Set[Text]],
                        processor: spm.SentencePieceProcessor,
                        span: AnswerSpan) -> Optional[AnswerSpan]:
    """Align answer span to text with given tokens."""
    i = bisect.bisect_left(features.token_offsets, span.begin)
    if i == len(features.token_offsets) or span.begin < features.token_offsets[i]:
        i -= 1
    j = i + 1
    answer_end = span.begin + len(span.text.encode('utf-8'))
    while (j < len(features.token_offsets) and
           features.token_offsets[j] < answer_end):
        j += 1
    j -= 1
    sp_answer = (
        features.context[features.token_offsets[i]:features.token_offsets[j + 1]]
        if j + 1 < len(features.token_offsets) else
        features.context[features.token_offsets[i]:])
    if (processor.IdToPiece(features.token_ids[i]).startswith('▁') and
            features.token_offsets[i] > 0):
        sp_answer = sp_answer[1:]
    sp_answer = evaluation.normalize_answer(sp_answer.decode('utf-8'))
    if answer_set is not None and sp_answer not in answer_set:
        # No need to warn if the cause was breaking word boundaries.
        if len(sp_answer) and not len(sp_answer) > len(
                evaluation.normalize_answer(span.text)):
            logging.warning('%s: "%s" not in %s.', features.question_id,
                            sp_answer, answer_set)
        return None
    return AnswerSpan(begin=i, end=j, text=span.text)
class SentencePieceTokenizer(Tokenizer, CppProcessorMixin):
    """Sentence piece tokenizer."""

    class Config(ConfigBase):
        sp_model_path: str = ""

    def __init__(self, sp_model_path: str = ""):
        self.sp_model_path = sp_model_path
        self._load_processor()
        log_class_usage(__class__)

    @classmethod
    def from_config(cls, config: Config):
        return cls(config.sp_model_path)

    def tokenize(self, input_str: str) -> List[Token]:
        pieces = self.processor.EncodeAsPieces(input_str)
        tokens = []
        # calculate start and end indices of each piece.
        end = 0
        for piece in pieces:
            original_piece = piece.lstrip("\u2581")
            start = input_str.find(original_piece, end)
            end = start + len(original_piece)
            tokens.append(Token(piece, start, end))
        return tokens

    def _load_processor(self):
        self.processor = SentencePieceProcessor()
        self.processor.Load(PathManager.get_local_path(self.sp_model_path))

    def torchscriptify(self):
        return ScriptDoNothingTokenizer()
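A minimal usage sketch of the offset bookkeeping in tokenize above, using the raw sentencepiece API directly. The model path "spm.model" and the sample text are assumptions, and it relies on the same assumption the tokenizer makes: piece surfaces reappear left-to-right in the input and can be located with str.find.

from sentencepiece import SentencePieceProcessor

sp = SentencePieceProcessor()
sp.Load("spm.model")  # hypothetical model path
text = "hello world"
end = 0
for piece in sp.EncodeAsPieces(text):
    surface = piece.lstrip("\u2581")   # drop the SentencePiece word-start marker
    start = text.find(surface, end)    # locate the piece in the original string
    end = start + len(surface)
    print(piece, start, end)           # e.g. ('▁hello', 0, 5) then ('▁world', 6, 11)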
def sentence2vector(sentence, model_sentence_piece: spm.SentencePieceProcessor,
                    dict_token2vector: dict, dict_is_valid: dict) -> [np.array, bool]:
    """
    Convert a sentence to a vector.

    The second element of the return value indicates whether any token with a
    usable embedding was found:
    - True: at least one valid token was found
    - False: no valid token was found

    :param sentence: str, target sentence
    :param model_sentence_piece: spm.SentencePieceProcessor, SentencePiece model
    :param dict_token2vector: dict, maps a token to its extended word embedding
    :param dict_is_valid: dict, returns True if a token is covered by the model
    :return:
    """
    tokens_raw = model_sentence_piece.EncodeAsPieces(sentence)
    tokens = leave_valid(tokens=tokens_raw, dict_is_valid=dict_is_valid)
    vector_size = len(list(dict_token2vector.values())[0])
    # If no token has a usable embedding, initialize with zeros so such
    # sentences collapse to the origin.
    vector = np.zeros((vector_size, max(len(tokens), 1)), dtype=np.float64)
    for i, token in enumerate(tokens):
        vector[:, i] = dict_token2vector[token]
    # The vector itself would be returned correctly without this branch, but it
    # is kept separate so an error message can be printed.
    # Whether to raise an exception and stop here instead is debatable.
    if len(tokens) != 0:
        # valid token exists
        return [vector.mean(axis=1), len(tokens) != 0]
    else:
        # no valid token
        # assert len(tokens) != 0, "no valid token, change phrase or word"
        print(
            f" this sentence has no valid token, change phrase or word\n{''.join(tokens_raw[:20]).replace('_', '')}"
        )
        return [vector.mean(axis=1), len(tokens) != 0]
class SentencePieceTokenizer():  # TODO: pass the special tokens symbol to sp
    "SentencePiece tokenizer for `lang`"
    def __init__(self, lang='en', special_toks=None, sp_model=None, vocab_sz=None, max_vocab_sz=30000,
                 model_type='unigram', char_coverage=None, cache_dir='tmp'):
        try:
            from sentencepiece import SentencePieceTrainer, SentencePieceProcessor
        except ImportError:
            raise Exception('sentencepiece module is missing: run `pip install sentencepiece`')
        self.sp_model, self.cache_dir = sp_model, Path(cache_dir)
        self.vocab_sz, self.max_vocab_sz, self.model_type = vocab_sz, max_vocab_sz, model_type
        self.char_coverage = ifnone(char_coverage, 0.99999 if lang in eu_langs else 0.9998)
        self.special_toks = ifnone(special_toks, defaults.text_spec_tok)
        if sp_model is None:
            self.tok = None
        else:
            self.tok = SentencePieceProcessor()
            self.tok.Load(str(sp_model))
        os.makedirs(self.cache_dir, exist_ok=True)

    def _get_vocab_sz(self, raw_text_path):
        cnt = Counter()
        with open(raw_text_path, 'r') as f:
            for line in f.readlines():
                cnt.update(line.split())
                if len(cnt)//4 > self.max_vocab_sz:
                    return self.max_vocab_sz
        res = len(cnt)//4
        while res % 8 != 0:
            res += 1
        return res

    def train(self, raw_text_path):
        "Train a sentencepiece tokenizer on `texts` and save it in `path/tmp_dir`"
def calculate_nll(
    model, test_loader, sp: spm.SentencePieceProcessor, use_cuda=True, logger_fn=None
):
    with Timer() as t:
        pad_id = sp.PieceToId("[PAD]")
        n_examples = 0
        test_nll = 0.
        pbar = tqdm.tqdm(test_loader, desc="test")
        for X, Y, X_lengths, Y_lengths in pbar:
            B, L = X.shape
            if use_cuda:
                X, Y = X.cuda(), Y.cuda()  # B, L
                X_lengths, Y_lengths = X_lengths.cuda(), Y_lengths.cuda()
            pred_y = model(X, Y[:, :-1].to(X.device), X_lengths, Y_lengths)
            B, X, D = pred_y.shape
            loss = F.cross_entropy(pred_y.reshape(B * X, D), Y[:, 1:].reshape(B * X),
                                   ignore_index=pad_id, reduction='sum')
            n_examples += B
            test_nll += loss.item()
            if logger_fn is not None:
                logger_fn({'test_nll': loss.item() / B, 'test_nll_avg': test_nll / n_examples})
    return test_nll / n_examples
class SentencePieceExtractor:
    """
    Extractor implementation for SentencePiece trained models.
    https://github.com/google/sentencepiece
    """

    def __init__(self, model: str):
        # Get SentencePiece
        self.sp = SentencePieceProcessor()
        self.sp.Load(model)

    def extract(self) -> Tuple[Dict[str, int], List[Tuple]]:
        sp = self.sp
        vocab = {sp.id_to_piece(index): index for index in trange(sp.GetPieceSize())}

        # Merges
        merges = []
        for piece_l in tqdm(vocab.keys(), total=sp.GetPieceSize()):
            for piece_r in vocab.keys():
                if piece_l != piece_r:
                    merge = sp.PieceToId(f"{piece_l}{piece_r}")
                    score = sp.GetScore(merge)
                    if score != 0.:
                        merges += [(piece_l, piece_r)]

        return vocab, merges
def __init__(self, lang='en', special_toks=None, sp_model=None, vocab_sz=None, max_vocab_sz=30000,
             model_type='unigram', char_coverage=None, cache_dir='tmp'):
    try:
        from sentencepiece import SentencePieceTrainer, SentencePieceProcessor
    except ImportError:
        raise Exception('sentencepiece module is missing: run `pip install sentencepiece!=0.1.90,!=0.1.91`')
    self.sp_model, self.cache_dir = sp_model, Path(cache_dir)
    self.vocab_sz, self.max_vocab_sz, self.model_type = vocab_sz, max_vocab_sz, model_type
    self.char_coverage = ifnone(char_coverage, 0.99999 if lang in eu_langs else 0.9998)
    self.special_toks = ifnone(special_toks, defaults.text_spec_tok)
    if sp_model is None:
        self.tok = None
    else:
        self.tok = SentencePieceProcessor()
        self.tok.Load(str(sp_model))
    os.makedirs(self.cache_dir, exist_ok=True)
class SentencePieceExtractor:
    """
    Extractor implementation for SentencePiece trained models.
    https://github.com/google/sentencepiece
    """

    def __init__(self, model: str):
        requires_backends(self, "sentencepiece")
        from sentencepiece import SentencePieceProcessor

        self.sp = SentencePieceProcessor()
        self.sp.Load(model)

    def extract(self) -> Tuple[Dict[str, int], List[Tuple]]:
        sp = self.sp
        vocab = {sp.id_to_piece(index): index for index in range(sp.GetPieceSize())}

        # Merges
        merges = []
        for piece_l in vocab.keys():
            for piece_r in vocab.keys():
                merge = f"{piece_l}{piece_r}"
                piece_id = vocab.get(merge, None)
                if piece_id:
                    merges += [(piece_l, piece_r, piece_id)]
        merges = sorted(merges, key=lambda val: val[2])
        merges = [(val[0], val[1]) for val in merges]

        return vocab, merges
class SentencePieceTokenizer:
    def __init__(self, spm_file, do_lower_case=True):
        self.processor = SentencePieceProcessor()
        self.processor.Load(spm_file)
        self.do_lower_case = do_lower_case

    def tokenize(self, text):
        text = preprocess_text(text, lower=self.do_lower_case)
        pieces = encode_pieces(self.processor, text, sample=False)
        return pieces

    def convert_tokens_to_ids(self, tokens):
        return [self.processor.PieceToId(piece) for piece in tokens]

    def convert_ids_to_tokens(self, ids):
        pieces = [self.processor.IdToPiece(_id) for _id in ids]
        return pieces
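A rough usage sketch for the tokenizer above, not part of the original snippet. It assumes an XLNet-style model file named "spiece.model" and that preprocess_text/encode_pieces are importable as in the surrounding code.

tokenizer = SentencePieceTokenizer("spiece.model", do_lower_case=True)
pieces = tokenizer.tokenize("Hello, world!")
ids = tokenizer.convert_tokens_to_ids(pieces)
print(pieces)                                # subword pieces, e.g. ['▁hello', ',', '▁world', '!']
print(tokenizer.convert_ids_to_tokens(ids))  # round-trips back to pieces for in-vocabulary tokens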
def main() -> int:
    train_config = get_train_config()
    model_config = ConveRTModelConfig()
    logger = logger_setup(train_config.log_dir)
    device = torch.device(train_config.device if torch.cuda.is_available() else "cpu")

    tokenizer = SentencePieceProcessor()
    tokenizer.Load(train_config.sp_model_path)

    instance_load_fn = load_instances_from_reddit_dataset if train_config.is_reddit else load_instances_from_tsv_dataset
    train_instances = instance_load_fn(train_config.train_dataset_path)
    test_instances = instance_load_fn(train_config.test_dataset_path)

    train_dataset = ConveRTDataset(train_instances, tokenizer)
    test_dataset = ConveRTDataset(test_instances, tokenizer)
    train_dataloader = DataLoader(
        train_dataset, train_config.train_batch_size, collate_fn=convert_collate_fn, drop_last=True
    )
    test_dataloader = DataLoader(
        test_dataset, train_config.test_batch_size, collate_fn=convert_collate_fn, drop_last=True
    )

    model = ConveRTDualEncoder(model_config)
    criterion = ConveRTCosineLoss(split_size=train_config.split_size)

    model.to(device)
    criterion.to(device)

    if train_config.use_data_paraller and torch.cuda.is_available():
        model = nn.DataParallel(model)
        criterion = nn.DataParallel(criterion)

    trainer = ConveRTTrainer(
        model=model,
        criterion=criterion,
        train_config=train_config,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        logger=logger,
        device=device,
    )
    trainer.train()
    torch.save(trainer.model, 'final_model.pkl')
    return 0
def _log_sample_data(model_dir: str, sp: spm.SentencePieceProcessor):
    training_data_path = Path(model_dir) / const.TRAINING_DATA
    if not training_data_path.is_file():
        logging.info("Training data not found for SP sampling")
        return

    with open(training_data_path) as fin:
        sample = fin.readline().strip()

    logging.info(f"Tokenizer model vocabulary size: {len(sp)} tokens")
    logging.info(
        "Mapping first line of training data\n\n{}\n ---- sample tokens mapped to pieces ---- > \n{}\n"
        .format(repr(sample), ", ".join(sp.SampleEncodeAsPieces(sample, -1, 0.1))))
    logging.info(
        "Mapping first line of training data\n\n{}\n ---- sample tokens mapped to int ---- > \n{}\n"
        .format(repr(sample), ", ".join([str(idx) for idx in sp.EncodeAsIds(sample)])))
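For context, a small illustration of the subword-regularization sampling call used above (SampleEncodeAsPieces with nbest_size=-1 and alpha=0.1); the model path below is a placeholder.

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("tokenizer.model")  # hypothetical model path
# With nbest_size=-1 and alpha=0.1 the segmentation is sampled, so the two
# prints below may show different piece sequences for the same input.
print(sp.SampleEncodeAsPieces("the quick brown fox", -1, 0.1))
print(sp.SampleEncodeAsPieces("the quick brown fox", -1, 0.1))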
def load_sentencepiece_tokenizer(tokenizer_path: str) -> SentencePieceProcessor:
    '''
    Loads an already pretrained sentencepiece tokenizer.

    Args:
        tokenizer_path: path to the files of the pretrained sentencepiece tokenizer.

    Returns:
        tokenizer: pretrained sentencepiece tokenizer.
    '''
    if not os.path.isfile(tokenizer_path):
        print("SentencePiece tokenizer not found!")
        sys.exit()

    tokenizer = SentencePieceProcessor()
    tokenizer.Load(tokenizer_path)
    # enable inserting <s> and </s> tags automatically at start/end of a sentence.
    tokenizer.set_encode_extra_options('bos:eos')
    return tokenizer
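An illustrative check of what set_encode_extra_options('bos:eos') does, assuming a model file "sp.model" exists: encoded id sequences come back wrapped with the BOS/EOS ids.

sp = load_sentencepiece_tokenizer("sp.model")  # hypothetical model path
ids = sp.EncodeAsIds("an example sentence")
print(ids[0] == sp.bos_id(), ids[-1] == sp.eos_id())  # expected: True True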
class SentencePieceTokenizer(Tokenizer, CppProcessorMixin):
    """Sentence piece tokenizer."""

    class Config(ConfigBase):
        sp_model_path: str = ""
        max_input_text_length: Optional[int] = None
        use_fb_sentencepiece: Optional[bool] = False

    def __init__(
        self,
        sp_model_path: str = "",
        max_input_text_length: Optional[int] = None,
        use_fb_sentencepiece: Optional[bool] = None,
    ):
        self.sp_model_path = sp_model_path
        self.max_input_text_length = max_input_text_length
        self.use_fb_sentencepiece = use_fb_sentencepiece
        self._load_processor()
        log_class_usage(__class__)

    @classmethod
    def from_config(cls, config: Config):
        return cls(
            config.sp_model_path,
            config.max_input_text_length,
            config.use_fb_sentencepiece,
        )

    def tokenize(self, input_str: str) -> List[Token]:
        if (
            hasattr(self, "max_input_text_length")
            and self.max_input_text_length is not None
        ):
            input_str = input_str[: self.max_input_text_length]
        pieces = self.processor.EncodeAsPieces(input_str)
        tokens = []
        # calculate start and end indices of each piece.
        end = 0
        for piece in pieces:
            original_piece = piece.lstrip("\u2581")
            start = input_str.find(original_piece, end)
            end = start + len(original_piece)
            tokens.append(Token(piece, start, end))
        return tokens

    def _load_processor(self):
        sp_model_path = PathManager.get_local_path(self.sp_model_path)
        if self.use_fb_sentencepiece:
            self.processor = torch.classes.fb.SentencePiece.fromFile(sp_model_path)
        else:
            from sentencepiece import SentencePieceProcessor

            self.processor = SentencePieceProcessor()
            self.processor.Load(sp_model_path)

    def torchscriptify(self):
        return ScriptDoNothingTokenizer()
def encode_comment(sp_model: sentencepiece.SentencePieceProcessor,
                   comment: str, max_len=None) -> List[int]:
    """ Encode one comment with sentencepiece model.
    """
    # TODO we can do sub-word augmentation here
    start = sp_model.PieceToId('<s>')
    end = sp_model.PieceToId('</s>')
    eol = sp_model.PieceToId(EOL)
    encoded = [start]
    for i, line in enumerate(comment.split('\n')):
        if i:
            encoded.append(eol)
        encoded.extend(sp_model.EncodeAsIds(line))
    encoded.append(end)
    if max_len is not None:
        encoded = encoded[:max_len]
    return encoded
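A small usage sketch for encode_comment, not from the original source: it assumes the SentencePiece model was trained with <s>, </s> and the EOL control piece referenced above, and the model path is a placeholder.

import sentencepiece

sp_model = sentencepiece.SentencePieceProcessor()
sp_model.Load("comments.model")  # hypothetical model path
ids = encode_comment(sp_model, "first line\nsecond line", max_len=128)
# Expected layout: [<s>, pieces of "first line", EOL, pieces of "second line", </s>],
# truncated to at most max_len ids.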
def __init__(self, spm_model_file: str, fasttext_model_file: str = '', max_pieces: int = -1):
    super().__init__(max_pieces=max_pieces)
    self.spm = SentencePieceProcessor()
    self.spm.Load(spm_model_file)

    self.pad_idx = self.spm.pad_id()
    self.pad_token = self.spm.IdToPiece(self.pad_idx)
    self.unk_idx = self.spm.unk_id()
    self.unk_token = self.spm.IdToPiece(self.unk_idx)
    self.bos_idx = self.spm.bos_id()
    self.bos_token = self.spm.IdToPiece(self.bos_idx)
    self.eos_idx = self.spm.eos_id()
    self.eos_token = self.spm.IdToPiece(self.eos_idx)

    self.fasttext_model_file = fasttext_model_file
    self._fasttext = None
def main(**kwargs):
    set_seed(1)
    train_config = ConveRTTrainConfig()
    model_config = ConveRTModelConfig()
    tokenizer = SentencePieceProcessor()
    args = _parse_args()
    tokenizer.Load(train_config.sp_model_path)
    train_instances = load_instances_from_reddit_json(train_config.dataset_path)
    RD = RedditData(train_instances, tokenizer, 60)
    dm = DataModule()
    train_loader = dm.train_dataloader(RD)
    model = SingleContextConvert(model_config, train_config)
    lr_decay = LearningRateDecayCallback(train_config)
    model.register_subword_params()

    trainer = (
        pl.Trainer.from_argparse_args(args, callbacks=[lr_decay], **kwargs)
    )  # ,checkpoint_callback = checkpoint_callback)  # ,resume_from_checkpoint=)
    trainer.fit(model, train_dataloader=train_loader, val_dataloaders=train_loader)
def test_sentencepiece(nlp, text: str, tokens, spiece: spm.SentencePieceProcessor):
    doc = nlp(text)
    assert doc.text == text.replace(" ", " ").replace("\t", " ").strip()
    for token, expected in zip(doc, tokens):
        assert token.text == expected

    # doc test
    pieces_ = spiece.encode_as_pieces(text)
    assert doc._.get(EXTS.pieces_) == pieces_
def __init__(self, args, src_dict, tgt_dict):
    super().__init__(args, src_dict, tgt_dict)
    spm = None
    if args.spm_model:
        from sentencepiece import SentencePieceProcessor
        spm = SentencePieceProcessor(model_file=args.spm_model)
    self.spm = spm
    self.spm_nbest = args.spm_nbest
def __init__(self, lang='en', special_toks=None, sp_model=None, vocab_sz=None, max_vocab_sz=30000,
             model_type='unigram', char_coverage=None, cache_dir='tmp'):
    self.sp_model, self.cache_dir = sp_model, Path(cache_dir)
    self.vocab_sz, self.max_vocab_sz, self.model_type = vocab_sz, max_vocab_sz, model_type
    self.char_coverage = ifnone(char_coverage, 0.99999 if lang in eu_langs else 0.9998)
    self.special_toks = ifnone(special_toks, defaults.text_spec_tok)
    if sp_model is None:
        self.tok = None
    else:
        self.tok = SentencePieceProcessor()
        self.tok.Load(str(sp_model))
    os.makedirs(self.cache_dir, exist_ok=True)
class SubwordTokenizer(Tokenizer):
    def __init__(self, model_path: str = None, nbest_size: int = None, alpha: float = None):
        self._model_path = cached_path(model_path)
        self._processor = SentencePieceProcessor()
        self._processor.Load(self._model_path)
        self._nbest_size = nbest_size
        self._alpha = alpha

    def tokenize(self, text: str) -> List[Token]:
        if self._nbest_size and self._alpha:
            subwords = self._processor.SampleEncodeAsPieces(text, self._nbest_size, self._alpha)
        else:
            subwords = self._processor.EncodeAsPieces(text)
        tokens = [Token(s) for s in subwords]
        return tokens

    def batch_tokenize(self, texts: List[str]) -> List[List[Token]]:
        return [self.tokenize(text) for text in texts]
def greedy_decode(model, X, sp: spm.SentencePieceProcessor, max_decode_len=20, sample=True):
    start_token = sp.PieceToId("<s>")
    pad_token = sp.PieceToId("<pad>")

    B = X.size(0)
    model.eval()
    with torch.no_grad():
        decoded_batch = torch.zeros((B, 1), device=X.device).long()
        decoded_batch[:, 0] = start_token
        for t in range(max_decode_len):
            logits = model(X, decoded_batch)
            _, topi = logits[:, -1, :].topk(1)
            decoded_batch = torch.cat((decoded_batch, topi.view(-1, 1)), -1)
    Y_hat = decoded_batch.cpu().numpy()
    Y_hat_str = ids_to_strs(Y_hat, sp)
    model.train()
    return Y_hat_str
def _load_processor(self):
    if getattr(self, "use_fb_sentencepiece", None):
        try:
            import importlib.resources

            import sentencepiece_model

            with importlib.resources.path(sentencepiece_model, "model") as sp_model_path:
                self.processor = torch.classes.fb.SentencePiece.fromFile(str(sp_model_path))
        except Exception:
            sp_model_path = PathManager.get_local_path(self.sp_model_path)
            self.processor = torch.classes.fb.SentencePiece.fromFile(sp_model_path)
    else:
        from sentencepiece import SentencePieceProcessor

        sp_model_path = PathManager.get_local_path(self.sp_model_path)
        self.processor = SentencePieceProcessor()
        self.processor.Load(sp_model_path)
def save_pieces(
    sp: spm.SentencePieceProcessor,
    num_pieces: int,
    data_dir: Path,
    output_prefix: str,
    vocab: set,
) -> None:
    """Saves word pieces to disk."""
    logger.info(f"Generating word piece list of size {num_pieces}.")
    pieces = [sp.id_to_piece(i) for i in range(1, num_pieces + 1)]
    logger.info(f"Encoding vocabulary of size {len(vocab)}.")
    encoded_vocab = [sp.encode_as_pieces(v) for v in vocab]

    # Save pieces to file.
    with open(data_dir / f"{output_prefix}_tokens_{num_pieces}.txt", "w") as f:
        f.write("\n".join(pieces))

    # Save lexicon to a file.
    with open(data_dir / f"{output_prefix}_lex_{num_pieces}.txt", "w") as f:
        for v, p in zip(vocab, encoded_vocab):
            f.write(f"{v} {' '.join(p)}\n")
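An illustrative call to save_pieces with hypothetical paths and a toy vocabulary, to show the two files it writes.

import sentencepiece as spm
from pathlib import Path

sp = spm.SentencePieceProcessor()
sp.Load("wordpiece.model")  # hypothetical model path
save_pieces(sp, num_pieces=100, data_dir=Path("."), output_prefix="demo",
            vocab={"hello", "world"})
# Writes demo_tokens_100.txt (one piece per line) and demo_lex_100.txt
# (one "<word> <piece> <piece> ..." lexicon entry per vocabulary word).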
def __init__(
    self,
    dict,
    dict_type,
    pad=constants.PAD,
    eos=constants.EOS,
    unk=constants.UNK,
    bos=constants.BOS,
):
    self.bos_word, self.unk_word, self.pad_word, self.eos_word = bos, unk, pad, eos
    self.dict = os.path.expanduser(dict)
    self.dict_type = dict_type
    if self.dict_type == SENTENCEPIECE:
        assert self.exists(self.dict, self.dict_type)
        self.bpe_dict = SentencePieceProcessor()
        self.bpe_dict.load(f'{self.dict}.model')
        self.pad_index = self.bpe_dict.pad_id()
        self.bos_index = self.bpe_dict.bos_id()
        self.eos_index = self.bpe_dict.eos_id()
        self.unk_index = self.bpe_dict.unk_id()
def _evaluate(
    model, loader, sp: spm.SentencePieceProcessor, use_cuda=True, num_to_print=8,
    beam_search_k=5, max_decode_len=20, loss_type="nll_token"
):
    model.eval()
    pad_id = sp.PieceToId("[PAD]")
    with torch.no_grad():
        # with Timer() as t:
        #     # Decode a single batch by beam search for visualization
        #     X, Y, X_lengths, _ = next(iter(loader))
        #     X, Y = X[:num_to_print], Y[:num_to_print]
        #     if use_cuda:
        #         X = X.cuda()
        #         X_lengths = X.cuda()
        #     pred, scores = beam_search_decode(model, X, X_lengths, sp, k=beam_search_k, max_decode_len=max_decode_len)
        #     for i in range(X.size(0)):
        #         logger.info(f"Eval X: \t\t\t{ids_to_strs(X[i], sp)}")
        #         logger.info(f"Eval GT Y:\t\t\t{ids_to_strs(Y[i], sp)}")
        #         for b in range(scores.size(1)):
        #             logger.info(f"Eval beam (score={scores[i, b]:.3f}):\t{pred[i][b]}")
        # logger.debug(f"Decode time for {num_to_print} samples took {t.interval:.3f}")

        with Timer() as t:
            # Compute average loss
            total_loss = 0
            num_examples = 0
            pbar = tqdm.tqdm(loader, desc="evaluate")
            for X, Y, X_lengths, Y_lengths in pbar:
                if use_cuda:
                    X, Y = X.cuda(), Y.cuda()
                    X_lengths, Y_lengths = X_lengths.cuda(), Y_lengths.cuda()

                # NOTE: X and Y are [B, max_seq_len] tensors (batch first)
                logits = model(X, Y[:, :-1], X_lengths, Y_lengths)
                if loss_type == "nll_sequence":
                    loss = F.cross_entropy(logits.transpose(1, 2), Y[:, 1:], ignore_index=pad_id, reduction="sum")
                    loss = loss / X.size(0)
                    # Average over num sequences, not target sequence lengths
                    # Thus, minimize bits per sequence.
                elif loss_type == "nll_token":
                    loss = F.cross_entropy(
                        logits.transpose(1, 2),
                        Y[:, 1:],
                        ignore_index=pad_id,
                    )
                # TODO: Compute Precision/Recall/F1 and BLEU

                total_loss += loss.item() * X.size(0)
                num_examples += X.size(0)
                avg_loss = total_loss / num_examples
                pbar.set_description(f"evaluate average loss {avg_loss:.4f}")
        logger.debug(f"Loss calculation took {t.interval:.3f}s")

    return avg_loss
def main():
    parser = ArgumentParser()
    parser.add_argument("--model", required=True, help="sentencepiece model to use for decoding")
    parser.add_argument("--input", default="-", help="input file to decode")
    parser.add_argument("--input_format", choices=["piece", "id"], default="piece")
    args = parser.parse_args()

    sp = SentencePieceProcessor()
    sp.Load(args.model)

    if args.input_format == "piece":
        def decode(l):
            return "".join(sp.DecodePieces(l))
    elif args.input_format == "id":
        def decode(l):
            return "".join(sp.DecodeIds(l))

    def tok2int(tok):
        # remap reference-side <unk> to 0
        return int(tok) if tok != "<unk>" else 0

    if args.input == "-":
        if args.input_format == "id":
            for line in sys.stdin:
                print(decode(list(map(tok2int, line.rstrip().split()))))
        elif args.input_format == "piece":
            for line in sys.stdin:
                print(decode(line.rstrip().split()))
    else:
        with open(args.input, "r", encoding="utf-8") as h:
            if args.input_format == "id":
                for line in h:
                    print(decode(list(map(tok2int, line.rstrip().split()))))
            elif args.input_format == "piece":
                for line in h:
                    print(decode(line.rstrip().split()))
class SentencePieceTokenizer:
    def __init__(self, spm_file, do_lower_case=True):
        if not os.path.exists(spm_file):
            raise ValueError(
                "Can't find spm_file \"%s\". "
                "Please pass the correct path of sentence-piece model file, "
                "e.g.`spiece.model`." % spm_file
            )
        self.processor = SentencePieceProcessor()
        self.processor.Load(spm_file)
        self.do_lower_case = do_lower_case

    def tokenize(self, text):
        text = preprocess_text(text, lower=self.do_lower_case)
        pieces = encode_pieces(self.processor, text, sample=False)
        return pieces

    def convert_tokens_to_ids(self, tokens):
        return [self.processor.PieceToId(piece) for piece in tokens]

    def convert_ids_to_tokens(self, ids):
        pieces = [self.processor.IdToPiece(_id) for _id in ids]
        return pieces
def graph_gen_helper(
    triples: Iterable[Tuple[str, str, str]],
    entities: Sequence[str],
    sp: spm.SentencePieceProcessor,
    dropout_p: Optional[float] = None
) -> Tuple[Tuple[List[List[int]], List[bool], List[int]], List[List[int]]]:
    graph = nx.DiGraph()

    # Indexed labels for entity nodes
    ent2nodeIds, token_node_labels, positions = tokenize_entities(
        entities, sp, sample_alpha=dropout_p)
    same_ent_idx = compute_same_ent_idx(ent2nodeIds)

    # (1) Entity Nodes
    graph.add_nodes_from(range(len(token_node_labels)))

    do_sample = dropout_p is not None

    # (2)+(3) Relations and Edges
    rel_node_labels = []
    for trip in triples:
        e1 = trip[0]
        e2 = trip[2]
        rel_label = trip[1]

        num_nodes_before = len(graph)
        rel_token_indices = sp.encode(
            rel_label, enable_sampling=do_sample, alpha=dropout_p)
        rel_node_labels.extend(rel_token_indices)
        rel_nodes = list(
            range(num_nodes_before, num_nodes_before + len(rel_token_indices)))
        graph.add_nodes_from(rel_nodes)

        # same text links
        for (i1, n1), (i2, n2) in combinations(enumerate(rel_nodes), 2):
            same_ent_idx[(n1, n2)] = i2 - i1
            same_ent_idx[(n2, n1)] = i1 - i2

        # edges
        for reln in rel_nodes:
            for n1 in ent2nodeIds[e1]:
                graph.add_edge(n1, reln)
            for n2 in ent2nodeIds[e2]:
                graph.add_edge(reln, n2)

    distance_matrix = compute_dm(graph, same_ent_idx)
    is_entity = [True] * (len(token_node_labels) + len(rel_node_labels))

    return (distance_matrix, is_entity, positions), token_node_labels + rel_node_labels
def to_input_tensor(cls, sub_tokens_list: List[List[str]],
                    bpe_model: spm.SentencePieceProcessor, pad_id=0) -> torch.Tensor:
    max_subword_num = max(len(x) for x in sub_tokens_list)
    idx_tensor = torch.zeros(len(sub_tokens_list), max_subword_num, dtype=torch.long)
    idx_tensor.fill_(pad_id)

    for i, token_list in enumerate(sub_tokens_list):
        for j, token in enumerate(token_list):
            idx_tensor[i, j] = bpe_model.piece_to_id(token)

    return idx_tensor
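A hedged usage sketch for to_input_tensor, assuming it is a classmethod on some enclosing class; Encoder below is a placeholder name, as is the model path. Shorter piece lists are right-padded with pad_id, so the result is a [batch, longest sequence] LongTensor.

import sentencepiece as spm

bpe = spm.SentencePieceProcessor()
bpe.Load("bpe.model")  # hypothetical model path
batch = [bpe.encode_as_pieces("short"), bpe.encode_as_pieces("a longer input string")]
tensor = Encoder.to_input_tensor(batch, bpe, pad_id=0)  # Encoder is a placeholder class name
print(tensor.shape)  # torch.Size([2, longest piece count in the batch])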