def string(self, tensor, bpe_symbol=None, escape_unk=False):
    """Helper for converting a tensor of token indices to a string.

    Can optionally remove BPE symbols or escape <unk> words.
    """
    if torch.is_tensor(tensor) and tensor.dim() == 2:
        # forward bpe_symbol/escape_unk so 2-D inputs honor them as well
        return '\n'.join(self.string(t, bpe_symbol, escape_unk) for t in tensor)

    def token_string(i):
        if i == self.unk():
            return self.unk_string(escape_unk)
        else:
            return self[i]

    sent = ' '.join(token_string(i) for i in tensor if i != self.eos())
    return data_utils.process_bpe_symbol(sent, bpe_symbol)
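
# Usage sketch for string() above (hypothetical demo; the helper name and the
# toy symbols are assumptions, not from the source). It assumes a
# fairseq-style Dictionary providing add_symbol()/index()/eos():
def _demo_string():
    import torch
    from fairseq.data import Dictionary

    d = Dictionary()
    for sym in ('he@@', 'llo'):
        d.add_symbol(sym)
    tokens = torch.tensor([d.index('he@@'), d.index('llo'), d.eos()])
    # the '@@ ' continuation markers are stripped by process_bpe_symbol
    assert d.string(tokens, bpe_symbol='@@ ') == 'hello'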
def tokens_to_sentence(self, line, line_tokenizer=tokenize_line, use_unk_sym=True, bpe_symbol=None):
    """Convert a string of space-separated tokens back into a plain sentence."""
    if bpe_symbol is not None:
        return data_utils.process_bpe_symbol(line, bpe_symbol)
    # use_unk_sym=False when we want to restore original transcripts from
    # token sequences, e.g., to obtain references for computing WER
    tokens = line_tokenizer(line)
    sent = ""
    for token in tokens:
        if token == self.space_word:
            sent += " "
        elif use_unk_sym and self.index(token) == self.unk_index:
            sent += self.unk_word
        elif token != self.pad_word and token != self.eos_word:
            sent += token
    return sent.strip()
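
# Usage sketch for tokens_to_sentence() above (hypothetical; assumes a
# character-level dictionary whose space_word is '<space>', as in
# Espresso-style ASR dictionaries):
def _demo_tokens_to_sentence(char_dict):
    line = 'h e l l o <space> w o r l d'
    # -> 'hello world'; pass use_unk_sym=False to copy out-of-vocabulary
    # characters through verbatim, e.g. when reconstructing references for WER
    return char_dict.tokens_to_sentence(line)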
def create_ner_from_output_tokens(tokens, src_dict, ent_pad_idx=0, ent_eos_idx=21):
    """Tag a batch of output tokens (bsz x seq_len) with entity-type indices."""
    txts = []
    token_lists = []
    pad_idx = src_dict.pad()
    eos_idx = src_dict.eos()
    if tokens.shape[1] == 1:
        # initial inference step: only EOS has been generated, so return EOS tags
        return torch.empty_like(tokens).fill_(ent_eos_idx)
    output_entities = torch.empty_like(tokens)
    for i in range(len(tokens)):
        # skip position 0 (EOS) and de-BPE the wordpieces for spaCy
        token_list = [src_dict[idx] for idx in tokens[i][1:]]
        token_lists.append(token_list)
        txt = ' '.join(token_list)
        txt = data_utils.process_bpe_symbol(txt, ' ##')
        txts.append(txt)
    docs = list(nlp.pipe(txts))
    for i in range(len(token_lists)):
        doc = docs[i]
        _, alignments = align_tokens(doc, token_lists[i])
        entities = torch.zeros_like(tokens[i])
        entities[0] = ent_eos_idx
        for j in range(1, len(alignments)):
            spacy_token = doc[j]
            ent_type = pad_idx + 1
            if spacy_token.ent_type_ in ENTITY_TYPES:
                ent_type += ENTITY_TYPES[spacy_token.ent_type_]
            for wp_idx in alignments[j]:
                entities[wp_idx] = ent_type
        output_entities[i] = entities
    return output_entities
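
# Usage sketch (hypothetical): during batched incremental decoding, pass the
# tokens generated so far (bsz x seq_len, with EOS in column 0) to get a
# parallel tensor of entity-type indices, e.g. for an entity embedding layer:
def _demo_batch_ner(tokens, src_dict):
    ent = create_ner_from_output_tokens(tokens, src_dict)
    assert ent.shape == tokens.shape  # one entity tag per wordpiece position
    return ent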
def string(self, tensor, bpe_symbol=None, escape_unk=False):
    """Helper for converting a tensor of token indices to a string.

    Can optionally remove BPE symbols or escape <unk> words.
    """
    raise Exception(
        "(BERT dict) string function will not work since all indices are IDs instead of words"
    )
    # unreachable; kept for reference
    if torch.is_tensor(tensor) and tensor.dim() == 2:
        return '\n'.join(
            self.string(t, bpe_symbol, escape_unk) for t in tensor)

    def token_string(i):
        if i == self.unk():
            return self.unk_string(escape_unk)
        else:
            return self[i]

    sent = ' '.join(token_string(i) for i in tensor if i != self.eos())
    return data_utils.process_bpe_symbol(sent, bpe_symbol)
def string(
    self,
    tensor,
    bpe_symbol=None,
    escape_unk=False,
    extra_symbols_to_ignore=None,
    unk_string=None,
):
    """Helper for converting a tensor of token indices to a string.

    Can optionally remove BPE symbols or escape <unk> words.
    """
    if torch.is_tensor(tensor) and tensor.dim() == 2:
        # forward all options, including unk_string, to each row
        return "\n".join(
            self.string(t, bpe_symbol, escape_unk, extra_symbols_to_ignore, unk_string)
            for t in tensor
        )

    extra_symbols_to_ignore = set(extra_symbols_to_ignore or [])
    # eos is not added to the ignore set here:
    # extra_symbols_to_ignore.add(self.eos())

    def token_string(i):
        if i == self.unk():
            if unk_string is not None:
                return unk_string
            else:
                return self.unk_string(escape_unk)
        else:
            return self[i]

    if hasattr(self, "bos_index"):
        extra_symbols_to_ignore.add(self.bos())

    sent = " ".join(
        token_string(i)
        for i in tensor
        if utils.item(i) not in extra_symbols_to_ignore
    )
    return data_utils.process_bpe_symbol(sent, bpe_symbol)
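
# Usage sketch for the extended string() above (hypothetical demo; assumes a
# fairseq-style Dictionary). extra_symbols_to_ignore drops arbitrary special
# indices and unk_string overrides how unknown tokens are rendered:
def _demo_extended_string(d, tokens):
    return d.string(
        tokens,
        bpe_symbol='@@ ',
        extra_symbols_to_ignore={d.pad(), d.eos()},
        unk_string='<oov>',  # printed in place of every unknown token
    )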
def create_ner(tokens, src_dict):
    """Given a 1-D tensor of token indices, return a same-shaped tensor of NER tag indices."""
    entities = torch.zeros_like(tokens)
    token_list = [src_dict[idx] for idx in tokens]
    pad_idx = src_dict.pad()
    txt = ' '.join(token_list)
    txt = data_utils.process_bpe_symbol(txt, ' ##')
    doc = nlp(txt)
    _, alignments = align_tokens(doc, token_list)
    for i in range(len(alignments)):
        spacy_token = doc[i]
        ent_type = pad_idx + 1
        if spacy_token.ent_type_ in ENTITY_TYPES:
            ent_type += ENTITY_TYPES[spacy_token.ent_type_]
        for wp_idx in alignments[i]:
            entities[wp_idx] = ent_type
    return entities
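
# Usage sketch (hypothetical): tag a single encoded sentence. Tag values start
# at pad_idx + 1 ("no entity") and are offset by ENTITY_TYPES for recognized
# spans, so they can be embedded alongside the token indices:
def _demo_single_ner(tokens_1d, src_dict):
    tags = create_ner(tokens_1d, src_dict)
    assert tags.shape == tokens_1d.shape
    return tags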
def string(self, tensor, bpe_symbol=None, escape_unk=False):
    """Helper for converting a tensor of token indices to a string.

    Can optionally remove BPE symbols or escape <unk> words.
    """
    if torch.is_tensor(tensor) and tensor.dim() == 2:
        return "\n".join(
            self.string(t, bpe_symbol, escape_unk) for t in tensor)

    def token_string(i):
        if i == self.unk():
            return self.unk_string(escape_unk)
        else:
            return self[i]

    if hasattr(self, "bos_index"):
        sent = " ".join(
            token_string(i)
            for i in tensor
            if (i != self.eos()) and (i != self.bos()))
    else:
        sent = " ".join(token_string(i) for i in tensor if i != self.eos())
    return data_utils.process_bpe_symbol(sent, bpe_symbol)
def __init__(self, task, models, args, src_bpe=None, bpe_symbol='@@ '):
    self.task = task
    self.models = models
    self.src_dict = task.source_dictionary
    self.tgt_dict = task.target_dictionary
    self.src_bpe = src_bpe
    self.use_cuda = torch.cuda.is_available() and not args.cpu
    self.args = args

    # optimize model for generation
    for model in self.models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if self.args.no_beamable_mm else self.args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if self.use_cuda:
            model.cuda()

    self.generator = self.task.build_generator(args)

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    self.align_dict = utils.load_align_dict(args.replace_unk)
    self.max_positions = utils.resolve_max_positions(
        self.task.max_positions(),
        *[model.max_positions() for model in models]
    )

    self.in_transforms = []
    self.out_transforms = []
    if getattr(args, 'moses', False):
        tokenizer = MosesTokenizer(lang=args.source_lang or 'en')
        detokenizer = MosesDetokenizer(lang=args.target_lang or 'en')
        self.in_transforms.append(lambda s: tokenizer.tokenize(s, return_str=True))
        self.out_transforms.append(lambda s: detokenizer.detokenize(s.split()))
    elif getattr(args, 'nltk', False):
        from nltk.tokenize import word_tokenize
        self.in_transforms.append(lambda s: ' '.join(word_tokenize(s)))
    if getattr(args, 'gpt2_bpe', False):
        from fairseq.gpt2_bpe.gpt2_encoding import get_encoder
        encoder_json = os.path.join(os.path.dirname(src_bpe), 'encoder.json')
        vocab_bpe = src_bpe
        encoder = get_encoder(encoder_json, vocab_bpe)
        self.in_transforms.append(lambda s: ' '.join(map(str, encoder.encode(s))))
        self.out_transforms.append(lambda s: ' '.join(t for t in s.split() if t != '<unk>'))
        self.out_transforms.append(lambda s: encoder.decode(map(int, s.strip().split())))
    elif getattr(args, 'sentencepiece', False):
        import sentencepiece as spm
        sp = spm.SentencePieceProcessor()
        sp.Load(src_bpe)
        self.in_transforms.append(lambda s: ' '.join(sp.EncodeAsPieces(s)))
        self.out_transforms.append(lambda s: data_utils.process_bpe_symbol(s, 'sentencepiece'))
    elif src_bpe is not None:
        bpe_parser = apply_bpe.create_parser()
        bpe_args = bpe_parser.parse_args(['--codes', self.src_bpe])
        bpe = apply_bpe.BPE(
            bpe_args.codes, bpe_args.merges, bpe_args.separator,
            None, bpe_args.glossaries,
        )
        self.in_transforms.append(lambda s: bpe.process_line(s))
        self.out_transforms.append(lambda s: data_utils.process_bpe_symbol(s, bpe_symbol))
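
# Usage sketch (hypothetical): in_transforms preprocess raw input text in
# order (tokenize, then BPE-encode) before binarization; out_transforms
# postprocess hypothesis strings, in the order defined by the calling code,
# which is not part of this excerpt:
def _demo_preprocess(interface, line):
    for transform in interface.in_transforms:
        line = transform(line)
    return line  # ready to be mapped to indices and fed to interface.generator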