def load_vocab(self, codes_path, vocab_path):
    self.tokenizer = fastBPE.fastBPE(codes_path, vocab_path)
    with open(vocab_path, 'r') as f:
        vocabs = [l.split(' ')[0].strip() for l in f.readlines()]
    for idx, word in enumerate(self.special_tokens + vocabs, 0):
        self.word2idx[word] = idx
        self.idx2word[idx] = word
def addLanguageModelFeatures(translations, FairseqWrapper, dataSet, lmModel):
    bpe = fastBPE.fastBPE(const.BPE_CODE)
    translation_text = [translation.hypothesis for translation in translations]
    bpe_text = bpe.apply(translation_text)

    bpe_translations = open(const.BPE_TRANSLATIONS, "w")
    bpe_translations.writelines(bpe_text)
    bpe_translations.close()

    FairseqWrapper.runFairseqPreprocessLM(const.BPE_DICTIONARY, dataSet + "pref",
                                          const.BPE_TRANSLATIONS, const.BPE_PREPROCESSED_TRNS)
    FairseqWrapper.runFairseqEvalLM(const.BPE_PREPROCESSED_TRNS, lmModel, 128, 1024,
                                    dataSet, const.TRANSLATION_LM_SCORE)

    translation_lm_scores = open(const.TRANSLATION_LM_SCORE, 'r')
    for translation in translation_lm_scores:
        index = translation.split(" ")[0]
        if index.isdigit():
            scores = translation.split("[")[1:]
            lmScore = mean([float(i.split("]")[0]) for i in scores])
            translations[int(index)].lmScore = lmScore
    translation_lm_scores.close()
def __init__(self, vocab_file, merges_file, normalization=False, bos_token="<s>",
             eos_token="</s>", sep_token="</s>", cls_token="<s>", unk_token="<unk>",
             pad_token="<pad>", mask_token="<mask>", **kwargs):
    super().__init__(
        max_len=128,
        bos_token=bos_token,
        eos_token=eos_token,
        unk_token=unk_token,
        sep_token=sep_token,
        cls_token=cls_token,
        pad_token=pad_token,
        mask_token=mask_token,
        **kwargs,
    )
    self.vocab = Dictionary()
    self.vocab.add_from_file(vocab_file)
    self.bpe = fastBPE.fastBPE(merges_file)
    self.vocab_file = vocab_file
    self.merges_file = merges_file
    self.normalization = normalization
    self.tokenizerTweet = TweetTokenizer()
def __init__(self, vocab_file, merges_file, bos_token="<s>", eos_token="</s>",
             sep_token="</s>", cls_token="<s>", unk_token="<unk>", pad_token="<pad>",
             mask_token="<mask>", **kwargs):
    super().__init__(
        max_len=256,
        bos_token=bos_token,
        eos_token=eos_token,
        unk_token=unk_token,
        sep_token=sep_token,
        cls_token=cls_token,
        pad_token=pad_token,
        mask_token=mask_token,
        **kwargs,
    )
    self.vocab = Dictionary()
    self.vocab.add_from_file(vocab_file)
    self.bpe = fastBPE.fastBPE(merges_file)
    self.vocab_file = vocab_file
    self.merges_file = merges_file
def __setstate__(self, state):
    from fastBPE import fastBPE
    with tempfile.NamedTemporaryFile() as codes, tempfile.NamedTemporaryFile() as vocab:
        codes.write(state['codes'])
        vocab.write(state['vocab'])
        # Flush so the bytes are on disk before fastBPE re-opens the files by name.
        codes.flush()
        vocab.flush()
        self.bpe = fastBPE(codes.name, vocab.name)
def __init__(self, dictionary: Dict[str, int]) -> None:
    super().__init__()
    if not os.path.exists(saving_directory):
        os.makedirs(saving_directory)
    download_file_maybe_extract(
        L93_CODES_URL, directory=saving_directory, check_files=[L93_CODES_FILE]
    )
    download_file_maybe_extract(
        L93_VOCAB_URL, directory=saving_directory, check_files=[L93_VOCAB_FILE]
    )
    self.bpe = fastBPE.fastBPE(
        saving_directory + L93_CODES_FILE, saving_directory + L93_VOCAB_FILE
    )
    self.bpe_symbol = "@@ "

    # Properties from the base class
    self.stoi = dictionary
    self.itos = [key for key in dictionary.keys()]
    self._pad_index = dictionary["<pad>"]
    self._eos_index = dictionary["</s>"]
    self._unk_index = dictionary["<unk>"]
    self._mask_index = None
def initialise_bpe():
    global bpe
    FCODES_PATH = LASER + "/models/93langs.fcodes"
    FVOCAB_PATH = LASER + "/models/93langs.fvocab"
    bpe = fastBPE.fastBPE(FCODES_PATH, FVOCAB_PATH)
def __init__(self, codes_path, vocab_path):
    from fastBPE import fastBPE
    codes_path = get_file_or_url(codes_path)
    vocab_path = get_file_or_url(vocab_path)
    with open(codes_path, 'rb') as rf:
        self.codes = rf.read()
    with open(vocab_path, 'rb') as rf:
        self.vocab = rf.read()
    self.bpe = fastBPE(codes_path, vocab_path)
def mytest20190509():
    import fastBPE
    bpe = fastBPE.fastBPE("data/processed/en-zh/codes",
                          "data/processed/en-zh/vocab.en-zh")
    re = bpe.apply([
        "Roasted barramundi fish",
        "Centrally managed over a client-server architecture"
    ])
    print(re)
def __init__(self, args):
    if args.bpe_codes is None:
        raise ValueError('--bpe-codes is required for --bpe=fastbpe')
    codes = file_utils.cached_path(args.bpe_codes)
    try:
        import fastBPE
        self.bpe = fastBPE.fastBPE(codes)
        self.bpe_symbol = "@@ "
    except ImportError:
        raise ImportError('Please install fastBPE with: pip install fastBPE')
def __init__(self):
    dirname = os.path.dirname(__file__)
    mecab = MeCab.Tagger('-Owakati')
    mecab.parse('')
    self.mecab = mecab
    codes_path = os.path.join(dirname, '93langs.fcodes')
    vocab_path = os.path.join(dirname, '93langs.fvocab')
    self.bpe = fastBPE.fastBPE(codes_path, vocab_path)
    model_path = os.path.join(dirname, 'bilstm.93langs.2018-12-26.pt')
    self.enc = SentenceEncoder(model_path)
def __init__(self, codes_path: str) -> None:
    """Initialize the tokenizer.

    Parameters
    ----------
    codes_path : str
        Path to codes file created using fastBPE.

    """
    self.bpe = fastBPE.fastBPE(codes_path)
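A minimal usage sketch of the underlying fastBPE call that tokenizers like the one above wrap; the codes path and input sentence are placeholders, and the exact output depends on the learned merges:

import fastBPE

# "bpe.codes" is a hypothetical path to codes learned with fastBPE (fast learnbpe).
bpe = fastBPE.fastBPE("bpe.codes")
pieces = bpe.apply(["the quick brown fox jumps over the lazy dog"])
print(pieces)  # e.g. ['the qu@@ ick brown fox ...'], depending on the codes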
def __init__(self, params):
    reloaded = torch.load(params.model_path, map_location='cpu')
    # print(reloaded['dico_word2id']['while'])
    # print(reloaded['dico_word2id']['return'])
    # print(reloaded['dico_word2id']['if'])
    # print(reloaded['encoder'].keys())
    # print(reloaded['decoder'].keys())
    reloaded['encoder'] = {
        (k[len('module.'):] if k.startswith('module.') else k): v
        for k, v in reloaded['encoder'].items()
    }
    assert 'decoder' in reloaded or (
        'decoder_0' in reloaded and 'decoder_1' in reloaded)
    if 'decoder' in reloaded:
        decoders_names = ['decoder']
    else:
        decoders_names = ['decoder_0', 'decoder_1']
    for decoder_name in decoders_names:
        reloaded[decoder_name] = {
            (k[len('module.'):] if k.startswith('module.') else k): v
            for k, v in reloaded[decoder_name].items()
        }

    self.reloaded_params = AttrDict(reloaded['params'])

    # build dictionary / update parameters
    self.dico = Dictionary(
        reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
    assert self.reloaded_params.n_words == len(self.dico)
    assert self.reloaded_params.bos_index == self.dico.index(BOS_WORD)
    assert self.reloaded_params.eos_index == self.dico.index(EOS_WORD)
    assert self.reloaded_params.pad_index == self.dico.index(PAD_WORD)
    assert self.reloaded_params.unk_index == self.dico.index(UNK_WORD)
    assert self.reloaded_params.mask_index == self.dico.index(MASK_WORD)

    # build model / reload weights
    self.reloaded_params['reload_model'] = ','.join([params.model_path] * 2)
    encoder, decoder = build_model(self.reloaded_params, self.dico)

    self.encoder = encoder[0]
    self.encoder.load_state_dict(reloaded['encoder'])
    assert len(reloaded['encoder'].keys()) == len(
        list(p for p, _ in self.encoder.state_dict().items()))

    self.decoder = decoder[0]
    self.decoder.load_state_dict(reloaded['decoder'])
    assert len(reloaded['decoder'].keys()) == len(
        list(p for p, _ in self.decoder.state_dict().items()))

    # self.encoder.to('cpu')  # cuda()
    # self.decoder.to('cpu')  # cuda()
    self.encoder.cuda()
    self.decoder.cuda()

    self.encoder.eval()
    self.decoder.eval()

    self.bpe_model = fastBPE.fastBPE(os.path.abspath(params.BPE_path))
def __init__(self, cfg):
    if cfg.bpe_codes is None:
        raise ValueError("--bpe-codes is required for --bpe=fastbpe")
    codes = file_utils.cached_path(cfg.bpe_codes)
    try:
        import fastBPE
        self.bpe = fastBPE.fastBPE(codes)
        self.bpe_symbol = "@@ "
    except ImportError:
        raise ImportError("Please install fastBPE with: pip install fastBPE")
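Wrappers like the one above set bpe_symbol = "@@ " so generated text can later be detokenized by stripping that continuation marker. The helper below is an illustrative sketch of that decode step, not code taken from the snippet itself:

def remove_bpe(text: str, bpe_symbol: str = "@@ ") -> str:
    # Append a space first so a trailing "x@@" token at the end of the line is also caught.
    return (text + " ").replace(bpe_symbol, "").rstrip()

# e.g. remove_bpe("the qu@@ ick brown fox") -> "the quick brown fox"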
def __init__(self, bpe_codes_fn: str, bpe_vocab_fn: str, output_lower: bool):
    self.nlp = spacy.load('en_core_web_sm', disable=[
        'tagger', 'parser', 'ner', 'entity_linker', 'textcat', 'entity_ruler',
        'sentencizer', 'merge_noun_chunks', 'merge_entities', 'merge_subtokens'
    ])
    self.bpe = fastBPE.fastBPE(bpe_codes_fn, bpe_vocab_fn)
    self.output_lower = output_lower
def apply(file: str, codes: str) -> None:
    start = time.time()
    f = sys.stdin if file == "-" else open(file, mode="r")
    bpe = fastBPE.fastBPE(codes, "")
    # Count from 1 so the summary below reports the actual number of sentences.
    for i, line in enumerate(f, start=1):
        s = bpe.apply([line[:-1]])[0]
        print(s)
    delay = time.time() - start
    print(
        f"Computed BPE on {i} sentences in {delay:.2f}s, "
        "using cython wrapper around cpp implementation",
        file=sys.stderr)
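A short usage sketch of the helper above; the file names are placeholders:

# Apply BPE codes to a tokenized corpus file, or pass "-" to read from stdin.
apply("corpus.tok.txt", "bpe.codes")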
def __init__(self, **kwargs):
    """Loads a BPE tokenizer"""
    super(BPEVectorizer1D, self).__init__(kwargs.get('transform_fn'))
    from fastBPE import fastBPE
    self.max_seen = 128
    self.model_file = kwargs.get('model_file')
    self.vocab_file = kwargs.get('vocab_file')
    self.tokenizer = fastBPE(self.model_file, self.vocab_file)
    self.mxlen = kwargs.get('mxlen', -1)
    self.vocab = {k: i for i, k in enumerate(self.read_vocab(self.vocab_file))}
def __init__(self, codes_path: str, nltk_tokenize_first: bool = False) -> None:
    """Initialize the tokenizer.

    Parameters
    ----------
    codes_path : str
        Path to codes file created using fastBPE.

    """
    self.bpe = fastBPE.fastBPE(codes_path)
    self.nltk_tokenize_first = nltk_tokenize_first
    nltk.download('punkt', quiet=True)
def __init__(self, src_lang, tgt_lang):
    model_path = TranscoderClient.get_model_path(src_lang, tgt_lang)
    reloaded = torch.load(model_path, map_location='cpu')
    reloaded['encoder'] = {
        (k[len('module.'):] if k.startswith('module.') else k): v
        for k, v in reloaded['encoder'].items()
    }
    assert 'decoder' in reloaded or (
        'decoder_0' in reloaded and 'decoder_1' in reloaded)
    if 'decoder' in reloaded:
        decoders_names = ['decoder']
    else:
        decoders_names = ['decoder_0', 'decoder_1']
    for decoder_name in decoders_names:
        reloaded[decoder_name] = {
            (k[len('module.'):] if k.startswith('module.') else k): v
            for k, v in reloaded[decoder_name].items()
        }

    self.reloaded_params = AttrDict(reloaded['params'])

    # build dictionary / update parameters
    self.dico = Dictionary(
        reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
    assert self.reloaded_params.n_words == len(self.dico)
    assert self.reloaded_params.bos_index == self.dico.index(BOS_WORD)
    assert self.reloaded_params.eos_index == self.dico.index(EOS_WORD)
    assert self.reloaded_params.pad_index == self.dico.index(PAD_WORD)
    assert self.reloaded_params.unk_index == self.dico.index(UNK_WORD)
    assert self.reloaded_params.mask_index == self.dico.index(MASK_WORD)

    # build model / reload weights
    self.reloaded_params['reload_model'] = ','.join([model_path] * 2)
    encoder, decoder = build_model(self.reloaded_params, self.dico)

    self.encoder = encoder[0]
    self.encoder.load_state_dict(reloaded['encoder'])
    assert len(reloaded['encoder'].keys()) == len(
        list(p for p, _ in self.encoder.state_dict().items()))

    self.decoder = decoder[0]
    self.decoder.load_state_dict(reloaded['decoder'])
    assert len(reloaded['decoder'].keys()) == len(
        list(p for p, _ in self.decoder.state_dict().items()))

    self.encoder.cuda()
    self.decoder.cuda()

    self.encoder.eval()
    self.decoder.eval()

    self.bpe_model = fastBPE.fastBPE(os.path.abspath(BPE_PATH))
    self.allowed_languages = [lang.value for lang in Languages]
def to_bpe_py(sentences, codes: str, vocab: str = ""):
    """
    One way to BPE-ize sentences in Python.

    The downstream model expects its input in BPE format, i.e. tokenized
    sentences on which fastBPE has been applied; this helper produces that.

    sentences : list of sentences to bpe-ize
    codes : path to the codes of the model
    vocab (optional) : path to the vocab of the model
    installation : pip install fastbpe
    """
    # return sentences
    import fastBPE
    # if not os.path.isfile(vocab):
    #     vocab = ""
    return fastBPE.fastBPE(codes, vocab).apply(sentences)
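A hedged usage sketch of to_bpe_py; the codes/vocab paths and input sentences are placeholders:

sentences = ["this is a tokenized sentence .", "another short example ."]
bpe_sentences = to_bpe_py(sentences, codes="codes.en", vocab="vocab.en")
print(bpe_sentences)  # one BPE-split string per input sentence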
def __init__(self, params):
    self.params = params
    logger.info("")

    assert len(params.langs) == 2, "Need two languages"

    lan0_dict_path = os.path.join(params.data_path, "vocab.%s" % params.langs[0])
    lan1_dict_path = os.path.join(params.data_path, "vocab.%s" % params.langs[1])
    all_dict_path = os.path.join(params.data_path,
                                 "vocab.%s-%s" % (params.langs[0], params.langs[1]))
    assert os.path.isfile(lan0_dict_path) and os.path.isfile(lan1_dict_path) \
        and os.path.isfile(all_dict_path)

    logger.info("Converter: Read %s monolingual vocabulary..." % params.id2lang[0])
    self.lan0_vocab = Dictionary.read_vocab(lan0_dict_path)
    logger.info("Converter: Read %s monolingual vocabulary..." % params.id2lang[1])
    self.lan1_vocab = Dictionary.read_vocab(lan1_dict_path)
    logger.info("Converter: Read monolingual vocabulary for both languages...")
    self.all_vocab = Dictionary.read_vocab(all_dict_path)

    self.code_BOS_WORD = self.all_vocab.index(BOS_WORD)
    self.code_EOS_WORD = self.all_vocab.index(EOS_WORD)
    self.code_PAD_WORD = self.all_vocab.index(PAD_WORD)
    self.code_UNK_WORD = self.all_vocab.index(UNK_WORD)

    lan0_para_dict_path = os.path.join(
        params.data_path,
        "dict.%s-%s.%s.a%s" % (params.langs[0], params.langs[1], params.langs[0], 100))
    lan1_para_dict_path = os.path.join(
        params.data_path,
        "dict.%s-%s.%s.a%s" % (params.langs[0], params.langs[1], params.langs[1], 100))
    if params.debug_dict:
        lan0_para_dict_path = lan0_para_dict_path.replace('100', '1000')
        lan1_para_dict_path = lan1_para_dict_path.replace('100', '1000')
    assert os.path.isfile(lan0_para_dict_path) and os.path.isfile(lan1_para_dict_path)

    logger.info("Converter: Read parallel dictionary for language %s..." % params.langs[0])
    self.dict_lan0 = load_para_dict(lan0_para_dict_path)
    logger.info("Converter: Read parallel dictionary for language %s..." % params.langs[1])
    self.dict_lan1 = load_para_dict(lan1_para_dict_path)

    logger.info("Converter: Loading bpe...")
    codes_path = os.path.join(params.data_path, "codes")
    self.bpe = fastBPE.fastBPE(codes_path, all_dict_path)

    self.all_word_counter = 0
    self.changed_word_counter = 0

    # logger.info("Process parallel dictionary for language 0...")
    # convert_number_to_prob(self.dict_lan0)
    # logger.info("Process parallel dictionary for language 1...")
    # convert_number_to_prob(self.dict_lan1)

    logger.info("")
def init_sentence_encoder():
    global MODEL
    model_dir = Path(__file__).parent / "LASER" / "models"
    encoder_path = model_dir / "bilstm.93langs.2018-12-26.pt"
    bpe_codes = str(model_dir / "93langs.fcodes")
    LOGGER.info(f' - Encoder: loading {encoder_path}')
    encoder = SentenceEncoder(encoder_path,
                              max_sentences=None,
                              max_tokens=12000,
                              sort_kind='mergesort',
                              cpu=True)
    print(encoder)
    bpe = None
    print(bpe_codes.replace('fcodes', 'fvocab'))
    bpe = fastBPE.fastBPE(bpe_codes, bpe_codes.replace('fcodes', 'fvocab'))
    print(bpe)
    MODEL = Model(bpe=bpe, encoder=encoder, tokenizer=tokenize)
def load_examples(path_to_train_file, vocab, seq_len):
    train_examples = [json.loads(line) for line in open(path_to_train_file)]
    train_codes, train_corpus = zip(*[(example['control_code'], example['text'])
                                      for example in train_examples])

    bpe = fastBPE.fastBPE(CODES_FILE, VOCAB_FILE)
    tokenized_train_corpus = bpe.apply(train_corpus)
    tokenized_train_corpus = [preprocess_text(tokenized_train_text)
                              for tokenized_train_text in tokenized_train_corpus]

    examples = []
    for control_code, text in zip(train_codes, tokenized_train_corpus):
        if control_code not in vocab:
            raise RuntimeError(f'{control_code} not in vocab')
        for i in range(0, len(text), seq_len):
            text_chunk = text[i: i + seq_len]
            if len(text_chunk) != seq_len:
                break
            examples.append((control_code, text_chunk))
    return examples
    config=run_config)

# we now create a serving function from this estimator
# this enables us to load the model once and easily query it multiple times
def serving_input_fn():
    inputs = {'input_1': tf.placeholder(tf.int32, [1, seq_length])}
    return tf.estimator.export.ServingInputReceiver(inputs, inputs)


predict_fn = tf.contrib.predictor.from_estimator(estimator_model, serving_input_fn)

# almost there, we now take the user prompt and tokenize with BPE
# load BPE codes
bpe = fastBPE.fastBPE('codes', 'vocab')

temperature = args.temperature
nucleusprob = args.nucleus
penalty = args.penalty
topk = args.topk

while True:
    prompt = raw_input('ENTER PROMPT: ') if not use_py3 else input('ENTER PROMPT: ')

    # tokenize provided prompt
    split_prompt = bpe.apply([prompt])[0].split()
    text = [word2idx[i] for i in split_prompt]

    # pad with 0s and create a mini-batch of 2 (arbitrary, for ease of code)
shuffler = SentenceShuffler.chunk_shuffler()
par_ds = StreamingParallelDataset(cl)
noise_ds = StreamingCANoiseDataset(cl, shuffler, 0., 0., 0., 1., 0., 0.)
ds = StreamingChainedDataset(cl, [par_ds, noise_ds])
b = next(iter(ds))

import spacy
import fastBPE

nlp = spacy.load('en', disable=[
    'tagger', 'parser', 'ner', 'entity_linker', 'textcat', 'entity_ruler',
    'sentencizer', 'merge_noun_chunks', 'merge_entities', 'merge_subtokens'
])
bpe = fastBPE.fastBPE('temp/datasets/bcu_enwiki.30000.codes',
                      'temp/datasets/bcu_enwiki_spacy.30000.bpe.vocab')

# device = 'cpu'
model = TransformerS2S(len(vocab),
                       config['TransformerS2S']['emb_dim'],
                       config['TransformerS2S']['n_head'],
                       config['TransformerS2S']['ff_dim'],
                       config['TransformerS2S']['num_enc_layers'],
                       config['TransformerS2S']['num_dec_layers'],
                       config['TransformerS2S']['activation'])
pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
optimizer = optim.Adam(model.parameters(),
                       lr=config['optimizer']['adam']['lr'],
                       betas=(config['optimizer']['adam']['beta_1'],
def from_paths(base_puzzle_gen, train_file_path, vocab_file_path, num_tok):
    vocab = BpePuzzleGenerator._read_vocab(vocab_file_path)
    bpe = fastBPE.fastBPE(train_file_path, vocab_file_path)
    return BpePuzzleGenerator(base_puzzle_gen, vocab, bpe, num_tok)
def BPEfastLoad(line, bpe_codes):
    bpe_vocab = bpe_codes.replace('fcodes', 'fvocab')
    return fastBPE.fastBPE(bpe_codes, bpe_vocab)
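A hedged usage sketch of BPEfastLoad; the path is a placeholder and assumes the LASER convention of paired .fcodes/.fvocab files:

# The `line` argument is not used inside BPEfastLoad, so any value can be passed.
bpe = BPEfastLoad(None, "/path/to/93langs.fcodes")
pieces = bpe.apply(["Hello world !"])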
    help='control code to use for this file. must be in the vocabulary, else it will error out.')
parser.add_argument(
    '--sequence_len',
    type=int,
    required=True,
    help='sequence length of model being fine-tuned (256 or 512)')
args = parser.parse_args()

path_to_train_file = fname = args.text_file

domain = [args.control_code]

train_text = open(path_to_train_file, 'rb').read().decode(encoding='utf-8')

bpe = fastBPE.fastBPE('../codes', '../vocab')

tokenized_train_text = bpe.apply([
    train_text.encode('ascii', errors='ignore') if not use_py3 else train_text
])[0]  # will NOT work for non-English texts
# if you want to run non-english text, please tokenize separately using ./fast applybpe
# and then run this script on the .bpe file with utf8 encoding
tokenized_train_text = re.findall(r'\S+|\n', tokenized_train_text)
tokenized_train_text = list(filter(lambda x: x != u'@@', tokenized_train_text))

# load the vocabulary from file
vocab = open('../vocab').read().decode(
    encoding='utf-8').split('\n') if not use_py3 else open(
        '../vocab', encoding='utf-8').read().split('\n')
vocab = list(map(lambda x: x.split(' ')[0], vocab)) + ['<unk>'] + ['\n']
print('{} unique words'.format(len(vocab)))
def __init__(self, code_file, vocab_file, dictionary: Dictionary):
    self.bpe = fastBPE.fastBPE(str(code_file), str(vocab_file))
    self.dictionary = dictionary
    self.n_w = 0
    self.n_oov = 0
import jionlp as jio
import jieba
import fastBPE
from mosestokenizer import *
from pyltp import SentenceSplitter
import nltk.data
from utils import *
from flashtext import KeywordProcessor

keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('Suzhou University', 'Soochow University')
jieba.initialize()
zh_bpe = fastBPE.fastBPE('../data-bin/codes.zh', '../data-bin/dict.zh.txt')
en_bpe = fastBPE.fastBPE('../data-bin/codes.en', '../data-bin/dict.en.txt')
tokenizer = MosesTokenizer()
detokenizer = MosesDetokenizer()
punc_norm = MosesPunctuationNormalizer()
en_spliter = nltk.data.load('tokenizers/punkt/english.pickle')
case_model = '../data-bin/truecase-model.en'
user_dict_path = '../data-bin/user_dict'
case_dict = load_case_model(case_model)
user_dict = load_user_dict(user_dict_path)


def sep_lines(lines):
    sep = []
    for line in lines:
        sep.append(line.count('ENTER'))
    return sep
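A hedged sketch of how the English-side globals above might be chained (normalize punctuation, tokenize, then apply BPE); the input sentence is made up and the call pattern is an assumption, not code from the module:

raw = "BPE makes open-vocabulary translation tractable."
tokens = tokenizer(punc_norm(raw))            # Moses punctuation normalization + tokenization
bpe_line = en_bpe.apply([" ".join(tokens)])[0]
print(bpe_line)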