Example #1
 def load_vocab(self, codes_path, vocab_path):
     self.tokenizer = fastBPE.fastBPE(codes_path, vocab_path)
     with open(vocab_path, 'r') as f:
         vocabs = [l.split(' ')[0].strip() for l in f.readlines()]
         for idx, word in enumerate(self.special_tokens + vocabs, 0):
             self.word2idx[word] = idx
             self.idx2word[idx] = word
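A hedged usage sketch of the loader above, assuming an instance named loader that exposes the tokenizer and word2idx attributes built by load_vocab (the instance name and the "<unk>" fallback are illustrative):
# Segment with BPE, then map each subword piece to its index.
subwords = loader.tokenizer.apply(["hello world"])[0].split()
ids = [loader.word2idx.get(tok, loader.word2idx.get("<unk>", 0)) for tok in subwords]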
Example #2
def addLanguageModelFeatures(translations, FairseqWrapper, dataSet, lmModel):
    bpe = fastBPE.fastBPE(const.BPE_CODE)
    translation_text = [translation.hypothesis for translation in translations]
    bpe_text = bpe.apply(translation_text)

    with open(const.BPE_TRANSLATIONS, "w") as bpe_translations:
        bpe_translations.writelines(bpe_text)

    FairseqWrapper.runFairseqPreprocessLM(const.BPE_DICTIONARY,
                                          dataSet + "pref",
                                          const.BPE_TRANSLATIONS,
                                          const.BPE_PREPROCESSED_TRNS)
    FairseqWrapper.runFairseqEvalLM(const.BPE_PREPROCESSED_TRNS, lmModel, 128,
                                    1024, dataSet, const.TRANSLATION_LM_SCORE)

    with open(const.TRANSLATION_LM_SCORE, 'r') as translation_lm_scores:
        for translation in translation_lm_scores:
            index = translation.split(" ")[0]
            if index.isdigit():
                scores = translation.split("[")[1:]
                lmScore = mean([float(i.split("]")[0]) for i in scores])
                translations[int(index)].lmScore = lmScore
Example #3
    def __init__(self,
                 vocab_file,
                 merges_file,
                 normalization=False,
                 bos_token="<s>",
                 eos_token="</s>",
                 sep_token="</s>",
                 cls_token="<s>",
                 unk_token="<unk>",
                 pad_token="<pad>",
                 mask_token="<mask>",
                 **kwargs):
        super().__init__(
            max_len=128,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            **kwargs,
        )
        self.vocab = Dictionary()
        self.vocab.add_from_file(vocab_file)
        self.bpe = fastBPE.fastBPE(merges_file)

        self.vocab_file = vocab_file
        self.merges_file = merges_file
        self.normalization = normalization

        self.tokenizerTweet = TweetTokenizer()
Example #4
    def __init__(self,
                 vocab_file,
                 merges_file,
                 bos_token="<s>",
                 eos_token="</s>",
                 sep_token="</s>",
                 cls_token="<s>",
                 unk_token="<unk>",
                 pad_token="<pad>",
                 mask_token="<mask>",
                 **kwargs):
        super().__init__(
            max_len=256,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            **kwargs,
        )
        self.vocab = Dictionary()
        self.vocab.add_from_file(vocab_file)
        self.bpe = fastBPE.fastBPE(merges_file)

        self.vocab_file = vocab_file
        self.merges_file = merges_file
Example #5
 def __setstate__(self, state):
     from fastBPE import fastBPE
     with tempfile.NamedTemporaryFile(
     ) as codes, tempfile.NamedTemporaryFile() as vocab:
         codes.write(state['codes'])
         vocab.write(state['vocab'])
         # Flush so the files are fully written before fastBPE reads them by name.
         codes.flush()
         vocab.flush()
         self.bpe = fastBPE(codes.name, vocab.name)
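The matching __getstate__ is not shown; a minimal sketch of a plausible counterpart, assuming the instance also keeps the raw codes and vocab bytes as in Example #8 (this is an assumption, not the original implementation):
 def __getstate__(self):
     # Hypothetical counterpart: drop the unpicklable fastBPE handle and keep
     # the raw 'codes'/'vocab' bytes that __setstate__ expects.
     state = self.__dict__.copy()
     state.pop('bpe', None)
     return state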
Example #6
    def __init__(self, dictionary: Dict[str, int]) -> None:
        super().__init__()
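        # `saving_directory` is assumed to be defined elsewhere in the module and
        # to end with a path separator, since it is concatenated directly with
        # the file names below.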

        if not os.path.exists(saving_directory):
            os.makedirs(saving_directory)

        download_file_maybe_extract(L93_CODES_URL,
                                    directory=saving_directory,
                                    check_files=[L93_CODES_FILE])

        download_file_maybe_extract(L93_VOCAB_URL,
                                    directory=saving_directory,
                                    check_files=[L93_VOCAB_FILE])

        self.bpe = fastBPE.fastBPE(saving_directory + L93_CODES_FILE,
                                   saving_directory + L93_VOCAB_FILE)
        self.bpe_symbol = "@@ "

        # Properties from the base class
        self.stoi = dictionary
        self.itos = [key for key in dictionary.keys()]
        self._pad_index = dictionary["<pad>"]
        self._eos_index = dictionary["</s>"]
        self._unk_index = dictionary["<unk>"]
        self._mask_index = None
Example #7
def initialise_bpe():
    global bpe

    FCODES_PATH = LASER + "/models/93langs.fcodes"
    FVOCAB_PATH = LASER + "/models/93langs.fvocab"

    bpe = fastBPE.fastBPE(FCODES_PATH, FVOCAB_PATH)
Example #8
 def __init__(self, codes_path, vocab_path):
     from fastBPE import fastBPE
     codes_path = get_file_or_url(codes_path)
     vocab_path = get_file_or_url(vocab_path)
     with open(codes_path, 'rb') as rf:
         self.codes = rf.read()
     with open(vocab_path, 'rb') as rf:
         self.vocab = rf.read()
     self.bpe = fastBPE(codes_path, vocab_path)
Example #9
def mytest20190509():
    import fastBPE
    bpe = fastBPE.fastBPE("data/processed/en-zh/codes",
                          "data/processed/en-zh/vocab.en-zh")
    re = bpe.apply([
        "Roasted barramundi fish",
        "Centrally managed over a client-server architecture"
    ])
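    # apply() returns one string per input sentence; word-internal pieces end
    # with the "@@" continuation marker (the exact splits depend on the codes).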
    print(re)
Example #10
 def __init__(self, args):
     if args.bpe_codes is None:
         raise ValueError('--bpe-codes is required for --bpe=subword_nmt')
     codes = file_utils.cached_path(args.bpe_codes)
     try:
         import fastBPE
         self.bpe = fastBPE.fastBPE(codes)
         self.bpe_symbol = "@@ "
     except ImportError:
         raise ImportError('Please install fastBPE with: pip install fastBPE')
Example #11
 def __init__(self):
     dirname = os.path.dirname(__file__)
     mecab = MeCab.Tagger('-Owakati')
     mecab.parse('')
     self.mecab = mecab
     codes_path = os.path.join(dirname, '93langs.fcodes')
     vocab_path = os.path.join(dirname, '93langs.fvocab')
     self.bpe = fastBPE.fastBPE(codes_path, vocab_path)
     model_path = os.path.join(dirname, 'bilstm.93langs.2018-12-26.pt')
     self.enc = SentenceEncoder(model_path)
Example #12
    def __init__(self, codes_path: str) -> None:
        """Initialize the tokenizer.

        Parameters
        ----------
        codes_path : str
            Path to codes file created using
            fastBPE.

        """
        self.bpe = fastBPE.fastBPE(codes_path)
Example #13
    def __init__(self, params):
        reloaded = torch.load(params.model_path, map_location='cpu')
#        print(reloaded['dico_word2id']['while'])
#        print(reloaded['dico_word2id']['return'])
#        print(reloaded['dico_word2id']['if'])
        
#        print(reloaded['encoder'].keys())
#        print(reloaded['decoder'].keys())
        reloaded['encoder'] = {(k[len('module.'):] if k.startswith('module.') else k): v for k, v in
                               reloaded['encoder'].items()}
        assert 'decoder' in reloaded or (
            'decoder_0' in reloaded and 'decoder_1' in reloaded)
        if 'decoder' in reloaded:
            decoders_names = ['decoder']
        else:
            decoders_names = ['decoder_0', 'decoder_1']
        for decoder_name in decoders_names:
            reloaded[decoder_name] = {(k[len('module.'):] if k.startswith('module.') else k): v for k, v in
                                      reloaded[decoder_name].items()}

        self.reloaded_params = AttrDict(reloaded['params'])

        # build dictionary / update parameters
        self.dico = Dictionary(
            reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
        assert self.reloaded_params.n_words == len(self.dico)
        assert self.reloaded_params.bos_index == self.dico.index(BOS_WORD)
        assert self.reloaded_params.eos_index == self.dico.index(EOS_WORD)
        assert self.reloaded_params.pad_index == self.dico.index(PAD_WORD)
        assert self.reloaded_params.unk_index == self.dico.index(UNK_WORD)
        assert self.reloaded_params.mask_index == self.dico.index(MASK_WORD)

        # build model / reload weights
        self.reloaded_params['reload_model'] = ','.join([params.model_path] * 2)
        encoder, decoder = build_model(self.reloaded_params, self.dico)

        self.encoder = encoder[0]
        self.encoder.load_state_dict(reloaded['encoder'])
        assert len(reloaded['encoder'].keys()) == len(
            list(p for p, _ in self.encoder.state_dict().items()))

        self.decoder = decoder[0]
        self.decoder.load_state_dict(reloaded['decoder'])
        assert len(reloaded['decoder'].keys()) == len(
            list(p for p, _ in self.decoder.state_dict().items()))

        #self.encoder.to('cpu') #cuda()
        #self.decoder.to('cpu') #cuda()
        self.encoder.cuda()
        self.decoder.cuda()

        self.encoder.eval()
        self.decoder.eval()
        self.bpe_model = fastBPE.fastBPE(os.path.abspath(params.BPE_path))
Example #14
    def __init__(self, cfg):
        if cfg.bpe_codes is None:
            raise ValueError("--bpe-codes is required for --bpe=fastbpe")
        codes = file_utils.cached_path(cfg.bpe_codes)
        try:
            import fastBPE

            self.bpe = fastBPE.fastBPE(codes)
            self.bpe_symbol = "@@ "
        except ImportError:
            raise ImportError("Please install fastBPE with: pip install fastBPE")
Example #15
 def __init__(self, bpe_codes_fn: str, bpe_vocab_fn: str,
              output_lower: bool):
     self.nlp = spacy.load('en_core_web_sm',
                           disable=[
                               'tagger', 'parser', 'ner', 'entity_linker',
                               'textcat', 'entity_ruler', 'sentencizer',
                               'merge_noun_chunks', 'merge_entities',
                               'merge_subtokens'
                           ])
     self.bpe = fastBPE.fastBPE(bpe_codes_fn, bpe_vocab_fn)
     self.output_lower = output_lower
Example #16
def apply(file: str, codes: str) -> None:
    start = time.time()
    f = sys.stdin if file == "-" else open(file, mode="r")
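    # An empty second argument means no vocabulary file is used, so the codes
    # are applied without vocabulary-based filtering.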
    bpe = fastBPE.fastBPE(codes, "")
    for i, line in enumerate(f, 1):
        s = bpe.apply([line[:-1]])[0]
        print(s)

    delay = time.time() - start
    print(
        f"Computed BPE on {i} sentences in {delay:.2f}s, using cython wrapper around cpp implementation",
        file=sys.stderr)
Example #17
 def __init__(self, **kwargs):
     """Loads a BPE tokenizer"""
     super(BPEVectorizer1D, self).__init__(kwargs.get('transform_fn'))
     from fastBPE import fastBPE
     self.max_seen = 128
     self.model_file = kwargs.get('model_file')
     self.vocab_file = kwargs.get('vocab_file')
     self.tokenizer = fastBPE(self.model_file, self.vocab_file)
     self.mxlen = kwargs.get('mxlen', -1)
     self.vocab = {
         k: i
         for i, k in enumerate(self.read_vocab(self.vocab_file))
     }
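read_vocab is referenced above but not defined in this snippet; a minimal sketch of what such a helper might do, given that fastBPE vocabulary files list one "token count" pair per line (this helper is an assumption, not part of the original class):
 def read_vocab(self, vocab_file):
     # Hypothetical helper: keep only the token column of each vocabulary line.
     with open(vocab_file, encoding='utf-8') as f:
         return [line.split(' ')[0].strip() for line in f if line.strip()]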
Example #18
    def __init__(self,
                 codes_path: str,
                 nltk_tokenize_first: bool = False) -> None:
        """Initialize the tokenizer.

        Parameters
        ----------
        codes_path : str
            Path to codes file created using
            fastBPE.

        """
        self.bpe = fastBPE.fastBPE(codes_path)
        self.nltk_tokenize_first = nltk_tokenize_first
        nltk.download('punkt', quiet=True)
Example #19
    def __init__(self, src_lang, tgt_lang):
        model_path = TranscoderClient.get_model_path(src_lang, tgt_lang)
        reloaded = torch.load(model_path, map_location='cpu')
        reloaded['encoder'] = {(k[len('module.'):] if k.startswith('module.') else k): v for k, v in
                               reloaded['encoder'].items()}
        assert 'decoder' in reloaded or (
            'decoder_0' in reloaded and 'decoder_1' in reloaded)
        if 'decoder' in reloaded:
            decoders_names = ['decoder']
        else:
            decoders_names = ['decoder_0', 'decoder_1']
        for decoder_name in decoders_names:
            reloaded[decoder_name] = {(k[len('module.'):] if k.startswith('module.') else k): v for k, v in
                                      reloaded[decoder_name].items()}

        self.reloaded_params = AttrDict(reloaded['params'])

        # build dictionary / update parameters
        self.dico = Dictionary(
            reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
        assert self.reloaded_params.n_words == len(self.dico)
        assert self.reloaded_params.bos_index == self.dico.index(BOS_WORD)
        assert self.reloaded_params.eos_index == self.dico.index(EOS_WORD)
        assert self.reloaded_params.pad_index == self.dico.index(PAD_WORD)
        assert self.reloaded_params.unk_index == self.dico.index(UNK_WORD)
        assert self.reloaded_params.mask_index == self.dico.index(MASK_WORD)

        # build model / reload weights
        self.reloaded_params['reload_model'] = ','.join([model_path] * 2)
        encoder, decoder = build_model(self.reloaded_params, self.dico)

        self.encoder = encoder[0]
        self.encoder.load_state_dict(reloaded['encoder'])
        assert len(reloaded['encoder'].keys()) == len(
            list(p for p, _ in self.encoder.state_dict().items()))

        self.decoder = decoder[0]
        self.decoder.load_state_dict(reloaded['decoder'])
        assert len(reloaded['decoder'].keys()) == len(
            list(p for p, _ in self.decoder.state_dict().items()))

        self.encoder.cuda()
        self.decoder.cuda()

        self.encoder.eval()
        self.decoder.eval()
        self.bpe_model = fastBPE.fastBPE(os.path.abspath(BPE_PATH))
        self.allowed_languages = [lang.value for lang in Languages]
Example #20
def to_bpe_py(sentences, codes: str, vocab: str = ""):
    """
    Below is one way to bpe-ize sentences
    Sentences have to be in the BPE format, i.e. tokenized sentences on which you applied fastBPE.
    
    sentences : list of sentence to bpe-ize
    codes : path to the codes of the model
    vocab (optional) : path to the vocab of the model
    
    installation : pip install fastbpe
    """
    #return sentences
    import fastBPE
    #if not os.path.isfile(vocab) :
    #    vocab = ""
    return fastBPE.fastBPE(codes, vocab).apply(sentences)
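A hedged usage sketch of to_bpe_py; the codes and vocab paths below are placeholders for the model's actual files:
# Illustrative call on already-tokenized input.
bpe_sentences = to_bpe_py(["this is a tokenized sentence ."],
                          codes="/path/to/codes", vocab="/path/to/vocab")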
Example #21
    def __init__(self, params):
        self.params = params
        logger.info("")
        assert len(params.langs) == 2, "Need two languages"
        lan0_dict_path = os.path.join(params.data_path, "vocab.%s" % params.langs[0])
        lan1_dict_path = os.path.join(params.data_path, "vocab.%s" % params.langs[1])
        all_dict_path = os.path.join(params.data_path, "vocab.%s-%s" % (params.langs[0], params.langs[1]))
        assert os.path.isfile(lan0_dict_path) and os.path.isfile(lan1_dict_path) and os.path.isfile(all_dict_path)
        logger.info("Converter: Read %s monolingual vocabulary..." % params.id2lang[0])
        self.lan0_vocab = Dictionary.read_vocab(lan0_dict_path)
        logger.info("Converter: Read %s monolingual vocabulary..." % params.id2lang[1])
        self.lan1_vocab = Dictionary.read_vocab(lan1_dict_path)
        logger.info("Converter: Read monolingual vocabulary for both languages...")
        self.all_vocab = Dictionary.read_vocab(all_dict_path)
        self.code_BOS_WORD = self.all_vocab.index(BOS_WORD)
        self.code_EOS_WORD = self.all_vocab.index(EOS_WORD)
        self.code_PAD_WORD = self.all_vocab.index(PAD_WORD)
        self.code_UNK_WORD = self.all_vocab.index(UNK_WORD)

        lan0_para_dict_path = os.path.join(params.data_path,
                                          "dict.%s-%s.%s.a%s" % (params.langs[0], params.langs[1], params.langs[0], 100))
        lan1_para_dict_path = os.path.join(params.data_path,
                                          "dict.%s-%s.%s.a%s" % (params.langs[0], params.langs[1], params.langs[1], 100))
        if params.debug_dict:
            lan0_para_dict_path = lan0_para_dict_path.replace('100', '1000')
            lan1_para_dict_path = lan1_para_dict_path.replace('100', '1000')
        assert os.path.isfile(lan0_para_dict_path) and os.path.isfile(lan1_para_dict_path)
        logger.info("Converter: Read parallel dictionary for language %s..." % params.langs[0])
        self.dict_lan0 = load_para_dict(lan0_para_dict_path)
        logger.info("Converter: Read parallel dictionary for language %s..." % params.langs[1])
        self.dict_lan1 = load_para_dict(lan1_para_dict_path)

        logger.info("Converter: Loading bpe...")
        codes_path = os.path.join(params.data_path, "codes")
        self.bpe = fastBPE.fastBPE(codes_path, all_dict_path)

        self.all_word_counter = 0
        self.changed_word_counter = 0

        # logger.info("Process parallel dictionary for language 0...")
        # convert_number_to_prob(self.dict_lan0)
        # logger.info("Process parallel dictionary for language 1...")
        # convert_number_to_prob(self.dict_lan1)

        logger.info("")
Example #22
def init_sentence_encoder():
    global MODEL
    model_dir = Path(__file__).parent / "LASER" / "models"
    encoder_path = model_dir / "bilstm.93langs.2018-12-26.pt"
    bpe_codes = str(model_dir / "93langs.fcodes")
    LOGGER.info(f' - Encoder: loading {encoder_path}')

    encoder = SentenceEncoder(encoder_path,
                              max_sentences=None,
                              max_tokens=12000,
                              sort_kind='mergesort',
                              cpu=True)
    print(encoder)
    bpe = None
    print(bpe_codes.replace('fcodes', 'fvocab'))
    bpe = fastBPE.fastBPE(bpe_codes, bpe_codes.replace('fcodes', 'fvocab'))
    print(bpe)
    MODEL = Model(bpe=bpe, encoder=encoder, tokenizer=tokenize)
Example #23
def load_examples(path_to_train_file, vocab, seq_len):
    train_examples = [json.loads(line) for line in open(path_to_train_file)]
    train_codes, train_corpus = zip(*[(example['control_code'], example['text']) for example in train_examples])

    bpe = fastBPE.fastBPE(CODES_FILE, VOCAB_FILE)
    tokenized_train_corpus = bpe.apply(train_corpus)
    tokenized_train_corpus = [preprocess_text(tokenized_train_text) for tokenized_train_text in tokenized_train_corpus]

    examples = []
    for control_code, text in zip(train_codes, tokenized_train_corpus):
        if control_code not in vocab:
            raise RuntimeError(f'{control_code} not in vocab')

        for i in range(0, len(text), seq_len):
            text_chunk = text[i: i + seq_len]
            if len(text_chunk) != seq_len:
                break
            examples.append((control_code, text_chunk))

    return examples
Example #24
                                                        config=run_config)


# we now create a serving function from this estimator
# this enables us to load the model once and easily query it multiple times
def serving_input_fn():
    inputs = {'input_1': tf.placeholder(tf.int32, [1, seq_length])}
    return tf.estimator.export.ServingInputReceiver(inputs, inputs)


predict_fn = tf.contrib.predictor.from_estimator(estimator_model,
                                                 serving_input_fn)

# almost there, we now take the user prompt and tokenize with BPE
# load BPE codes
bpe = fastBPE.fastBPE('codes', 'vocab')

temperature = args.temperature
nucleusprob = args.nucleus
penalty = args.penalty
topk = args.topk

while True:
    prompt = raw_input('ENTER PROMPT: ') if not use_py3 else input(
        'ENTER PROMPT: ')

    # tokenize provided prompt
    split_prompt = bpe.apply([prompt])[0].split()
    text = [word2idx[i] for i in split_prompt]

    # pad with 0s and create a mini-batch of 2 (arbitrary, for ease of code)
Example #25
    shuffler = SentenceShuffler.chunk_shuffler()
    par_ds = StreamingParallelDataset(cl)
    noise_ds = StreamingCANoiseDataset(cl, shuffler, 0., 0., 0., 1., 0., 0.)
    ds = StreamingChainedDataset(cl, [par_ds, noise_ds])
    b = next(iter(ds))

    import spacy
    import fastBPE

    nlp = spacy.load('en',
                     disable=[
                         'tagger', 'parser', 'ner', 'entity_linker', 'textcat',
                         'entity_ruler', 'sentencizer', 'merge_noun_chunks',
                         'merge_entities', 'merge_subtokens'
                     ])
    bpe = fastBPE.fastBPE('temp/datasets/bcu_enwiki.30000.codes',
                          'temp/datasets/bcu_enwiki_spacy.30000.bpe.vocab')
    # device = 'cpu'

    model = TransformerS2S(len(vocab), config['TransformerS2S']['emb_dim'],
                           config['TransformerS2S']['n_head'],
                           config['TransformerS2S']['ff_dim'],
                           config['TransformerS2S']['num_enc_layers'],
                           config['TransformerS2S']['num_dec_layers'],
                           config['TransformerS2S']['activation'])

    pytorch_total_params = sum(p.numel() for p in model.parameters()
                               if p.requires_grad)

    optimizer = optim.Adam(model.parameters(),
                           lr=config['optimizer']['adam']['lr'],
                           betas=(config['optimizer']['adam']['beta_1'],
                                  config['optimizer']['adam']['beta_2']))
Example #26
 def from_paths(base_puzzle_gen, train_file_path, vocab_file_path, num_tok):
     vocab = BpePuzzleGenerator._read_vocab(vocab_file_path)
     bpe = fastBPE.fastBPE(train_file_path, vocab_file_path)
     return BpePuzzleGenerator(base_puzzle_gen, vocab, bpe, num_tok)
Example #27
def BPEfastLoad(line, bpe_codes):
    bpe_vocab = bpe_codes.replace('fcodes', 'fvocab')
    return fastBPE.fastBPE(bpe_codes, bpe_vocab)
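A hedged usage sketch of BPEfastLoad, assuming LASER-style naming where the codes file ends in ".fcodes" and the matching vocabulary in ".fvocab" (the path is a placeholder):
# The first argument is unused by BPEfastLoad as written.
bpe = BPEfastLoad("", "/path/to/93langs.fcodes")
segmented = bpe.apply(["Hello world !"])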
Example #28
    help=
    'control code to use for this file. must be in the vocabulary, else it will error out.'
)
parser.add_argument(
    '--sequence_len',
    type=int,
    required=True,
    help='sequence length of model being fine-tuned (256 or 512)')

args = parser.parse_args()

path_to_train_file = fname = args.text_file
domain = [args.control_code]

train_text = open(path_to_train_file, 'rb').read().decode(encoding='utf-8')
bpe = fastBPE.fastBPE('../codes', '../vocab')
tokenized_train_text = bpe.apply([
    train_text.encode('ascii', errors='ignore') if not use_py3 else train_text
])[0]  # will NOT work for non-English texts
# if you want to run non-english text, please tokenize separately using ./fast applybpe and then run this script on the .bpe file with utf8 encoding

tokenized_train_text = re.findall(r'\S+|\n', tokenized_train_text)
tokenized_train_text = list(filter(lambda x: x != u'@@', tokenized_train_text))

# load the vocabulary from file
vocab = open('../vocab').read().decode(
    encoding='utf-8').split('\n') if not use_py3 else open(
        '../vocab', encoding='utf-8').read().split('\n')
vocab = list(map(lambda x: x.split(' ')[0], vocab)) + ['<unk>'] + ['\n']
print('{} unique words'.format(len(vocab)))
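The vocabulary list is typically turned into an index map before the BPE output can be encoded (Example #24 looks tokens up in such a word2idx mapping); a minimal sketch, with names chosen to match that example:
# Map each vocabulary entry to its position so BPE tokens can become ids.
word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for word, idx in word2idx.items()}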
Example #29
 def __init__(self, code_file, vocab_file, dictionary: Dictionary):
     self.bpe = fastBPE.fastBPE(str(code_file), str(vocab_file))
     self.dictionary = dictionary
     self.n_w = 0
     self.n_oov = 0
Example #30
import jionlp as jio
import jieba
import fastBPE
from mosestokenizer import *
from pyltp import SentenceSplitter
import nltk.data
from utils import *
from flashtext import KeywordProcessor
keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('Suzhou University', 'Soochow University')

jieba.initialize()
zh_bpe = fastBPE.fastBPE('../data-bin/codes.zh', '../data-bin/dict.zh.txt')
en_bpe = fastBPE.fastBPE('../data-bin/codes.en', '../data-bin/dict.en.txt')
tokenizer = MosesTokenizer()
detokenizer = MosesDetokenizer()
punc_norm = MosesPunctuationNormalizer()
en_spliter = nltk.data.load('tokenizers/punkt/english.pickle')
case_model = '../data-bin/truecase-model.en'
user_dict_path = '../data-bin/user_dict'

case_dict = load_case_model(case_model)
user_dict = load_user_dict(user_dict_path)


def sep_lines(lines):
    sep = []
    for line in lines:
        sep.append(line.count('ENTER'))
    return sep