Example #1
import itertools

from bpe import Encoder


class BPE(object):
    def __init__(self,
                 vocab_config,
                 file_contents=None,
                 vocab_path=None,
                 out_vocab_path='vocab'):
        if vocab_path:
            self.encoder = self.load_vocab(vocab_path)
        else:
            self.encoder = Encoder(vocab_size=32000, pct_bpe=1.0, silent=False)

    def load_vocab(self, vocab_path):
        return Encoder.load(vocab_path)

    def save_vocab(self, path):
        self.encoder.save(path)

    def tokenize(self, line):
        return self.encoder.tokenize(line)

    def vocab_key(self, w):
        UNK = self.encoder.word_vocab[self.encoder.UNK]
        return self.encoder.bpe_vocab.get(w, UNK)

    def transform(self, line):
        # Encoder.transform expects an iterable of sentences and yields one id
        # list per sentence; the nested output is flattened into a single list.
        return list(
            itertools.chain.from_iterable(
                self.encoder.transform(line, reverse=False,
                                       fixed_length=None)))

    @property
    def vocab_dim(self):
        return len(self.encoder.bpe_vocab)
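
# Hypothetical usage sketch for the BPE wrapper above (not part of the original
# example). vocab_config is unused by __init__, so None is passed, and the
# corpus lines are placeholders.
corpus = ['def add ( a , b ) :', 'return a + b', 'def sub ( a , b ) :']
bpe = BPE(vocab_config=None)
bpe.encoder.fit(corpus)   # the wrapper exposes no fit(), so train the underlying Encoder directly
bpe.save_vocab('vocab')   # reload later with BPE(None, vocab_path='vocab')
print(bpe.tokenize('def mul ( a , b ) :'))
print(bpe.vocab_dim)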
Example #2
class FullTokenizer(object):
    """Runs end-to-end tokenziation."""
    def __init__(self, language_maps_dir, do_lower_case=True):

        self.byte_decoder = load_obj("byte_decoder", language_maps_dir)
        self.id_to_vocab = load_obj("id_to_vocab", language_maps_dir)
        self.vocab_to_id = load_obj("vocab_to_id", language_maps_dir)
        self.bpe_merges = load_obj("bpe_merges", language_maps_dir)

        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.bpe_tokenizer = Encoder(encoder=self.vocab_to_id,
                                     bpe_merges=self.bpe_merges,
                                     byte_decoder=self.byte_decoder,
                                     unk_token='[UNK]',
                                     unk_id=len(self.byte_decoder) - 6,
                                     spc_token=chr(int("E001", 16)),
                                     spc_id=len(self.byte_decoder) - 5)
        #self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

    def tokenize(self, text):
        split_tokens = []
        for token in self.basic_tokenizer.tokenize(text):
            for sub_token in self.bpe_tokenizer.tokenize(token):
                split_tokens.append(sub_token)

        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        return convert_by_vocab(self.vocab_to_id, tokens)

    def convert_ids_to_tokens(self, ids):
        return convert_by_vocab(self.id_to_vocab, ids)
Example #3
import sys

import tqdm
from bpe import Encoder


def run_bpe(params):
    bpe_encoder = Encoder(vocab_size=params.vocab_size,
                          pct_bpe=params.pct_bpe,
                          silent=not params.verbose)
    if params.encoder_load_file:
        sys.stdout.write('Using pre-computed BPE encoder\n')
        sys.stdout.flush()
        bpe_encoder = Encoder.load(params.encoder_load_file)
    else:
        sys.stdout.write('Generating new BPE encoder\n')
        sys.stdout.flush()
        with open(params.source_file) as f:
            text = f.read().split('\n')
        bpe_encoder.fit(text)
        bpe_encoder.save(params.encoder_save_file)
    with open(params.source_file) as f_src, \
            open(params.destination_file, 'w') as f_dst:
        for line in tqdm.tqdm(f_src.readlines()):
            line = line.strip()
            tokens = bpe_encoder.tokenize(line)
            encoded_line = ' '.join(tokens).strip()
            if encoded_line != '':
                f_dst.write(encoded_line + '\n')
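
# Hypothetical invocation of run_bpe above. The attribute names mirror exactly
# what the function reads from `params`; paths and sizes are placeholders.
from types import SimpleNamespace

params = SimpleNamespace(
    vocab_size=8000,
    pct_bpe=1.0,
    verbose=True,
    encoder_load_file=None,            # set to a saved encoder path to reuse it
    encoder_save_file='bpe_encoder.json',
    source_file='train.txt',
    destination_file='train.bpe.txt',
)
run_bpe(params)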
Example #4
from bpe import Encoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score


def train_and_test_lr(xtrain, ytrain, xtest, ytest, xdev, ydev):
    encoder = Encoder(1000, pct_bpe=0.88)
    encoder.fit(xtrain)
    xtrain = [' '.join(encoder.tokenize(name)) for name in xtrain]
    xtest = [' '.join(encoder.tokenize(name)) for name in xtest]
    xdev = [' '.join(encoder.tokenize(name)) for name in xdev]

    vectorizer = CountVectorizer(ngram_range=(1, 2), lowercase=False)
    x_train = vectorizer.fit_transform(xtrain)
    x_test = vectorizer.transform(xtest)
    x_dev = vectorizer.transform(xdev)

    maxC = tuning_lr(x_train, ytrain, x_dev, ydev)  # project helper (not shown) that picks C on the dev set
    clf = LogisticRegression(C=maxC)
    clf.fit(x_train.toarray(), ytrain)
    pred = clf.predict(x_test.toarray())
    f1score = f1_score(ytest, pred)
    return f1score
Example #5
from bpe import Encoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB


def train_and_test_nb(xtrain, ytrain, xtest, ytest, xdev, ydev):
    encoder = Encoder(5000, pct_bpe=0.88)
    encoder.fit(xtrain)
    xtrain = [' '.join(encoder.tokenize(name)) for name in xtrain]
    xtest = [' '.join(encoder.tokenize(name)) for name in xtest]
    xdev = [' '.join(encoder.tokenize(name)) for name in xdev]

    vectorizer = CountVectorizer(ngram_range=(1, 2), lowercase=False)
    x_train = vectorizer.fit_transform(xtrain)
    x_test = vectorizer.transform(xtest)
    x_dev = vectorizer.transform(xdev)

    maxAlpha = tuning_nb(x_train, ytrain, x_dev, ydev)  # project helper (not shown) that picks alpha on the dev set
    clf = MultinomialNB(alpha=maxAlpha)
    clf.fit(x_train.toarray(), ytrain)
    pred = clf.predict(x_test.toarray())
    f1score = f1_score(ytest, pred)
    return f1score
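
# train_and_test_lr / train_and_test_nb above depend on tuning_lr / tuning_nb,
# which are not shown. A minimal sketch of what such a dev-set grid search
# could look like (the candidate grid and the use of F1 are assumptions):
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score


def tuning_lr(x_train, ytrain, x_dev, ydev, grid=(0.01, 0.1, 1.0, 10.0)):
    best_c, best_f1 = grid[0], -1.0
    for c in grid:
        clf = LogisticRegression(C=c)
        clf.fit(x_train, ytrain)
        f1 = f1_score(ydev, clf.predict(x_dev))
        if f1 > best_f1:
            best_c, best_f1 = c, f1
    return best_c                      # the C value that maximizes dev-set F1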
Example #6
import logging

from bpe import Encoder

logger = logging.getLogger(__name__)


class BPETokenizer():
    """ Wrapper class for calling the BPE tokenizer in the bpe module
    """
    def set_train(self, data_file):
        """ Set training data for the BPE tokenizer.

        :param data_file: The file with the data.
        """
        with open(data_file) as f:
            self.data = f.readlines()

    def set_data(self, data):
        """ Set the training data via a list of strings.

        :param data: The data matrix.
        """
        self.data = data

    def train_model(self, iterations=1000, pct_bpe=0.9):
        """ Train the BPE model.

        :param iterations: The target vocabulary size passed to the Encoder.
        :param pct_bpe: The fraction of the vocabulary allocated to BPE (sub-word) tokens rather than whole words.
        """
        self.encoder = Encoder(iterations, pct_bpe=pct_bpe)
        self.encoder.fit([x.lower() for x in self.data])

    def tokenize(self, data):
        """ Tokenize new data with a trained model.

        :param data: The string to tokenize.
        """
        return self.encoder.tokenize(data)

    def save_model(self, model_file):
        """ Save the BPE model to a file.

        :param model_file: The file to save the model to.
        """
        logger.info("Saving BPE model to {}".format(model_file))
        import pickle
        with open(model_file, 'wb') as f:
            pickle.dump(self.encoder, f)

    def load_model(self, model_file):
        """ Load the BPE model from a file.

        :param model_file: The file to load the model from.
        """
        logger.info("Loading BPE model from {}".format(model_file))
        import pickle
        with open(model_file, 'rb') as f:
            self.encoder = pickle.load(f)
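
# Hypothetical round trip with the BPETokenizer wrapper above; the training
# lines and the model file name are placeholders.
tokenizer = BPETokenizer()
tokenizer.set_data(['Object raspberrypi functools dict kwargs.',
                    'Gevent raspberrypi functools.'])
tokenizer.train_model(iterations=100, pct_bpe=0.88)
print(tokenizer.tokenize('functools kwargs'))

tokenizer.save_model('bpe.pkl')        # pickles the fitted Encoder
restored = BPETokenizer()
restored.load_model('bpe.pkl')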
Example #7
from bpe import Encoder

# Generated with http://pythonpsum.com
test_corpus = '''
    Object raspberrypi functools dict kwargs. Gevent raspberrypi functools. Dunder raspberrypi decorator dict didn't lambda zip import pyramid, she lambda iterate?
    Kwargs raspberrypi diversity unit object gevent. Import fall integration decorator unit django yield functools twisted. Dunder integration decorator he she future. Python raspberrypi community pypy. Kwargs integration beautiful test reduce gil python closure. Gevent he integration generator fall test kwargs raise didn't visor he itertools...
    Reduce integration coroutine bdfl he python. Cython didn't integration while beautiful list python didn't nit!
    Object fall diversity 2to3 dunder script. Python fall for: integration exception dict kwargs dunder pycon. Import raspberrypi beautiful test import six web. Future integration mercurial self script web. Return raspberrypi community test she stable.
    Django raspberrypi mercurial unit import yield raspberrypi visual rocksdahouse. Dunder raspberrypi mercurial list reduce class test scipy helmet zip?
'''

encoder = Encoder(200,
                  pct_bpe=0.88)  # params chosen for demonstration purposes
encoder.fit(test_corpus.split('\n'))

example = "Vizzini: He didn't fall? INCONCEIVABLE!"
print(encoder.tokenize(example))
# ['__sow', 'vi', 'z', 'zi', 'ni', '__eow', '__sow', ':', '__eow', 'he', 'didn', "'", 't', 'fall', '__sow', '?', '__eow', '__sow', 'in', 'co', 'n', 'ce', 'iv', 'ab', 'le', '__eow', '__sow', '!', '__eow']
print(next(encoder.transform([example])))
# [26, 108, 79, 104, 72, 24, 26, 117, 24, 9, 11, 8, 12, 10, 26, 90, 24, 26, 154, 56, 37, 149, 80, 169, 84, 24, 26, 156, 24]
print(next(encoder.inverse_transform(encoder.transform([example]))))
# vizzini : he didn ' t fall ? inconceivable !