Example #1

import itertools
import pickle

import numpy as np

from bpe import Encoder
# pad_sequences is the Keras padding helper; the import path may differ with older Keras versions
from tensorflow.keras.preprocessing.sequence import pad_sequences

class BPE(object):
    def __init__(self,
                 vocab_config,
                 file_contents=None,
                 vocab_path=None,
                 out_vocab_path='vocab'):
        # either load a previously saved vocabulary or create a fresh (unfitted) encoder;
        # the remaining constructor arguments are accepted but not used here
        if vocab_path:
            self.encoder = self.load_vocab(vocab_path)
        else:
            self.encoder = Encoder(vocab_size=32000, pct_bpe=1.0, silent=False)

    def load_vocab(self, vocab_path):
        return Encoder.load(vocab_path)

    def save_vocab(self, path):
        self.encoder.save(path)

    def tokenize(self, line):
        return self.encoder.tokenize(line)

    def vocab_key(self, w):
        UNK = self.encoder.word_vocab[self.encoder.UNK]
        return self.encoder.bpe_vocab.get(w, UNK)

    def transform(self, line):
        return list(
            itertools.chain.from_iterable(
                self.encoder.transform(line, reverse=False,
                                       fixed_length=None)))

    @property
    def vocab_dim(self):
        return len(self.encoder.bpe_vocab)
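
# A minimal, hypothetical demo of the wrapper above (the corpus and the "vocab" path are
# placeholders, not part of the original project): fit the underlying encoder on a tiny
# in-memory corpus, persist the vocabulary, tokenize a line, and reload from disk.
def demo_bpe_wrapper():
    corpus = ["this movie was great", "this movie was terrible"]
    bpe = BPE(vocab_config=None)
    bpe.encoder.fit(corpus)                 # learn the byte-pair vocabulary
    bpe.save_vocab("vocab")                 # write it to disk
    print(bpe.tokenize(corpus[0]))          # sub-word tokens for one line
    print(bpe.vocab_dim)                    # size of the learned BPE vocabulary
    reloaded = BPE(vocab_config=None, vocab_path="vocab")
    print(reloaded.tokenize(corpus[0]))     # same tokens from the reloaded vocabulary
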
def prepare_data(data_path,
                 freq_dist_path,
                 embedding_path,
                 vocabulary_size=10000,
                 embedding_size=200,
                 predict=False,
                 max_length=None,
                 use_bpe=False):
    max_length_provided = max_length is not None

    separator = ","
    if data_path.endswith("tsv"):
        separator = "\t"

    # construct vocabulary
    vocabulary = None
    if not use_bpe:
        with open(freq_dist_path, "rb") as freq_dist_file:
            freq_dist = pickle.load(freq_dist_file)
        vocabulary = {"<pad>": 0, "<unk>": 1, "<user>": 2, "<url>": 3}
        most_common = freq_dist.most_common(vocabulary_size - len(vocabulary))
        vocabulary.update({w[0]: i + 2 for i, w in enumerate(most_common)})
        print("Constructed vocabulary of size {}.".format(vocabulary_size))

    # load data and convert it to indices
    data = []
    labels = []
    if not max_length_provided:
        max_length = 0
    with open(data_path, "r") as data_file:
        lines = data_file.readlines()
        for i, line in enumerate(lines):
            if not predict:
                # limit the number of splits so separators inside the tweet text survive
                tweet_id, sentiment, tweet = line.split(separator, 2)
            else:
                tweet_id, tweet = line.split(separator, 1)
            data.append(tweet.strip())

            if not predict:
                labels.append(int(sentiment))
    print("Loaded data ({} tweets).".format(len(data)))

    if not use_bpe:
        new_data = []
        for tweet in data:
            words = tweet.split()
            indices = []
            for w_idx, w in enumerate(words):
                if max_length_provided and w_idx == max_length:
                    break

                index = vocabulary.get(w)
                if index is not None:
                    indices.append(index)
                else:
                    indices.append(vocabulary.get("<unk>"))

            if not max_length_provided and len(indices) > max_length:
                max_length = len(indices)

            new_data.append(indices)
        data = new_data

        pad_value = vocabulary.get("<pad>")
    else:
        print("Training BPE encoder...")
        encoder = Encoder(vocab_size=vocabulary_size,
                          required_tokens=["<user>", "<url>"],
                          UNK="<unk>",
                          PAD="<pad>")
        encoder.fit(data)
        vocabulary = encoder.vocabs_to_dict()
        print("Constructed BPE vocabulary of size {}.".format(vocabulary_size))

        new_data = []
        for tweet in data:
            indices = list(next(encoder.transform([tweet])))
            if not max_length_provided and len(indices) > max_length:
                max_length = len(indices)
            new_data.append(indices)
        data = new_data

        pad_value = encoder.word_vocab[encoder.PAD]

    # load embedding vectors
    embedding_vectors = {}
    if not use_bpe:
        with open(embedding_path, "r") as glove_file:
            for i, line in enumerate(glove_file):
                tokens = line.split()
                word = tokens[0]
                # membership test rather than truthiness: index 0 (<pad>) would be falsy
                if word in vocabulary:
                    vector = [float(e) for e in tokens[1:]]
                    embedding_vectors[word] = np.array(vector)
        print("Found {} GLOVE vectors for vocabulary of size {}.".format(
            len(embedding_vectors), len(vocabulary)))
        print(
            "Loaded embedding vectors ({} dimensions).".format(embedding_size))

    # construct embedding matrix
    embedding_matrix = np.random.randn(vocabulary_size, embedding_size) * 0.01
    if not use_bpe:
        for word, i in list(vocabulary.items()):
            embedding_vector = embedding_vectors.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    print("Constructed embedding matrix.")

    # pad data (might want to change max_length to be CLI argument)
    data = pad_sequences(data,
                         maxlen=max_length,
                         padding="post",
                         value=pad_value)
    if not predict:
        labels = np.array(labels)
    print("Padded sequences to length {}.".format(max_length))

    if not predict:
        return vocabulary, data, labels, embedding_matrix
    return vocabulary, data, embedding_matrix
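
A hedged sketch of how prepare_data might be called; every file name and size below is a
placeholder, not a file from the original project. The frequency distribution is assumed to
be a pickled object with a most_common() method (e.g. a collections.Counter), and the
embedding file a GloVe-style text file with one "word v1 v2 ..." line per word.

vocabulary, train_x, train_y, embedding_matrix = prepare_data(
    data_path="train.csv",            # lines of "tweet_id,sentiment,tweet"
    freq_dist_path="freq_dist.pkl",   # pickled Counter/FreqDist over the training tokens
    embedding_path="glove.200d.txt",  # GloVe vectors; only read when use_bpe is False
    vocabulary_size=10000,
    embedding_size=200,
    use_bpe=False)
print(train_x.shape, train_y.shape, embedding_matrix.shape)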
Example #3

from bpe import Encoder
from tqdm import tqdm

# `sequences` is assumed to be a pre-existing list of strings (one per example).

def parse(x):
    return x.split()

enc = Encoder(4096, ngram_min=1, ngram_max=2, pct_bpe=0.8, silent=True, word_tokenizer=parse)
enc.fit(sequences)

base = enc.vocabs_to_dict()
duplicate_keys = []
for key in base['byte_pairs']:
    if key in base['words']:
        duplicate_keys.append(key)
if len(duplicate_keys) > 0:
    print("got duplicates:")
    print(duplicate_keys)
else:
    print("NO DUPLICATES! :)")

keybase = {**base['words'], **base['byte_pairs']}


inv_map = {v: k for k, v in keybase.items()}

# show the first ten entries of the inverse (id -> token) map
for i in range(10):
    print(i, inv_map[i])

sequences = [f for f in enc.transform(tqdm(sequences))]

lengths = [len(x) for x in sequences]
print(max(lengths))

# print(base)
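
As a quick sanity check on the inverse map built above, the ids of the first transformed
sequence can be decoded back into their tokens (this assumes sequences is non-empty and was
transformed with the same encoder):

decoded = [inv_map[i] for i in sequences[0]]
print(decoded)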
Example #4
from bpe import Encoder

# Generated with http://pythonpsum.com
test_corpus = '''
    Object raspberrypi functools dict kwargs. Gevent raspberrypi functools. Dunder raspberrypi decorator dict didn't lambda zip import pyramid, she lambda iterate?
    Kwargs raspberrypi diversity unit object gevent. Import fall integration decorator unit django yield functools twisted. Dunder integration decorator he she future. Python raspberrypi community pypy. Kwargs integration beautiful test reduce gil python closure. Gevent he integration generator fall test kwargs raise didn't visor he itertools...
    Reduce integration coroutine bdfl he python. Cython didn't integration while beautiful list python didn't nit!
    Object fall diversity 2to3 dunder script. Python fall for: integration exception dict kwargs dunder pycon. Import raspberrypi beautiful test import six web. Future integration mercurial self script web. Return raspberrypi community test she stable.
    Django raspberrypi mercurial unit import yield raspberrypi visual rocksdahouse. Dunder raspberrypi mercurial list reduce class test scipy helmet zip?
'''

encoder = Encoder(200,
                  pct_bpe=0.88)  # params chosen for demonstration purposes
encoder.fit(test_corpus.split('\n'))

example = "Vizzini: He didn't fall? INCONCEIVABLE!"
print(encoder.tokenize(example))
# ['__sow', 'vi', 'z', 'zi', 'ni', '__eow', '__sow', ':', '__eow', 'he', 'didn', "'", 't', 'fall', '__sow', '?', '__eow', '__sow', 'in', 'co', 'n', 'ce', 'iv', 'ab', 'le', '__eow', '__sow', '!', '__eow']
print(next(encoder.transform([example])))
# [26, 108, 79, 104, 72, 24, 26, 117, 24, 9, 11, 8, 12, 10, 26, 90, 24, 26, 154, 56, 37, 149, 80, 169, 84, 24, 26, 156, 24]
print(next(encoder.inverse_transform(encoder.transform([example]))))
# vizzini : he didn ' t fall ? inconceivable !
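
The fitted encoder can also be persisted and restored with the save/load pair that Example #1
relies on; the JSON file name here is just a placeholder.

encoder.save("bpe_vocab.json")             # write the learned vocabularies to disk
restored = Encoder.load("bpe_vocab.json")
print(restored.tokenize(example))          # should match the tokenization above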