Example #1
def train(data_file, config, blocks, attn_dim, num_heads, nn_dim, dropout,
          tied_weights, optimizer, lr, mb, scale_residuals, block_norm, cpu):
    config = load_config(config)

    context_size = config['dataset']['maxlen']

    if 'vocab' in config['dataset']:
        vocab = Encoder.load(config['dataset']['vocab'])
        config['dataset']['vocab'] = vocab
        vocab_size = config['dataset']['vocab'].vocab_size
        pad_idx = vocab.word_vocab[vocab.PAD]
    else:
        vocab_size = 255
        pad_idx = 0

    window_batches = TimeBufferedCSVReader(data_file, **config['reader'])

    device = torch.device(
        'cuda' if torch.cuda.is_available() and not cpu else 'cpu')

    model = GPTModel(attn_dim,
                     num_heads,
                     nn_dim,
                     blocks,
                     vocab_size,
                     context_size,
                     dropout=dropout,
                     scale_res=scale_residuals,
                     block_norm=block_norm,
                     tied_weights=tied_weights,
                     device=device).to(device)

    opt = opt_map[optimizer](model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss(reduction='none', ignore_index=pad_idx)

    scores = gzip.open(basename(data_file) + '.scores.gz', 'wt')

    train_window = CSVDataset(next(window_batches), **config['dataset'])
    prev_window = window_batches.cur_window
    for eval_window in window_batches:
        eval_window = CSVDataset(eval_window, **config['dataset'])
        train_data = DataLoader(train_window,
                                shuffle=False,
                                batch_size=mb,
                                num_workers=8)
        eval_data = DataLoader(eval_window,
                               shuffle=False,
                               batch_size=mb,
                               num_workers=8)

        cur_window = window_batches.cur_window

        # train on window
        model.train()
        avg_train_loss = 0.
        batches = 0
        train_iter = tqdm(train_data)
        for b in train_iter:
            opt.zero_grad()
            _, _, seqs, _ = b
            x = seqs[:, :-1].to(device)
            y = seqs[:, 1:].to(device)
            y_mask = (y != pad_idx).float().unsqueeze(2).to(device)

            preds = model(x, mask=True, pad_key=pad_idx)

            loss = criterion(preds.transpose(1, 2), y)
            loss = loss.sum(dim=1) / y_mask.sum(dim=1).squeeze()

            loss = loss.mean()
            loss.backward()

            opt.step()

            avg_train_loss += loss.cpu().item()
            batches += 1
            train_iter.set_description(
                f'[TRAIN] window={prev_window} loss={avg_train_loss / batches:.8f}'
            )

        # evaluate on next window
        model.eval()
        avg_eval_loss = 0.
        batches = 0
        eval_iter = tqdm(eval_data)
        for b in eval_iter:
            line_nums, meta, seqs, _ = b
            x = seqs[:, :-1].to(device)
            y = seqs[:, 1:].to(device)
            y_mask = (y != pad_idx).float().unsqueeze(2).to(device)

            preds = model(x, mask=True, pad_key=pad_idx)

            loss = criterion(preds.transpose(1, 2), y)
            loss = loss.sum(dim=1) / y_mask.sum(dim=1).squeeze()

            for line_no, line_meta, line_score in zip(line_nums, meta, loss):
                scores.write(f'{line_no},{line_meta},{line_score}\n')

            loss = loss.mean()

            avg_eval_loss += loss.cpu().item()
            batches += 1
            eval_iter.set_description(
                f'[EVAL]  window={cur_window} loss={avg_eval_loss / batches:.8f}'
            )

        train_window = eval_window
        prev_window = cur_window

    scores.close()
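
The loss above pairs CrossEntropyLoss(reduction='none', ignore_index=pad_idx) with an explicit padding mask so each sequence is averaged only over its real tokens. A minimal self-contained sketch of that pattern (batch size, sequence length, vocabulary size and the pad index are illustrative, not values from the example's config):

import torch
import torch.nn as nn

pad_idx = 0
criterion = nn.CrossEntropyLoss(reduction='none', ignore_index=pad_idx)

preds = torch.randn(4, 10, 32)               # (batch, seq_len, vocab) logits
y = torch.randint(1, 32, (4, 10))            # target token ids
y[:, 7:] = pad_idx                           # pretend the tail is padding

token_loss = criterion(preds.transpose(1, 2), y)           # (batch, seq_len), zero at pad positions
y_mask = (y != pad_idx).float()                            # (batch, seq_len)
per_seq_loss = token_loss.sum(dim=1) / y_mask.sum(dim=1)   # mean over non-pad tokens per sequence
print(per_seq_loss.mean().item())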
Example #2
def run_tripod(params):
    from tripod.io_utils.io import Dataset
    from tripod.io_utils.io import Encodings
    dataset = Dataset(params.input_file)
    encodings = Encodings()
    encodings.load(params.output + '.encodings')
    model = TripodModel2(encodings)
    model.load(params.output + '.bestGST')
    model.to(params.device)
    model.eval()
    bpe_encoder = None
    if params.bpe_encoder is not None:
        dataset.sequences = []
        dataset.tokens = None
        from bpe import Encoder as BPEEncoder
        bpe_encoder = BPEEncoder.load(params.bpe_encoder)
        for line in open(params.input_file).readlines():
            dataset.sequences.append(bpe_encoder.tokenize(line))

    batches = _get_batches(dataset, params)
    token_list = ''
    with torch.no_grad():
        for batch in batches:
            for seq in batch:

                batch_x = []
                for x in seq[0]:
                    batch_x.append(x)
                tmp = batch_x[1:]

                for ii in range(len(tmp)):
                    if tmp[ii] == '<PAD>':
                        tmp = tmp[:ii]
                        break
                if bpe_encoder is not None:
                    orig = _bpe_decode(tmp, bpe_encoder)
                else:
                    orig = tmp
                batch_x = _to_tensor([batch_x], encodings, params.device)

                pred_sum = model.generate(batch_x)

                val_sum = pred_sum.cpu().numpy()

                for seq_id in range(pred_sum.shape[0]):
                    if bpe_encoder is not None:
                        token_list_sum = [
                            encodings.token_list[zz] for zz in val_sum[seq_id]
                            if zz != encodings.token2int['<UNK>']
                        ]
                        sys.stdout.write('ORIG: ' + orig + '\n\n')
                        sys.stdout.write(
                            'SUM: ' +
                            _bpe_decode(token_list_sum, bpe_encoder) + '\n\n')
                        token_list = token_list_sum
                        sys.stdout.write('=' * 20)
                        sys.stdout.write('\n\n\n')
                    else:
                        for t_id in range(pred_sum.shape[1]):
                            token_list += encodings.token_list[val_sum[seq_id]
                                                               [t_id]]
                            sys.stdout.write(
                                encodings.token_list[val_sum[seq_id][t_id]])
                            sys.stdout.flush()

                        sys.stdout.write('\n')

    with open(params.output_file, 'w') as f:
        f.write(_bpe_decode(token_list, bpe_encoder) + '\n')
Example #3
def prepare_data(data_path,
                 freq_dist_path,
                 embedding_path,
                 vocabulary_size=10000,
                 embedding_size=200,
                 predict=False,
                 max_length=None,
                 use_bpe=False):
    max_length_provided = max_length is not None

    separator = ","
    if data_path.endswith("tsv"):
        separator = "\t"

    # construct vocabulary
    vocabulary = None
    if not use_bpe:
        with open(freq_dist_path, "rb") as freq_dist_file:
            freq_dist = pickle.load(freq_dist_file)
        vocabulary = {"<pad>": 0, "<unk>": 1, "<user>": 2, "<url>": 3}
        most_common = freq_dist.most_common(vocabulary_size - len(vocabulary))
        vocabulary.update({w[0]: i + len(vocabulary) for i, w in enumerate(most_common)})  # offset past the special tokens so <user>/<url> keep their ids
        print("Constructed vocabulary of size {}.".format(vocabulary_size))

    # load data and convert it to indices
    data = []
    labels = []
    if not max_length_provided:
        max_length = 0
    with open(data_path, "r") as data_file:
        lines = data_file.readlines()
        for i, line in enumerate(lines):
            if not predict:
                tweet_id, sentiment, tweet = line.split(separator)
            else:
                tweet_id, tweet = line.split(separator)
            data.append(tweet.strip())

            if not predict:
                labels.append(int(sentiment))
    print("Loaded data ({} tweets).".format(len(data)))

    if not use_bpe:
        new_data = []
        for tweet in data:
            words = tweet.split()
            indices = []
            for w_idx, w in enumerate(words):
                if max_length_provided and w_idx == max_length:
                    break

                index = vocabulary.get(w)
                if index is not None:
                    indices.append(index)
                else:
                    indices.append(vocabulary.get("<unk>"))

            if not max_length_provided and len(indices) > max_length:
                max_length = len(indices)

            new_data.append(indices)
        data = new_data

        pad_value = vocabulary.get("<pad>")
    else:
        print("Training BPE encoder...")
        encoder = Encoder(vocab_size=vocabulary_size,
                          required_tokens=["<user>", "<url>"],
                          UNK="<unk>",
                          PAD="<pad>")
        encoder.fit(data)
        vocabulary = encoder.vocabs_to_dict()
        print("Constructed BPE vocabulary of size {}.".format(vocabulary_size))

        new_data = []
        for tweet in data:
            indices = list(next(encoder.transform([tweet])))
            if not max_length_provided and len(indices) > max_length:
                max_length = len(indices)
            new_data.append(indices)
        data = new_data

        pad_value = encoder.word_vocab[encoder.PAD]

    # load embedding vectors
    embedding_vectors = {}
    if not use_bpe:
        with open(embedding_path, "r") as glove_file:
            for i, line in enumerate(glove_file):
                tokens = line.split()
                word = tokens[0]
                if word in vocabulary:  # .get() would wrongly skip the token mapped to index 0
                    vector = [float(e) for e in tokens[1:]]
                    embedding_vectors[word] = np.array(vector)
        print("Found {} GLOVE vectors for vocabulary of size {}.".format(
            len(embedding_vectors), len(vocabulary)))
        print(
            "Loaded embedding vectors ({} dimensions).".format(embedding_size))

    # construct embedding matrix
    embedding_matrix = np.random.randn(vocabulary_size, embedding_size) * 0.01
    if not use_bpe:
        for word, i in list(vocabulary.items()):
            embedding_vector = embedding_vectors.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    print("Constructed embedding matrix.")

    # pad data (might want to change max_length to be CLI argument)
    data = pad_sequences(data,
                         maxlen=max_length,
                         padding="post",
                         value=pad_value)
    if not predict:
        labels = np.array(labels)
    print("Padded sequences to length {}.".format(max_length))

    if not predict:
        return vocabulary, data, labels, embedding_matrix
    return vocabulary, data, embedding_matrix
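
A hypothetical call to prepare_data with the BPE branch enabled; the file paths below are placeholders (the frequency-distribution and GloVe files are only read when use_bpe is False):

vocabulary, data, labels, embedding_matrix = prepare_data(
    data_path="train.tsv",            # placeholder: tab-separated id, sentiment, tweet
    freq_dist_path="freq_dist.pkl",   # not read when use_bpe=True
    embedding_path="glove.txt",       # not read when use_bpe=True
    vocabulary_size=10000,
    embedding_size=200,
    use_bpe=True)
print(data.shape, labels.shape, embedding_matrix.shape)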
Example #4
def load_vocab(self, vocab_path):
    return Encoder.load(vocab_path)
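
Encoder.load expects a vocabulary file written by the same library. A minimal sketch of the dict-based round trip that other examples on this page use (vocabs_to_dict in Example #6, Encoder.from_dict plus a JSON file in Example #8), assuming the default word tokenizer so the dictionary stays JSON-serializable; the file name is a placeholder:

import json
from bpe import Encoder

encoder = Encoder(vocab_size=1000, pct_bpe=0.8)
encoder.fit(["a tiny corpus", "just for illustration"])

with open("vocab.json", "w") as f:       # placeholder path
    json.dump(encoder.vocabs_to_dict(), f)

with open("vocab.json") as f:
    restored = Encoder.from_dict(json.load(f))

print(next(restored.transform(["a tiny corpus"])))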
Example #5
def construct_vocabulary(data: Union[str, List[Union[List[str], str]]],
                         vocabulary_size: int = 10000,
                         use_bpe: bool = False,
                         bpe_percentage: float = 0.2,
                         vocabulary_save_file: str = None) -> dict:
    counts = None
    if type(data) == str and ".pkl" in data:
        with open(data, "rb") as f:
            counts = pickle.load(f)
        if type(counts) != nltk.FreqDist:
            logger.info("Loaded vocabulary from file.")
            return counts
        elif use_bpe:
            logger.error("Cannot construct BPE vocabulary from frequency distribution file.")
            raise ValueError("Cannot construct BPE vocabulary from frequency distribution file.")
        else:
            logger.info("Constructing vocabulary from frequency distribution file.")
    elif not use_bpe:
        logger.info("Constructing vocabulary from data.")

        if type(data) == str:
            separator = ","
            if data.endswith("tsv"):
                separator = "\t"

            # load data from file
            new_data = []
            with open(data, "r") as data_file:
                lines = data_file.readlines()
                for i, line in enumerate(lines):
                    _, _, tweet = line.split(separator)
                    new_data.append(TOKENIZER.tokenize(tweet))
            data = new_data
        elif type(data[0]) != list:
            data = [TOKENIZER.tokenize(t) for t in data]

        all_words = []
        for tweet in data:
            all_words.extend(tweet)

        counts = nltk.FreqDist(all_words)

    if use_bpe:
        logger.info("Training BPE encoder...")
        encoder = Encoder(vocab_size=vocabulary_size,
                          pct_bpe=bpe_percentage,
                          word_tokenizer=lambda x: TOKENIZER.tokenize(x),
                          required_tokens=["<start>", "<extract>", "<user>", "<url>"],
                          UNK="<unk>", PAD="<pad>")
        encoder.fit(data)
        vocabulary = encoder.vocabs_to_dict()
        logger.info("Constructed BPE vocabulary of size {}.".format(vocabulary_size))
    else:
        vocabulary = {"<pad>": 0, "<unk>": 1, "<start>": 2, "<extract>": 3}
        initial_vocab_length = len(vocabulary)
        most_common = counts.most_common(vocabulary_size - initial_vocab_length)
        vocabulary.update({w[0]: i + initial_vocab_length for i, w in enumerate(most_common)})
        logger.info("Constructed embedding vocabulary of size {}.".format(len(vocabulary)))

    if vocabulary_save_file:
        if not vocabulary_save_file.endswith(".pkl"):
            vocabulary_save_file += ".pkl"
        with open(vocabulary_save_file, "wb") as f:
            pickle.dump(vocabulary, f)
        logger.info("Saved vocabulary to \"{}\".".format(vocabulary_save_file))

    return vocabulary
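
A hypothetical call to construct_vocabulary on an in-memory list of tweets; the strings and the save path are placeholders, and TOKENIZER and logger are assumed to come from the surrounding module:

vocab = construct_vocabulary(["just an example tweet <user>", "another one <url>"],
                             vocabulary_size=5000,
                             use_bpe=False,
                             vocabulary_save_file="vocab.pkl")   # placeholder path
print(len(vocab))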
Example #6
from bpe import Encoder
from tqdm import tqdm

sequences = []
with open("../../datasets/multi30k/train.en") as f:
    for line in f:
        sequences.append(line.strip())

ref = [x.split() for x in sequences]
ref_len = [len(x) for x in ref]
print("REF:", max(ref_len))

def parse(x):
    return x.split()

enc = Encoder(4096, ngram_min=1, ngram_max=2, pct_bpe=0.8, silent=True, word_tokenizer=parse)
enc.fit(sequences)

base = enc.vocabs_to_dict()
duplicate_keys = []
for key in base['byte_pairs']:
    if key in base['words']:
        duplicate_keys.append(key)
if len(duplicate_keys) > 0:
    print("got duplicates:")
    print(duplicate_keys)
else:
    print("NO DUPLICATES! :)")

keybase = {**base['words'], **base['byte_pairs']}
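
A short follow-up to the snippet above: inverting the merged keybase gives an id-to-token map, assuming word and byte-pair ids do not overlap (the check above only looks for duplicate token keys):

id2token = {idx: token for token, idx in keybase.items()}
ids = next(enc.transform([sequences[0]]))
print([id2token.get(i, '<unk?>') for i in ids])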
Example #7
from bpe import Encoder

# Generated with http://pythonpsum.com
test_corpus = '''
    Object raspberrypi functools dict kwargs. Gevent raspberrypi functools. Dunder raspberrypi decorator dict didn't lambda zip import pyramid, she lambda iterate?
    Kwargs raspberrypi diversity unit object gevent. Import fall integration decorator unit django yield functools twisted. Dunder integration decorator he she future. Python raspberrypi community pypy. Kwargs integration beautiful test reduce gil python closure. Gevent he integration generator fall test kwargs raise didn't visor he itertools...
    Reduce integration coroutine bdfl he python. Cython didn't integration while beautiful list python didn't nit!
    Object fall diversity 2to3 dunder script. Python fall for: integration exception dict kwargs dunder pycon. Import raspberrypi beautiful test import six web. Future integration mercurial self script web. Return raspberrypi community test she stable.
    Django raspberrypi mercurial unit import yield raspberrypi visual rocksdahouse. Dunder raspberrypi mercurial list reduce class test scipy helmet zip?
'''

encoder = Encoder(200,
                  pct_bpe=0.88)  # params chosen for demonstration purposes
encoder.fit(test_corpus.split('\n'))

example = "Vizzini: He didn't fall? INCONCEIVABLE!"
print(encoder.tokenize(example))
# ['__sow', 'vi', 'z', 'zi', 'ni', '__eow', '__sow', ':', '__eow', 'he', 'didn', "'", 't', 'fall', '__sow', '?', '__eow', '__sow', 'in', 'co', 'n', 'ce', 'iv', 'ab', 'le', '__eow', '__sow', '!', '__eow']
print(next(encoder.transform([example])))
# [26, 108, 79, 104, 72, 24, 26, 117, 24, 9, 11, 8, 12, 10, 26, 90, 24, 26, 154, 56, 37, 149, 80, 169, 84, 24, 26, 156, 24]
print(next(encoder.inverse_transform(encoder.transform([example]))))
# vizzini : he didn ' t fall ? inconceivable !
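
Special tokens can be pinned with required_tokens and looked up afterwards, which is how Example #1 obtains its pad index via vocab.word_vocab[vocab.PAD]. A small sketch against the same demo corpus; the parameters are illustrative:

encoder = Encoder(200, pct_bpe=0.88,
                  required_tokens=['<user>', '<url>'],
                  UNK='<unk>', PAD='<pad>')
encoder.fit(test_corpus.split('\n'))
pad_idx = encoder.word_vocab[encoder.PAD]    # same lookup as Example #1
print(pad_idx)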
Example #8
def read_text_data_bpe(num_samples=3000,
                       data_path="data/mt_corpus_ts.txt",
                       vocab_path='data/vocab_bpe.txt',
                       word_dropout_ratio=0.75,
                       simple_subword=False):
    """

    :param num_samples:
    :param data_path:
    :return: timesteps_max, char2id, id2char, x, x_decoder #enc_tokens, characters,
    """
    rng = random.Random(88)
    # vectorize the data
    timesteps_max = 100
    input_texts = []
    char2idx, idx2char = load_vocab_bpe(vocab_path)
    with open(vocab_path + '.dict', encoding='utf-8') as fin:
        bpe_dict = json.loads(fin.read())
    encoder = Encoder.from_dict(bpe_dict)
    encoder.word_tokenizer = WhitespaceTokenizer().tokenize
    lines = []
    print('read data from ', data_path)
    line_num = 0
    with open(data_path, encoding='utf-8') as fin:
        for line in fin:
            line_num += 1
            if line_num == 1:
                continue
            if line_num % 100000 == 0:
                print(line_num)
            if line_num > num_samples + 200:
                break
            #tuples = line.strip().split('\t')
            #zh = tuples[1]
            zh = line
            terms = zh.split()
            if len(terms) <= timesteps_max - 2:
                terms = encoder.tokenize(zh)
                # terms = [term for term in terms if term != encoder.EOW and term != encoder.SOW]
                terms = remove_seow(terms, encoder, simple_subword)
                if len(terms) <= timesteps_max - 2:
                    lines.append(terms)

    for line in lines[:min(num_samples, len(lines) - 1)]:
        input_text = line
        input_text.append("<eos>")
        input_texts.append(input_text)
    #     for char in input_text:
    #         if char not in input_characters:
    #             input_characters.add(char)
    #
    # input_characters = sorted(list(input_characters))
    num_encoder_tokens = max(idx2char.keys()) + 1
    max_encoder_seq_length = timesteps_max  #max([len(txt) for txt in input_texts]) + 1

    print("Number of samples:", len(input_texts))
    print("Number of unique input tokens:", num_encoder_tokens)
    print("Max sequence length for inputs:", max_encoder_seq_length)

    # input_token_index = dict([(char, i) for i, char in enumerate(input_characters)])
    # reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())

    encoder_input_data = np.zeros((len(input_texts), max_encoder_seq_length),
                                  dtype="int32")
    decoder_input_data = np.zeros((len(input_texts), max_encoder_seq_length),
                                  dtype="int32")
    decoder_output_data = np.zeros((len(input_texts), max_encoder_seq_length),
                                   dtype="int32")
    # encoder_input_data = np.zeros((len(input_texts), max_encoder_seq_length, num_encoder_tokens), dtype="float32")
    # decoder_input_data = np.zeros((len(input_texts), max_encoder_seq_length, num_encoder_tokens), dtype="float32")

    for i, input_text in enumerate(input_texts):
        decoder_input_data[i, 0] = char2idx["<sos>"]
        # decoder_input_data[i, 0, char2idx["<sos>"]] = 1.0
        for t, char in enumerate(input_text):
            idx = char2idx[char] if char in char2idx else char2idx["<unk>"]
            idx_mask = idx
            if rng.random() < word_dropout_ratio:
                # TODO: add a dedicated <mask> token instead of reusing <unk>
                if rng.random() < 0.9:
                    idx_mask = char2idx["<unk>"]
                else:
                    # 10% of the time, replace with random word
                    idx_mask = rng.randint(0, num_encoder_tokens - 1)
            encoder_input_data[i, t] = idx_mask
            decoder_output_data[i, t] = idx
            decoder_input_data[i, t + 1] = idx_mask
            # encoder_input_data[i, t, idx] = 1.0
            # decoder_input_data[i, t + 1, idx ] = 1.0

    return max_encoder_seq_length, char2idx, idx2char, encoder_input_data, decoder_input_data, decoder_output_data
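
remove_seow is not defined in this excerpt. Judging by the commented-out filter in the reading loop, a minimal stand-in might simply drop the encoder's SOW/EOW markers (how simple_subword changes the behaviour is not shown, so it is ignored here):

def remove_seow(terms, encoder, simple_subword=False):
    # stand-in guessed from the commented-out filter above; not the author's implementation
    return [term for term in terms if term not in (encoder.SOW, encoder.EOW)]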