Example #1
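# Note: this snippet assumes module-level imports and constants from its
# source file, e.g. `from bpe import Encoder` and `from os.path import
# split, join`; _WORD and _BPE appear to be constants whose second element
# names the encoding ('word' or 'bpe') and supplies the output-file suffix.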
def make_vocab(filename, vocab_size, ngram_max, pct_bpe, sep, ignore_cols, v):
    '''
    Creates word or byte-pair encoding vocabulary and mappings from a sample of
    text. Because this script will load the entire input text into memory, for
    large corpora it is recommended to use a representative sample of text.

    Vocabulary will be saved in a JSON file with the same base name as the input
    file, suffixed with "_word" or "_bpe" depending on the encoding used.
    '''

    kind = _BPE if pct_bpe else _WORD

    with open(filename, 'r') as f:
        sample = f.readlines()

    new_sep = f' {sep} ' if kind == _BPE else ' '
    sample = ['<s> ' + x.replace(' ', '_').replace(sep, new_sep) + '</s>'
              for i, x in enumerate(sample) if i not in ignore_cols]

    enc = Encoder(vocab_size,
                  pct_bpe=pct_bpe,
                  silent=not v,
                  ngram_max=ngram_max,
                  required_tokens={'<s>', '</s>'},
                  PAD='<pad>',
                  UNK='<unk>')
    enc.fit(sample)
    enc.vocab_size = len(enc.word_vocab) + len(enc.bpe_vocab)
    enc.mute()
    dir_, name = split(filename)
    enc.save(join(dir_, name.split('.')[0] + f'_{kind[1]}.json'))
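
A minimal usage sketch for this function; the file name, separator, and
sizes are illustrative:

# Hypothetical call: fits a mostly-BPE vocabulary on corpus.txt and writes
# corpus_bpe.json alongside it.
make_vocab('corpus.txt', vocab_size=10000, ngram_max=2, pct_bpe=0.88,
           sep='|', ignore_cols=set(), v=True)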
Example #2
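# Note: this snippet assumes `import itertools` and `from bpe import Encoder`
# at module level in its source file.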
class BPE(object):
    def __init__(self,
                 vocab_config,
                 file_contents=None,
                 vocab_path=None,
                 out_vocab_path='vocab'):
        if vocab_path:
            self.encoder = self.load_vocab(vocab_path)
        else:
            self.encoder = Encoder(vocab_size=32000, pct_bpe=1.0, silent=False)

    def load_vocab(self, vocab_path):
        return Encoder.load(vocab_path)

    def save_vocab(self, path):
        self.encoder.save(path)

    def tokenize(self, line):
        return self.encoder.tokenize(line)

    def vocab_key(self, w):
        UNK = self.encoder.word_vocab[self.encoder.UNK]
        return self.encoder.bpe_vocab.get(w, UNK)

    def transform(self, line):
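        # Encoder.transform encodes an iterable of sentences, so `line` here
        # is expected to be a batch (e.g. a list of strings); the per-sentence
        # id lists are then flattened into one sequence.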
        return list(
            itertools.chain.from_iterable(
                self.encoder.transform(line, reverse=False,
                                       fixed_length=None)))

    @property
    def vocab_dim(self):
        return len(self.encoder.bpe_vocab)
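
A short usage sketch, assuming a vocabulary JSON saved earlier via
save_vocab (the path and sample line are illustrative):

bpe = BPE(vocab_config=None, vocab_path='vocab.json')
print(bpe.tokenize('def add(a, b): return a + b'))
ids = bpe.transform(['def add(a, b): return a + b'])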
Example #3
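# Note: this snippet assumes `import sys`, `import tqdm`, and
# `from bpe import Encoder` at module level in its source file.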
def run_bpe(params):
    if params.encoder_load_file:
        sys.stdout.write('Using pre-computed BPE encoder\n')
        sys.stdout.flush()
        bpe_encoder = Encoder.load(params.encoder_load_file)
    else:
        sys.stdout.write('Generating new BPE encoder\n')
        sys.stdout.flush()
        bpe_encoder = Encoder(vocab_size=params.vocab_size,
                              pct_bpe=params.pct_bpe,
                              silent=not params.verbose)
        with open(params.source_file) as f:
            text = f.read().split('\n')
        bpe_encoder.fit(text)
        bpe_encoder.save(params.encoder_save_file)

    with open(params.source_file) as f_src, \
         open(params.destination_file, 'w') as f_dst:
        for line in tqdm.tqdm(f_src.readlines()):
            tokens = bpe_encoder.tokenize(line.strip())
            encoded_line = ' '.join(tokens).strip()
            if encoded_line != '':
                f_dst.write(encoded_line + '\n')
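
A minimal sketch of driving run_bpe; params just needs the attributes
referenced above (all values illustrative):

from types import SimpleNamespace

params = SimpleNamespace(vocab_size=8000, pct_bpe=0.9, verbose=True,
                         encoder_load_file=None,
                         encoder_save_file='bpe_encoder.json',
                         source_file='corpus.txt',
                         destination_file='corpus.bpe.txt')
run_bpe(params)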
Example #4
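# Note: Seq2SeqConfig, AT_TOKEN, HASH_TOKEN, and SIGNATURE_TOKEN come from
# this snippet's own project; Encoder is `from bpe import Encoder`.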
def bpe_encoder_for_lines(cfg: Seq2SeqConfig, lines) -> Encoder:
    """ Calculate BPE encoder for provided lines of text """
    encoder = Encoder(vocab_size=cfg.vocab_size,
                      required_tokens=[
                          cfg.start_token, AT_TOKEN, HASH_TOKEN,
                          SIGNATURE_TOKEN
                      ])
    encoder.fit(lines)
    encoder.save('latest_encoder.json')
    return encoder
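
An illustrative call; SimpleNamespace stands in for the project's
Seq2SeqConfig, and the lines are arbitrary:

from types import SimpleNamespace

cfg = SimpleNamespace(vocab_size=8000, start_token='<s>')  # hypothetical config
encoder = bpe_encoder_for_lines(cfg, ['first line of text', 'second line'])
tokens = encoder.tokenize('first line of text')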
Example #5
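# Note: this snippet assumes `import argparse`, `import json`,
# `import itertools`, and `from bpe import Encoder`; get_data is a helper
# from its source file that presumably yields sequences of text extracted
# from the loaded JSON.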
def main():
	ap = argparse.ArgumentParser()
	ap.add_argument("data", help="Path to data file")
	ap.add_argument("-v", "--vocabulary", help="Path to output vocab file")
	args = ap.parse_args()

	encoder = Encoder(vocab_size=32000, pct_bpe=1.0)

	with open(args.data) as f:
		data = json.load(f)

	data = list(get_data(data))
	data = list(itertools.chain.from_iterable(data))
	encoder.fit(data)
	encoder.save(args.vocabulary)
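
The saved vocabulary can be restored later without refitting; the path is
whatever was passed as --vocabulary:

from bpe import Encoder

encoder = Encoder.load('vocab.json')  # illustrative path
print(encoder.tokenize('hello world'))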