def test_unseen_word_ending():
    """ The last character should come with a </w> even if it wasn't seen as the last
        letter of a word in the training set.
    """
    encoder = Encoder(pct_bpe=1, ngram_max=4)
    encoder.fit(test_corpus)
    assert encoder.tokenize('import toolz') == [SOW, 'impo', 'rt', EOW, SOW, 'tool', 'z', EOW]

def test_mixed_encoder():
    encoder = Encoder(vocab_size=1000, pct_bpe=0.98, ngram_max=4)
    encoder.fit(test_corpus)
    assert encoder.tokenize('import this yield toolz') == [
        'import',
        SOW, 'th', 'is', EOW,
        SOW, 'yiel', 'd', EOW,
        SOW, 'tool', 'z', EOW,
    ]

def test_bpe_encoder_fit():
    """ Encoder should be able to fit to provided text data. """
    encoder = Encoder(pct_bpe=1, ngram_max=4)
    encoder.fit(test_corpus)
    assert encoder.tokenize('from toolz import reduce') == [
        SOW, 'f', 'ro', 'm', EOW,
        SOW, 'tool', 'z', EOW,
        SOW, 'impo', 'rt', EOW,
        SOW, 'redu', 'ce', EOW,
    ]

def main(corpus_path):
    # type: (str) -> None
    """ Loads corpus, learns word and BPE vocab, and writes to stdout.
        Assumes corpus is line-separated text.
    """
    with open(corpus_path) as infile:
        lines = list(map(str.strip, infile))

    encoder = Encoder(silent=True)
    encoder.fit(lines)

    print(json.dumps(encoder.vocabs_to_dict()))

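# A minimal command-line entry point, sketched as one way to drive main() above.
# The argument handling here is illustrative and not part of the original module;
# it only assumes the corpus path is passed as the first positional argument.
if __name__ == '__main__':
    import sys

    if len(sys.argv) != 2:
        sys.exit('usage: {} CORPUS_PATH'.format(sys.argv[0]))
    main(sys.argv[1])
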
def test_encoder_creation_graceful_failure(vocab_size):
    """ Minimum vocab size is 1; anything lower should raise a ValueError. """
    died = False
    try:
        Encoder(vocab_size=vocab_size)
    except ValueError:
        died = True
    assert died, "Encoder should have raised a ValueError for < 1 vocab size"

def test_strict_mode():
    strict_encoder = Encoder(pct_bpe=1, strict=True)
    strict_encoder.fit(test_corpus)

    failed = False
    idxs = [[9]]
    try:
        list(strict_encoder.inverse_transform(idxs))
    except ValueError:
        failed = True
    assert failed, 'Should have failed to inverse transform word due to strict mode'

    non_strict_encoder = Encoder(pct_bpe=1, strict=False)
    non_strict_encoder.fit(test_corpus)

    failed = False
    idxs = [[9]]
    try:
        list(non_strict_encoder.inverse_transform(idxs))
    except ValueError:
        failed = True
    assert not failed, 'Should not have failed to inverse transform word due to non-strict mode'

def test_unknown_char_handling():
    encoder = Encoder(pct_bpe=1)
    encoder.fit(test_corpus)
    result = list(encoder.inverse_transform(encoder.transform([';'])))[0]
    assert encoder.UNK in result
    assert ';' not in result

def load(self, model_name, force_update=False):
    """
    Use this function to automatically download and load a new model. It will
    automatically check online if a newer version is available.

    :param model_name: identifier for the model to be loaded
    :param force_update: use this flag to trigger a forceful model update - useful
        if you have an unhealthy model store
    :return: True if the process was successful and False if something failed
    """
    try:
        URL_PREFIX = "https://github.com/adobe/tripod/raw/master/data/trained/"
        model_prefix = os.path.join(self._model_store, model_name)
        must_download = force_update or not os.path.exists(model_prefix + '.best') \
            or not os.path.exists(model_prefix + '.encodings')
        model_name_suffixes = ['-aa', '-ab', '-ac', '-ad']

        if must_download:
            # download file parts
            for model_name_suffix in model_name_suffixes:
                url = "{0}{1}.zip{2}".format(URL_PREFIX, model_name, model_name_suffix)
                print(url)
                download_target = model_prefix + '.zip' + model_name_suffix
                self._download_with_progress_bar(url, download_target)
                sys.stdout.write('\n')

            # concatenate the downloaded parts into a single zip archive
            download_target = model_prefix + '.zip'
            with open(download_target, 'wb') as f_out:
                for model_name_suffix in model_name_suffixes:
                    download_part = model_prefix + '.zip' + model_name_suffix
                    with open(download_part, 'rb') as f_in:
                        f_out.write(f_in.read())

            # extract the model files into the model store
            with ZipFile(download_target, "r") as zip_archive:
                zip_archive.extractall(self._model_store)
            sys.stdout.write("\nModel extracted successfully.")
            sys.stdout.flush()

        # load the BPE encoder (if present), encodings, and model weights
        if os.path.exists(model_prefix + '.bpe'):
            self._bpe = BPEEncoder.load(model_prefix + '.bpe')
        self._encodings = Encodings()
        self._encodings.load(model_prefix + '.encodings')
        self._model = TripodModel2(self._encodings)
        self._model.load(model_prefix + '.best')
        self._model.to(self._device)
        self._model.eval()
        self._loaded = True
        return True
    except Exception:
        return False

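# Hedged usage sketch: the enclosing class is not shown in this excerpt, so the
# instance name and the model identifier below are purely illustrative - substitute
# whatever your model store actually contains.
#
#     api = ...  # an instance of the class that defines load()
#     if not api.load('some-model-name', force_update=False):
#         raise RuntimeError('Model download or load failed')
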
def test_mixed_encoder_word_in_other_word():
    """ Ensure that a word is correctly decoded when it contains another word """
    encoder = Encoder(vocab_size=1000, pct_bpe=0.98, ngram_max=4)
    encoder.fit(test_corpus)
    text = 'imimportport this yield toolz'
    idxs = list(encoder.transform([text]))
    idxs[0][1] = encoder.word_vocab['import']
    rebuilt = next(encoder.inverse_transform(idxs))
    assert rebuilt == 'import' + text[1:]

def test_common_byte_pair_collisions():
    """ Ensure common byte pairs like "as" don't pull from the word vocab when they
        appear as subwords.
    """
    encoder = Encoder(vocab_size=200, pct_bpe=0.9, ngram_max=2)
    encoder.fit(test_corpus + ["as"] * 10)

    word = next(encoder.transform(["8 miles as the crow flies."]))
    assert encoder.bpe_vocab["as"] not in word
    assert encoder.word_vocab["as"] in word

    subword = next(encoder.transform(["Basted turkey legs."]))
    assert encoder.word_vocab["as"] not in subword
    assert encoder.bpe_vocab["as"] in subword

def test_fixed_length_encoding():
    encoder = Encoder(pct_bpe=1, required_tokens=[PAD])
    encoder.fit(test_corpus)

    result = list(encoder.transform([''], fixed_length=10))
    assert len(result) == 1
    assert len(result[0]) == 10

    result = list(encoder.transform(['', 'import ' * 50], fixed_length=10))
    assert len(result) == 2
    assert len(result[0]) == 10
    assert len(result[1]) == 10

def test_dump_and_load():
    """ Should be able to dump encoder to dict, then load it again. """
    encoder = Encoder(pct_bpe=1, ngram_max=4)
    encoder.fit(test_corpus)
    assert encoder.tokenize('from toolz import reduce') == [
        SOW, 'f', 'ro', 'm', EOW,
        SOW, 'tool', 'z', EOW,
        SOW, 'impo', 'rt', EOW,
        SOW, 'redu', 'ce', EOW,
    ]

    encoder_d = encoder.vocabs_to_dict()
    new_encoder = Encoder.from_dict(encoder_d)
    assert new_encoder.tokenize('from toolz import reduce') == [
        SOW, 'f', 'ro', 'm', EOW,
        SOW, 'tool', 'z', EOW,
        SOW, 'impo', 'rt', EOW,
        SOW, 'redu', 'ce', EOW,
    ]

def test_inverse_transform():
    encoder = Encoder(pct_bpe=1)
    encoder.fit(test_corpus)
    transform = lambda text: next(encoder.inverse_transform(encoder.transform([text])))

    assert transform('this is how we do it') == 'this is how we do it'
    assert transform('looking at the promotional stuff, it looks good.') == \
        'looking at the promotional stuff {} it looks good .'.format(UNK)
    assert transform('almost nothing should be recognized! let\'s see...') == \
        'almost nothing should be recognized {unk} let {unk} s see ...'.format(unk=UNK)
    assert transform("Vizzini: He didn't fall? INCONCEIVABLE!") == \
        "vizzini {unk} he didn {unk} t fall {unk} inconceivable {unk}".format(unk=UNK)

def test_subword_tokenize():
    encoder = Encoder(pct_bpe=1)
    encoder.fit(test_corpus)
    assert list(encoder.subword_tokenize('this')) == [SOW, 'th', 'is', EOW]

def test_required_tokens():
    """ Should be able to require tokens to be present in encoder """
    encoder = Encoder(required_tokens=['cats', 'dogs'])
    encoder.fit(test_corpus)
    assert 'cats' in encoder.word_vocab
    assert 'dogs' in encoder.word_vocab

def test_single_letter_tokenizing():
    """ Should yield single letters when untrained """
    encoder = Encoder()
    assert encoder.tokenize('single letters') == \
        [SOW] + [UNK] * len('single') + [EOW, SOW] + [UNK] * len('letters') + [EOW]

def encoder_for_lines(lines):
    """ Calculate BPE encoder for provided lines of text """
    encoder = Encoder(vocab_size=VOCAB_SIZE, required_tokens=[START])
    encoder.fit(lines)
    encoder.save('latest_encoder.json')
    return encoder

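def _example_encoder_usage(corpus_path):
    """ Hedged usage sketch (not part of the original module): shows one way
        encoder_for_lines() might be driven from a line-separated corpus file.
        Assumes VOCAB_SIZE and START are defined elsewhere in this module, as
        encoder_for_lines() already requires; the corpus path is caller-supplied.
    """
    with open(corpus_path) as infile:
        encoder = encoder_for_lines([line.strip() for line in infile])
    # Tokenize a sample string to confirm the fitted encoder works end to end.
    print(encoder.tokenize('from toolz import reduce'))
    return encoder
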
def test_encoder_creation(vocab_size):
    """ Should be able to instantiate an Encoder with expected params """
    Encoder(vocab_size=vocab_size)

def test_tokenize():
    encoder = Encoder(pct_bpe=1)
    encoder.fit(test_corpus)
    assert list(encoder.tokenize('this is how')) == [
        SOW, 'th', 'is', EOW,
        SOW, 'is', EOW,
        SOW, 'ho', 'w', EOW,
    ]

def test_basic_transform():
    encoder = Encoder(pct_bpe=1)
    encoder.fit(test_corpus)
    assert len(list(encoder.transform(['this']))[0]) == 4

def test_encoder_learning_from_random_sentences(sentences):
    encoder = Encoder()
    encoder.fit(test_corpus)
    # transform() is lazy, so materialize the result to actually exercise encoding
    list(encoder.transform(sentences))