def test_save_load(tmp_path):
    mel = 21
    p = MLMPreprocessor(max_example_len=mel)
    p.save(tmp_path)
    saved = tmp_path / "cclm_config.json"
    p2 = MLMPreprocessor(load_from=saved)
    assert p2.max_example_len == mel
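# For reference, a minimal sketch of the JSON round trip exercised above.
# Assumptions: the key name and helper functions are illustrative, not cclm's
# actual save/load implementation; only the filename comes from the test.
import json
from pathlib import Path


def save_config(directory: Path, max_example_len: int) -> None:
    # hypothetical helper: write the config file the loader expects
    with open(directory / "cclm_config.json", "w") as f:
        json.dump({"max_example_len": max_example_len}, f)


def load_config(path: Path) -> dict:
    # hypothetical helper: read the config back for MLMPreprocessor(load_from=...)
    with open(path) as f:
        return json.load(f)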
def test_prep_encode_string():
    prep = MLMPreprocessor(max_example_len=10)
    prep.fit(CORPUS)
    my_string = CORPUS[0]
    example = prep.string_to_array(my_string, 5)
    assert (
        0 not in example
    ), "0 present when encoding string whose characters should all be in char_dict"
    assert len(example) == 5, "shape of string_to_array incorrect"
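# Hedged sketch of the encoding contract string_to_array is tested against:
# characters seen during fit map to nonzero ids, and the output has the
# requested fixed length. Illustrative reimplementation, not cclm's code.
import numpy as np


def string_to_array(s: str, length: int, char_dict: dict) -> np.ndarray:
    out = np.zeros(length, dtype=np.int32)  # 0 is reserved for padding/unknown
    for i, ch in enumerate(s[:length]):
        out[i] = char_dict.get(ch, 0)
    return out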
def test_unfreeze():
    prep = MLMPreprocessor(max_example_len=10)
    prep.fit(CORPUS)
    base = CCLMModelBase(preprocessor=prep)
    mlp = MaskedLanguagePretrainer(base=base)
    mlp.fit(CORPUS, epochs=1)  # fitting is an easy way to build model weights
    # freeze then unfreeze so the later fits should update the weights again
    mlp.freeze()
    mlp.unfreeze()
    mean = np.mean(
        [np.mean(i[0]) for i in mlp.model.get_weights() if isinstance(i[0], np.ndarray)]
    )
    print(mean)
    mlp.fit(CORPUS, epochs=1)
    mlp.fit(CORPUS, epochs=5, print_interval=1)
    mean_new = np.mean(
        [np.mean(i[0]) for i in mlp.model.get_weights() if isinstance(i[0], np.ndarray)]
    )
    assert mean != mean_new, "unfreeze did not work, weights remained the same"
def test_freeze():
    prep = MLMPreprocessor(max_example_len=10)
    prep.fit(CORPUS)
    base = CCLMModelBase(preprocessor=prep)
    mlp = MaskedLanguagePretrainer(base=base)
    mlp.fit(CORPUS, epochs=1)
    mlp.freeze()
    mean = np.mean(
        [np.mean(i[0]) for i in mlp.model.get_weights() if isinstance(i[0], np.ndarray)]
    )
    print(mean)
    mlp.fit(CORPUS, epochs=1)
    mean_new = np.mean(
        [np.mean(i[0]) for i in mlp.model.get_weights() if isinstance(i[0], np.ndarray)]
    )
    assert mean == mean_new, "freeze did not work, weights changed"
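# Hedged sketch of what freeze()/unfreeze() typically amount to in Keras (an
# assumption about cclm's implementation, not a copy of it): toggle
# layer.trainable on every layer, remembering that the flag is only picked up
# when the model is (re)compiled.
def set_trainable(model, trainable: bool) -> None:
    for layer in model.layers:
        layer.trainable = trainable
    # recompile afterwards so the optimizer respects the new trainable set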
import numpy as np
import tensorflow as tf
from datasets import load_dataset  # HuggingFace datasets provides load_dataset
from tensorflow.keras.mixed_precision import experimental as mixed_precision

# module path assumed for the cclm import
from cclm.preprocessing import MLMPreprocessor

policy = mixed_precision.Policy("mixed_float16")
mixed_precision.set_policy(policy)

# get AG News dataset as an example dataset
dataset = load_dataset("ag_news", cache_dir="/app/cclm/.datasets")
dataset_train = dataset["train"]["text"]
dataset_test = dataset["test"]["text"]
y_train = tf.keras.utils.to_categorical(dataset["train"]["label"])
y_test = tf.keras.utils.to_categorical(dataset["test"]["label"])

# create the preprocessor and fit it on the training set
prep = MLMPreprocessor(max_example_len=1024)
prep.fit(dataset_train)

x_train = np.array(
    [prep.string_to_array(i, prep.max_example_len) for i in dataset_train]
)
x_test = np.array(
    [prep.string_to_array(i, prep.max_example_len) for i in dataset_test]
)

# # create a base
# base = CCLMModelBase(preprocessor=prep)

# # create two pretrainers that we'll combine
# pretrainer_a = MaskedLanguagePretrainer(
#     base=base,
#     downsample_factor=16,
#     n_strided_convs=4,
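# One caveat worth noting with the mixed_float16 policy above: layer
# computations run in float16, so it is standard practice to pin the final
# softmax layer to float32 for numeric stability. Minimal standalone sketch
# (layer sizes are illustrative, not cclm's architecture):
example_in = tf.keras.Input(shape=(1024,), dtype="int32")
h = tf.keras.layers.Embedding(input_dim=1000, output_dim=64)(example_in)
h = tf.keras.layers.GlobalAveragePooling1D()(h)
example_out = tf.keras.layers.Dense(4, activation="softmax", dtype="float32")(h)
example_model = tf.keras.Model(example_in, example_out)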
def test_default_tokenizer_behavior(tmp_path):
    p = MLMPreprocessor()
    p.fit(CORPUS)
    assert (
        "string" in p.tokenizer.get_vocab()
    ), "fit tokenizer does not have expected tokens"
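# Standalone sketch of fitting a vocabulary with the HuggingFace `tokenizers`
# package, which exposes the same get_vocab() accessor the test checks.
# Assumption: cclm wraps something similar; this is illustrative, not its code.
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordPieceTrainer

corpus = ["a string of text", "another string of text"]
tok = Tokenizer(WordPiece(unk_token="[UNK]"))
tok.pre_tokenizer = Whitespace()
tok.train_from_iterator(corpus, WordPieceTrainer(special_tokens=["[UNK]"]))
vocab = tok.get_vocab()  # maps token strings to integer ids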
def test_prep_fit_char_dict():
    prep = MLMPreprocessor()
    prep.fit(["a a", "b a"], min_char_freq=2)
    print(prep.char_dict)
    assert "a" in prep.char_dict, "char dict not fit properly"
    assert "b" not in prep.char_dict, "char dict contains characters below min value"
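# Hedged sketch of the min_char_freq behavior relied on above (illustrative;
# cclm's actual fit() may differ): count characters across the corpus and
# keep only those at or above the threshold, reserving id 0.
from collections import Counter


def fit_char_dict(corpus, min_char_freq=1):
    counts = Counter(ch for doc in corpus for ch in doc)
    kept = [ch for ch, n in counts.items() if n >= min_char_freq]
    return {ch: i + 1 for i, ch in enumerate(kept)}  # 0 reserved for padding


# with ["a a", "b a"] and min_char_freq=2, "a" and " " survive but "b" does not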
ap.add_argument(
    "--load",
    dest="load_existing",
    help="continue training from .models/ folder",
    action="store_true",
)
args = ap.parse_args()

dataset = load_dataset(
    "wikitext", "wikitext-103-raw-v1", cache_dir="/app/cclm/.datasets"
)
dataset = dataset["train"]["text"]

if not args.load_existing:
    prep = MLMPreprocessor(tokenizer_path=None, max_example_len=512)
    prep.fit(dataset[:100000])
else:
    with open(".models/prep_test.pkl", "rb") as f:
        prep = pickle.load(f)

base = CCLMModelBase(preprocessor=prep)

if args.load_existing:
    pretrainer = MaskedLanguagePretrainer(
        base=base,
        downsample_factor=16,
        n_strided_convs=4,
        load_from=".models/mlm_test",
    )
    base.embedder = tf.keras.models.load_model(".models/mlm_embedder")
else:
    pretrainer = MaskedLanguagePretrainer(