コード例 #1
0
ファイル: test_mlmpreprocessor.py プロジェクト: jamesmf/cclm
def test_save_load(tmp_path):
    """Round-trip check: save() then load preserves ``max_example_len``."""
    max_len = 21
    original = MLMPreprocessor(max_example_len=max_len)
    original.save(tmp_path)
    config_path = tmp_path / "cclm_config.json"
    reloaded = MLMPreprocessor(load_from=config_path)
    assert reloaded.max_example_len == max_len
コード例 #2
0
ファイル: test_mlmpreprocessor.py プロジェクト: jamesmf/cclm
def test_prep_encode_string():
    """Encoding a string seen during fit() yields no unknown (0) ids and the requested length."""
    preprocessor = MLMPreprocessor(max_example_len=10)
    preprocessor.fit(CORPUS)
    encoded = preprocessor.string_to_array(CORPUS[0], 5)
    assert (
        0 not in encoded
    ), "0 present when encoding string whose characters should all be in char_dict"
    assert len(encoded) == 5, "shape of string_to_array incorrect"
コード例 #3
0
ファイル: test_pretrainer.py プロジェクト: jamesmf/cclm
def test_unfreeze():
    """Training an unfrozen pretrainer should change the model's weights."""

    def weight_mean(model):
        # Mirrors the original filter: only entries whose first element is an
        # ndarray contribute (scalar first elements of 1-D weights are skipped).
        return np.mean(
            [np.mean(w[0]) for w in model.get_weights() if isinstance(w[0], np.ndarray)]
        )

    prep = MLMPreprocessor(max_example_len=10)
    prep.fit(CORPUS)
    base = CCLMModelBase(preprocessor=prep)
    mlp = MaskedLanguagePretrainer(base=base)
    mlp.fit(CORPUS, epochs=1)  # fit easy way to build model weights

    before = weight_mean(mlp.model)
    print(before)
    mlp.fit(CORPUS, epochs=1)
    mlp.fit(CORPUS, epochs=5, print_interval=1)
    after = weight_mean(mlp.model)
    assert before != after, "unfreeze did not work, weights remained the same"
コード例 #4
0
ファイル: test_pretrainer.py プロジェクト: jamesmf/cclm
def test_freeze():
    """After freeze(), further fit() calls must leave the weights untouched."""

    def weight_mean(model):
        # Mirrors the original filter: only entries whose first element is an
        # ndarray contribute (scalar first elements of 1-D weights are skipped).
        return np.mean(
            [np.mean(w[0]) for w in model.get_weights() if isinstance(w[0], np.ndarray)]
        )

    prep = MLMPreprocessor(max_example_len=10)
    prep.fit(CORPUS)
    base = CCLMModelBase(preprocessor=prep)
    mlp = MaskedLanguagePretrainer(base=base)
    mlp.fit(CORPUS, epochs=1)

    mlp.freeze()
    before = weight_mean(mlp.model)
    print(before)
    mlp.fit(CORPUS, epochs=1)
    after = weight_mean(mlp.model)
    assert before == after, "freeze did not work, weights changed"
コード例 #5
0
import numpy as np
import tensorflow as tf
from tensorflow.keras.mixed_precision import experimental as mixed_precision

# Enable mixed precision globally (float16 compute, float32 variables).
# NOTE(review): the `experimental` mixed-precision API is deprecated in newer
# TF releases (use tf.keras.mixed_precision directly) — confirm TF version.
policy = mixed_precision.Policy("mixed_float16")
mixed_precision.set_policy(policy)

# get AG News dataset as an example dataset
# NOTE(review): `load_dataset` (HF datasets) and `MLMPreprocessor` are
# presumably imported earlier in this file — not visible in this chunk.
dataset = load_dataset("ag_news", cache_dir="/app/cclm/.datasets")
dataset_train = dataset["train"]["text"]
dataset_test = dataset["test"]["text"]
# One-hot encode the integer class labels for categorical training targets.
y_train = tf.keras.utils.to_categorical(dataset["train"]["label"])
y_test = tf.keras.utils.to_categorical(dataset["test"]["label"])

# create the preprocessor and fit it on the training set
prep = MLMPreprocessor(max_example_len=1024)
prep.fit(dataset_train)

# Vectorize every example into a fixed-length (max_example_len) integer array.
x_train = np.array(
    [prep.string_to_array(i, prep.max_example_len) for i in dataset_train])
x_test = np.array(
    [prep.string_to_array(i, prep.max_example_len) for i in dataset_test])

# # create a base
# base = CCLMModelBase(preprocessor=prep)

# # create two pretrainers that we'll combine
# pretrainer_a = MaskedLanguagePretrainer(
#     base=base,
#     downsample_factor=16,
#     n_strided_convs=4,
コード例 #6
0
ファイル: test_mlmpreprocessor.py プロジェクト: jamesmf/cclm
def test_default_tokenizer_behavior(tmp_path):
    """A default-constructed preprocessor fit on CORPUS exposes expected vocab tokens."""
    # NOTE: the tmp_path fixture is requested but not used in this test.
    preprocessor = MLMPreprocessor()
    preprocessor.fit(CORPUS)
    assert (
        "string" in preprocessor.tokenizer.get_vocab()
    ), "fit tokenizer does not have expected tokens"
コード例 #7
0
ファイル: test_mlmpreprocessor.py プロジェクト: jamesmf/cclm
def test_prep_fit_char_dict():
    """fit() keeps only characters seen at least ``min_char_freq`` times."""
    preprocessor = MLMPreprocessor()
    # "a" occurs 3 times (kept); "b" occurs once (dropped at min_char_freq=2).
    preprocessor.fit(["a a", "b a"], min_char_freq=2)
    print(preprocessor.char_dict)
    assert "a" in preprocessor.char_dict, "char dict not fit properly"
    assert "b" not in preprocessor.char_dict, "char dict contains characters below min value"
コード例 #8
0
# CLI flag: resume training from artifacts previously saved under .models/.
# NOTE(review): `ap` (argparse.ArgumentParser) is created earlier in this
# file — not visible in this chunk.
ap.add_argument(
    "--load",
    dest="load_existing",
    help="continue training from .models/ folder",
    action="store_true",
)
args = ap.parse_args()

# Wikitext-103 (raw) training split is the pretraining corpus.
dataset = load_dataset("wikitext",
                       "wikitext-103-raw-v1",
                       cache_dir="/app/cclm/.datasets")
dataset = dataset["train"]["text"]

if not args.load_existing:

    # Fresh run: fit a new preprocessor on the first 100k lines only
    # (presumably to bound fitting time — TODO confirm).
    prep = MLMPreprocessor(tokenizer_path=None, max_example_len=512)
    prep.fit(dataset[:100000])
else:
    # Resume: restore the previously pickled preprocessor.
    # NOTE(review): pickle.load executes arbitrary code from the file —
    # only load artifacts you trust.
    with open(".models/prep_test.pkl", "rb") as f:
        prep = pickle.load(f)

base = CCLMModelBase(preprocessor=prep)

if args.load_existing:
    # Resume: rebuild the pretrainer from saved config/weights and restore
    # the saved embedder onto the base.
    pretrainer = MaskedLanguagePretrainer(base=base,
                                          downsample_factor=16,
                                          n_strided_convs=4,
                                          load_from=".models/mlm_test")
    base.embedder = tf.keras.models.load_model(".models/mlm_embedder")
else:
    pretrainer = MaskedLanguagePretrainer(