def train():
    ds_train = data.Dataset.load('imdb_train.bin')
    ds_val = data.Dataset.load('imdb_test.bin')

    # use the embedding trained on Simple English Wikipedia
    factory = TokenModelFactory(
        ds_train.num_classes, ds_train.tokenizer.token_index, max_tokens=MAX_LEN,
        embedding_type='fasttext.wiki.simple', embedding_dims=300)

    word_encoder_model = YoonKimCNN()
    # word_encoder_model = AttentionRNN()
    # word_encoder_model = StackedRNN()
    # word_encoder_model = BasicRNN()

    # freeze word embeddings
    model = factory.build_model(
        token_encoder_model=word_encoder_model, trainable_embeddings=False)

    # use experiment.train as wrapper for Keras.fit()
    experiment.train(x=ds_train.X, y=ds_train.y, validation_data=(ds_val.X, ds_val.y),
                     model=model, word_encoder_model=word_encoder_model)
def test_train():
    X, y, _, _ = imdb(10)

    # use the special tokenizer used for constructing the embeddings
    tokenizer = SpacyTokenizer()

    # preprocess data (once)
    experiment.setup_data(X, y, tokenizer, 'data.bin', max_len=100)

    # load data
    ds = Dataset.load('data.bin')

    # construct base
    factory = TokenModelFactory(
        ds.num_classes, ds.tokenizer.token_index, max_tokens=100,
        embedding_type='glove.6B.50d', embedding_dims=50)

    # choose a model
    word_encoder_model = YoonKimCNN()

    # build a model
    model = factory.build_model(
        token_encoder_model=word_encoder_model, trainable_embeddings=False)

    # use experiment.train as wrapper for Keras.fit()
    experiment.train(x=ds.X, y=ds.y, validation_split=0.1, model=model,
                     word_encoder_model=word_encoder_model, epochs=1, batch_size=32)
def train():
    ds = Dataset.load(path)
    X_train, _, y_train, _ = ds.train_val_split()

    print(ds.tokenizer.decode_texts(X_train[:10]))
    print(y_train[:10])

    # RNN models can use `max_tokens=None` to indicate variable-length sequences per mini-batch.
    factory = TokenModelFactory(
        2, ds.tokenizer.token_index, max_tokens=max_len, embedding_type='glove.6B.300d')
    # alternative embedding:
    # factory = TokenModelFactory(
    #     2, ds.tokenizer.token_index, max_tokens=max_len, embedding_type='fasttext.simple')

    # pick exactly one encoder model (the last uncommented assignment wins)
    # word_encoder_model = YoonKimCNN()
    # word_encoder_model = AlexCNN(dropout_rate=[0, 0])
    # word_encoder_model = AttentionRNN()
    # word_encoder_model = StackedRNN()
    word_encoder_model = BasicRNN()

    model = factory.build_model(
        token_encoder_model=word_encoder_model, trainable_embeddings=False)
    model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()

    model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1)
def test_train():
    X, y, _, _ = imdb(10)

    tokenizer = SpacyTokenizer()
    tokenizer.build_vocab(X)

    # only keep the top 20 tokens
    tokenizer.apply_encoding_options(limit_top_tokens=20)

    X_encoded = tokenizer.encode_texts(X)
    X_padded = tokenizer.pad_sequences(
        X_encoded, fixed_token_seq_length=max_len)
    y_cat = keras.utils.to_categorical(y, num_classes=2)

    ds = Dataset(X_padded, y_cat, tokenizer=tokenizer)
    X_train, _, y_train, _ = ds.train_val_split()

    # RNN models can use `max_tokens=None` to indicate variable-length sequences per mini-batch.
    factory = TokenModelFactory(
        2, ds.tokenizer.token_index, max_tokens=max_len, embedding_type='glove.6B.300d')

    word_encoder_model = YoonKimCNN(dropout_rate=0)

    model = factory.build_model(
        token_encoder_model=word_encoder_model, trainable_embeddings=False)

    sgd = keras.optimizers.SGD(
        lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()

    model.fit(X_train, y_train, epochs=1, batch_size=32, validation_split=0.1)
def _test_build(token_encoder_model):
    test_index = {'hello': 1, 'kitty': 2}

    if token_encoder_model.allows_dynamic_length():
        factory = TokenModelFactory(
            1, test_index, max_tokens=None, embedding_type=None)
        model = factory.build_model(token_encoder_model)
        model.compile(optimizer='adam', loss='categorical_crossentropy')
        model.summary()
    else:
        # Should fail since this model does not allow dynamic mini-batches.
        factory = TokenModelFactory(
            1, test_index, max_tokens=None, embedding_type=None)
        with pytest.raises(ValueError):
            factory.build_model(token_encoder_model)

        factory = TokenModelFactory(
            1, test_index, max_tokens=100, embedding_type=None)
        model = factory.build_model(token_encoder_model)
        model.compile(optimizer='adam', loss='categorical_crossentropy')
        model.summary()
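# Possible way to drive _test_build across several encoders via pytest's parametrize.
# This is a sketch: the `texcla.models` import path and the set of encoder classes
# available there are assumptions, not confirmed by the snippets in this file.
import pytest
from texcla.models import YoonKimCNN, AttentionRNN, StackedRNN  # assumed import path

@pytest.mark.parametrize('token_encoder_model',
                         [YoonKimCNN(), AttentionRNN(), StackedRNN()])
def test_build(token_encoder_model):
    _test_build(token_encoder_model)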
def train(word_encoder_model, lr, batch_size, results_base_dir):
    ds = Dataset.load(proc_path)

    factory = TokenModelFactory(
        ds.num_classes, ds.tokenizer.token_index, max_tokens=max_len,
        embedding_type="fasttext.wiki.de", embedding_dims=300)

    model = factory.build_model(
        token_encoder_model=word_encoder_model, trainable_embeddings=False)

    experiment.train(x=ds.X, y=ds.y, validation_split=0.1, model=model,
                     word_encoder_model=word_encoder_model, epochs=5)
def train():
    ds_train = data.Dataset.load('imdb_train.bin')
    ds_val = data.Dataset.load('imdb_test.bin')

    factory = TokenModelFactory(
        ds_train.num_classes, ds_train.tokenizer.token_index, max_tokens=MAX_LEN,
        embedding_dims=EMB_DIMS, embedding_type=None)

    word_encoder_model = AveragingEncoder()

    # no pre-trained embeddings (embedding_type=None), so the embeddings must be trained
    model = factory.build_model(
        token_encoder_model=word_encoder_model, trainable_embeddings=True)

    # use experiment.train as wrapper for Keras.fit()
    experiment.train(x=ds_train.X, y=ds_train.y, validation_data=(ds_val.X, ds_val.y),
                     model=model, word_encoder_model=word_encoder_model, epochs=EPOCHS)
def test_train_multi_label():
    X = ['what is up', 'yes yes', 'no no no']
    y = [["foo", "bar"], ["foo"], ["bar", "haha"]]

    # use the special tokenizer used for constructing the embeddings
    tokenizer = SimpleTokenizer()

    # preprocess data (once)
    experiment.setup_data(X, y, tokenizer, 'data.bin', max_len=100)

    # load data
    ds = Dataset.load('data.bin')

    # construct base
    factory = TokenModelFactory(
        ds.num_classes, ds.tokenizer.token_index, max_tokens=100,
        embedding_type='glove.6B.50d', embedding_dims=50)

    # choose a model
    word_encoder_model = YoonKimCNN()

    # build a model
    model = factory.build_model(
        token_encoder_model=word_encoder_model, trainable_embeddings=False,
        output_activation="sigmoid")

    # use experiment.train as wrapper for Keras.fit()
    experiment.train(x=ds.X, y=ds.y, validation_split=0.1, model=model,
                     word_encoder_model=word_encoder_model, epochs=1, batch_size=32)
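# Minimal sketch (not part of the snippets above): with output_activation="sigmoid",
# the model scores each label independently, so multi-label predictions are obtained
# by thresholding the per-class probabilities after training. The helper name and the
# 0.5 threshold are illustrative assumptions.
def predict_labels(model, X, threshold=0.5):
    """Return a binary label matrix from a sigmoid multi-label model."""
    probs = model.predict(X)                  # shape: (num_samples, num_classes)
    return (probs >= threshold).astype(int)   # independent yes/no decision per label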