Example 1
def train():
    ds_train = data.Dataset.load('imdb_train.bin')
    ds_val = data.Dataset.load('imdb_test.bin')

    # use fastText embeddings trained on Simple English Wikipedia
    factory = TokenModelFactory(ds_train.num_classes,
                                ds_train.tokenizer.token_index,
                                max_tokens=MAX_LEN,
                                embedding_type='fasttext.wiki.simple',
                                embedding_dims=300)

    word_encoder_model = YoonKimCNN()
    # word_encoder_model = AttentionRNN()
    # word_encoder_model = StackedRNN()
    # word_encoder_model = BasicRNN()

    # freeze word embeddings
    model = factory.build_model(token_encoder_model=word_encoder_model,
                                trainable_embeddings=False)

    # use experiment.train as a wrapper around Keras' fit()
    experiment.train(x=ds_train.X,
                     y=ds_train.y,
                     validation_data=(ds_val.X, ds_val.y),
                     model=model,
                     word_encoder_model=word_encoder_model)
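Example 1 assumes that imdb_train.bin and imdb_test.bin already exist on disk. A minimal preprocessing sketch that could produce them, modelled on Example 2's use of experiment.setup_data below (imports are omitted as in the surrounding excerpts; the return order of imdb() and whether one tokenizer instance may be reused for both calls are assumptions):

def setup():
    # load the raw IMDB reviews; the return order (X_train, y_train, X_test, y_test)
    # is assumed from the `X, y, _, _ = imdb(10)` call in Example 2
    X_train, y_train, X_test, y_test = imdb()

    # tokenize with spaCy, as in Example 2
    tokenizer = SpacyTokenizer()

    # preprocess once and persist the two datasets that train() loads
    experiment.setup_data(X_train, y_train, tokenizer, 'imdb_train.bin', max_len=MAX_LEN)
    experiment.setup_data(X_test, y_test, tokenizer, 'imdb_test.bin', max_len=MAX_LEN)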
Example 2
def test_train():
    X, y, _, _ = imdb(10)

    # use the tokenizer that was used when constructing the embeddings
    tokenizer = SpacyTokenizer()

    # preprocess data (once)
    experiment.setup_data(X, y, tokenizer, 'data.bin', max_len=100)

    # load data
    ds = Dataset.load('data.bin')

    # construct the base model factory
    factory = TokenModelFactory(ds.num_classes,
                                ds.tokenizer.token_index,
                                max_tokens=100,
                                embedding_type='glove.6B.50d',
                                embedding_dims=50)

    # choose a model
    word_encoder_model = YoonKimCNN()

    # build a model
    model = factory.build_model(token_encoder_model=word_encoder_model,
                                trainable_embeddings=False)

    # use experiment.train as a wrapper around Keras' fit()
    experiment.train(x=ds.X,
                     y=ds.y,
                     validation_split=0.1,
                     model=model,
                     word_encoder_model=word_encoder_model,
                     epochs=1,
                     batch_size=32)
Example 3
def train():
    ds = Dataset.load(path)
    X_train, _, y_train, _ = ds.train_val_split()

    print(ds.tokenizer.decode_texts(X_train[:10]))

    print(y_train[:10])

    # RNN models can use `max_tokens=None` to allow a variable sequence length per mini-batch (see the variant sketched after this example).
    factory = TokenModelFactory(
        2, ds.tokenizer.token_index, max_tokens=max_len, embedding_type='glove.6B.300d')
    # 2, ds.tokenizer.token_index, max_tokens=max_len, embedding_type='fasttext.simple')

    # word_encoder_model = YoonKimCNN()
    # word_encoder_model = AlexCNN(dropout_rate=[0, 0])
    # word_encoder_model = AttentionRNN()
    # word_encoder_model = StackedRNN()
    word_encoder_model = BasicRNN()
    model = factory.build_model(
        token_encoder_model=word_encoder_model, trainable_embeddings=False)

    model.compile(optimizer='sgd',
                  loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()

    model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1)
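The comment above mentions that RNN encoders accept `max_tokens=None` for variable-length mini-batches; the `_test_build` helper further down exercises that path via `allows_dynamic_length()`. A hedged variant of the factory call for that case (combining `max_tokens=None` with a pretrained embedding type is an assumption; `_test_build` only shows it with `embedding_type=None`):

# variable sequence length per mini-batch; only encoders whose
# allows_dynamic_length() returns True (the RNNs) support this
factory = TokenModelFactory(
    2, ds.tokenizer.token_index, max_tokens=None, embedding_type='glove.6B.300d')
word_encoder_model = BasicRNN()
model = factory.build_model(
    token_encoder_model=word_encoder_model, trainable_embeddings=False)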
Example 4
def test_train():
    X, y, _, _ = imdb(10)

    tokenizer = SpacyTokenizer()

    tokenizer.build_vocab(X)

    # only keep the 20 most frequent tokens
    tokenizer.apply_encoding_options(limit_top_tokens=20)

    X_encoded = tokenizer.encode_texts(X)
    X_padded = tokenizer.pad_sequences(
        X_encoded, fixed_token_seq_length=max_len)

    y_cat = keras.utils.to_categorical(y, num_classes=2)

    ds = Dataset(X_padded, y_cat, tokenizer=tokenizer)

    X_train, _, y_train, _ = ds.train_val_split()

    # RNN models can use `max_tokens=None` to allow a variable sequence length per mini-batch.
    factory = TokenModelFactory(
        2, ds.tokenizer.token_index, max_tokens=max_len, embedding_type='glove.6B.300d')
    word_encoder_model = YoonKimCNN(dropout_rate=0)
    model = factory.build_model(
        token_encoder_model=word_encoder_model, trainable_embeddings=False)

    sgd = keras.optimizers.SGD(
        lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)

    model.compile(optimizer=sgd,
                  loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()

    model.fit(X_train, y_train, epochs=1, batch_size=32, validation_split=0.1)
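Example 4 wires the preprocessing steps up by hand (build_vocab, apply_encoding_options, encode_texts, pad_sequences, to_categorical). Examples 2 and 8 reach the same point through experiment.setup_data; a roughly equivalent sketch, assuming the helper also handles the label encoding as Example 2 suggests (the limit_top_tokens restriction has no visible counterpart in setup_data and is dropped here):

def test_train_with_helper():
    X, y, _, _ = imdb(10)

    # one-call preprocessing, persisted to disk
    tokenizer = SpacyTokenizer()
    experiment.setup_data(X, y, tokenizer, 'data.bin', max_len=max_len)

    # reload and split exactly as in Example 4
    ds = Dataset.load('data.bin')
    X_train, _, y_train, _ = ds.train_val_split()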
def _test_build(token_encoder_model):
    test_index = {'hello': 1, 'kitty': 2}

    if token_encoder_model.allows_dynamic_length():
        factory = TokenModelFactory(1,
                                    test_index,
                                    max_tokens=None,
                                    embedding_type=None)
        model = factory.build_model(token_encoder_model)
        model.compile(optimizer='adam', loss='categorical_crossentropy')
        model.summary()
    else:
        # Should fail since this model does not allow dynamic mini-batches.
        factory = TokenModelFactory(1,
                                    test_index,
                                    max_tokens=None,
                                    embedding_type=None)
        with pytest.raises(ValueError):
            factory.build_model(token_encoder_model)

        factory = TokenModelFactory(1,
                                    test_index,
                                    max_tokens=100,
                                    embedding_type=None)
        model = factory.build_model(token_encoder_model)
        model.compile(optimizer='adam', loss='categorical_crossentropy')
        model.summary()
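_test_build is a helper rather than a test in itself; an encoder instance has to be passed in. A hypothetical parametrised entry point (the encoder list is an assumption; these are simply the classes used in the examples above):

@pytest.mark.parametrize('token_encoder_model',
                         [YoonKimCNN(), BasicRNN(), StackedRNN(), AttentionRNN()])
def test_build(token_encoder_model):
    _test_build(token_encoder_model)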
Example 6
def train(word_encoder_model, lr, batch_size, results_base_dir):
    ds = Dataset.load(proc_path)

    factory = TokenModelFactory(ds.num_classes,
                                ds.tokenizer.token_index,
                                max_tokens=max_len,
                                embedding_type="fasttext.wiki.de",
                                embedding_dims=300)

    model = factory.build_model(token_encoder_model=word_encoder_model,
                                trainable_embeddings=False)

    experiment.train(x=ds.X,
                     y=ds.y,
                     validation_split=0.1,
                     model=model,
                     word_encoder_model=word_encoder_model,
                     batch_size=batch_size,
                     epochs=5)
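Unlike the other examples, this train() takes the encoder and the hyperparameters as arguments, which suggests an outer sweep drives it (note that lr is not consumed in the body shown). A hypothetical driver loop; the encoder list, learning rates and results directory are assumptions:

for word_encoder_model in [YoonKimCNN(), AttentionRNN(), StackedRNN()]:
    for lr in [0.01, 0.001]:
        train(word_encoder_model, lr, batch_size=32,
              results_base_dir='results/')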
def train():
    ds_train = data.Dataset.load('imdb_train.bin')
    ds_val = data.Dataset.load('imdb_test.bin')

    factory = TokenModelFactory(ds_train.num_classes,
                                ds_train.tokenizer.token_index,
                                max_tokens=MAX_LEN,
                                embedding_dims=EMB_DIMS,
                                embedding_type=None)

    word_encoder_model = AveragingEncoder()

    # no pretrained vectors (embedding_type=None), so the embeddings stay trainable
    model = factory.build_model(token_encoder_model=word_encoder_model,
                                trainable_embeddings=True)

    # use experiment.train as a wrapper around Keras' fit()
    experiment.train(x=ds_train.X,
                     y=ds_train.y,
                     validation_data=(ds_val.X, ds_val.y),
                     model=model,
                     word_encoder_model=word_encoder_model,
                     epochs=EPOCHS)
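With embedding_type=None there are no pretrained vectors, which is why the randomly initialised embeddings stay trainable here. Appended at the end of train(), a small hedged check on the held-out split (this assumes experiment.train leaves model compiled with an accuracy metric, which the excerpt does not show):

    # evaluate on the held-out IMDB test split loaded above
    loss, acc = model.evaluate(ds_val.X, ds_val.y, batch_size=32)
    print('validation loss %.4f, accuracy %.4f' % (loss, acc))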
Example 8
def test_train_multi_label():
    X, y = ['what is up', 'yes yes', 'no no  no'], [["foo", "bar"], ["foo"],
                                                    ["bar", "haha"]]

    # use the tokenizer that was used when constructing the embeddings
    tokenizer = SimpleTokenizer()

    # preprocess data (once)
    experiment.setup_data(X, y, tokenizer, 'data.bin', max_len=100)

    # load data
    ds = Dataset.load('data.bin')

    # construct the base model factory
    factory = TokenModelFactory(ds.num_classes,
                                ds.tokenizer.token_index,
                                max_tokens=100,
                                embedding_type='glove.6B.50d',
                                embedding_dims=50)

    # choose a model
    word_encoder_model = YoonKimCNN()

    # build a model
    model = factory.build_model(token_encoder_model=word_encoder_model,
                                trainable_embeddings=False,
                                output_activation="sigmoid")

    # use experiment.train as a wrapper around Keras' fit()
    experiment.train(x=ds.X,
                     y=ds.y,
                     validation_split=0.1,
                     model=model,
                     word_encoder_model=word_encoder_model,
                     epochs=1,
                     batch_size=32)
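Because the model is built with output_activation="sigmoid", each label is scored independently rather than through a softmax. Appended at the end of test_train_multi_label(), a minimal sketch of turning the scores into multi-label decisions (the 0.5 threshold is an arbitrary choice, not from the source):

    probs = model.predict(ds.X)   # one independent probability per label
    predicted = probs > 0.5       # boolean (samples x labels) indicator matrix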