Example #1
def nlp() -> BertPipeline:
    # Here, we run integration-like tests rather than mocked unit tests.
    # We show how the pipeline works, which is why we use this
    # well-defined pipeline fixture.
    name = 'absa/classifier-rest-0.1'
    tokenizer = transformers.BertTokenizer.from_pretrained(name)
    model = BertABSClassifier.from_pretrained(name)
    nlp = BertPipeline(model, tokenizer)
    return nlp
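For context, a fixture like the one above is normally consumed by a test that simply declares it as an argument. The sketch below is illustrative only: it assumes the function is decorated with `@pytest.fixture`, that the names from the example above are in scope, and that the pipeline is called with a text plus a list of aspects, as shown in Example #3.

def test_pipeline_smoke(nlp: BertPipeline):
    # pytest injects the `nlp` fixture defined above. The pipeline takes raw
    # text plus the aspects we want classified (see Example #3 below).
    waiter, = nlp('The waiter was very attentive.', aspects=['waiter'])
    assert waiter.sentiment is not None
    assert len(waiter.scores) == 3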
Example #2
def nlp() -> BertPipeline:
    # Here, we run integration-like tests rather than mocked unit tests.
    # We show how the pipeline works, which is why we use this
    # well-defined pipeline fixture.
    name = 'absa/classifier-rest-0.1'
    tokenizer = transformers.BertTokenizer.from_pretrained(name)
    # We pass the config explicitly (although it can be downloaded automatically).
    config = BertABSCConfig.from_pretrained(name)
    model = BertABSClassifier.from_pretrained(name, config=config)
    nlp = BertPipeline(model, tokenizer)
    return nlp
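Loading the config explicitly is also the natural place to adjust it before the model is built. Below is a minimal sketch reusing the (unshown) imports of the examples above; the `num_polarities` attribute is an assumption standing in for whatever field the config actually uses for the number of sentiment classes.

def nlp_custom() -> BertPipeline:
    # Hypothetical variant of the fixture above: adjust the config before
    # the model is built.
    name = 'absa/classifier-rest-0.1'
    tokenizer = transformers.BertTokenizer.from_pretrained(name)
    config = BertABSCConfig.from_pretrained(name)
    config.num_polarities = 3  # assumed attribute name, for illustration only
    model = BertABSClassifier.from_pretrained(name, config=config)
    return BertPipeline(model, tokenizer)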
Example #3
def test_sanity_classifier():
    np.random.seed(1)
    tf.random.set_seed(1)
    # This sanity test verifies and presents how to train a classifier. To
    # build our model, we have to define a config, which contains all the
    # information required to build the `BertABSClassifier` model (including
    # the BERT language model). In this example, we use the default parameters
    # (which are set up for our best performance), but of course, you can pass
    # your own (for instance, you might want to change the number of
    # polarities to classify, or properties of BERT itself).
    base_model_name = 'bert-base-uncased'
    strategy = tf.distribute.OneDeviceStrategy('CPU')
    with strategy.scope():
        config = BertABSCConfig.from_pretrained(base_model_name)
        model = BertABSClassifier.from_pretrained(base_model_name,
                                                  config=config)
        tokenizer = transformers.BertTokenizer.from_pretrained(base_model_name)
        optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-8)

    # The first step in training the model is to define a dataset. The dataset
    # can be understood as the non-differentiable part of the training
    # pipeline. It knows how to transform human-understandable examples into
    # batches the model understands. You are not obligated to use datasets;
    # you can create your own iterable that transforms classifier examples
    # into classifier train batches (a rough sketch of such an iterable
    # appears after this example).
    example = LabeledExample(text='The breakfast was delicious, really great.',
                             aspect='breakfast',
                             sentiment=Sentiment.positive)
    dataset = ClassifierDataset(examples=[example, example],
                                tokenizer=tokenizer,
                                batch_size=2)

    # To easily adjust the optimization process to our needs, we define custom
    # training loops called routines (in contrast to using built-in methods
    # such as `fit`). Each routine has its own optimization step wherein we
    # can control which parameters are updated, and how (following the custom
    # training paradigm introduced in TensorFlow 2.0). We iterate over the
    # dataset, perform train/test optimization steps, and collect results
    # using callbacks (which have an interface similar to tf.keras.Callback).
    # Please take a look at the `train_classifier` function for more details.
    logger, loss_value = Logger(), LossHistory()
    train_classifier(model,
                     optimizer,
                     dataset,
                     epochs=10,
                     callbacks=[logger, loss_value],
                     strategy=strategy)

    # Our model should easily overfit in just 10 iterations.
    assert 1 < loss_value.train[1] < 2
    assert loss_value.train[10] < 2e-2

    # In the end, we would like to save the model. Our implementation gently
    # extends the *transformers* library's capabilities; as a consequence,
    # `BertABSClassifier` inherits from `TFBertPreTrainedModel`, and
    # serialization is straightforward.
    model.save_pretrained('.')

    # To make sure that the model serving works fine, we initialize the model
    # and the config once again. We perform the check on a single example.
    del model, config
    config = BertABSCConfig.from_pretrained('.')
    model = BertABSClassifier.from_pretrained('.', config=config)
    batch = next(iter(dataset))
    model_outputs = model.call(batch.token_ids,
                               attention_mask=batch.attention_mask,
                               token_type_ids=batch.token_type_ids)
    logits, *details = model_outputs
    loss_fn = tf.nn.softmax_cross_entropy_with_logits
    loss_value = loss_fn(batch.target_labels, logits, axis=-1, name='Loss')
    loss_value = loss_value.numpy().mean()
    assert loss_value < 2e-2

    # The training procedure is roughly verified. Now, using our tuned model,
    # we can build the `BertPipeline`. The pipeline is the high-level
    # interface for making predictions. The model should be highly confident
    # that this is a positive example (verify the softmax scores).
    nlp = BertPipeline(model, tokenizer)
    breakfast, = nlp(example.text, aspects=['breakfast'])
    assert breakfast.sentiment == Sentiment.positive
    assert np.allclose(breakfast.scores, [0.0, 0.0, 0.99], atol=0.01)

    # That's all; clean up the configuration and the temporarily saved model.
    os.remove('config.json')
    os.remove('tf_model.h5')
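To make the remark in Example #3 about replacing `ClassifierDataset` with your own iterable concrete, here is a rough, hypothetical sketch. It only mirrors the four batch fields the test reads (`token_ids`, `attention_mask`, `token_type_ids`, `target_labels`); the `SimpleBatch` container, the tokenizer call, and the integer labels are assumptions, and the real `train_classifier` routine may expect the library's own batch type.

from dataclasses import dataclass
from typing import Iterator, List, Tuple

import tensorflow as tf
import transformers


@dataclass
class SimpleBatch:
    # Hypothetical container exposing the fields read in the test above.
    token_ids: tf.Tensor
    attention_mask: tf.Tensor
    token_type_ids: tf.Tensor
    target_labels: tf.Tensor


def simple_batches(
        examples: List[Tuple[str, str, int]],  # (text, aspect, label id)
        tokenizer: transformers.BertTokenizer,
        batch_size: int,
        num_polarities: int = 3,
) -> Iterator[SimpleBatch]:
    # Encode each (text, aspect) pair together and group pairs into batches.
    for i in range(0, len(examples), batch_size):
        chunk = examples[i:i + batch_size]
        encoded = tokenizer(
            [text for text, _, _ in chunk],
            [aspect for _, aspect, _ in chunk],
            padding=True,
            return_tensors='tf',
        )
        labels = tf.one_hot([label for _, _, label in chunk],
                            depth=num_polarities)
        yield SimpleBatch(token_ids=encoded['input_ids'],
                          attention_mask=encoded['attention_mask'],
                          token_type_ids=encoded['token_type_ids'],
                          target_labels=labels)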