Ejemplo n.º 1
0
    def test_predict_into_collection(self, batch_size: Optional[int],
                                     append_if_exists: bool):
        """Check that `predict_into_collection` stores predictions under the
        mapped key names.

        Three scenarios are exercised with a loaded model:

        1. A first prediction pass over the training collection.
        2. A second pass over the same collection, which must append when
           `append_if_exists` is truthy and raise `KeyError` otherwise.
        3. A key mapping containing the misspelt key `'wordss'`, which must
           raise `KeyError` and leave the collection without predictions.

        :param batch_size: Forwarded unchanged to `predict_into_collection`.
        :param append_if_exists: Forwarded unchanged to
                                 `predict_into_collection`; also selects which
                                 outcome of the second pass is asserted.
        """
        model = AllenNLPModel('TE', self.CONFIG_FILE, 'target-tagger',
                              self.TARGET_EXTRACTION_MODEL)
        model.load()

        # Scenario 1: predictions land under the *mapped* names only.
        collection = TargetTextCollection.load_json(
            self.TARGET_EXTRACTION_TRAIN_DATA)
        key_mappings = {'tags': 'predicted_tags', 'words': 'predicted_tokens'}
        collection = model.predict_into_collection(collection, key_mappings,
                                                   batch_size,
                                                   append_if_exists)
        for target_text in collection.values():
            for mapped_key in ('predicted_tags', 'predicted_tokens'):
                assert mapped_key in target_text
            for raw_key in ('tags', 'tokens'):
                assert raw_key not in target_text

            # Each mapped key holds a list of prediction runs; the first run
            # must line up with the stored tokenisation.
            expected_tokens = target_text['tokenized_text']
            first_tags = target_text['predicted_tags'][0]
            first_tokens = target_text['predicted_tokens'][0]
            assert len(expected_tokens) == len(first_tags)
            assert len(expected_tokens) == len(first_tokens)
            assert expected_tokens == first_tokens

        # Scenario 2: predicting again into keys that already exist.
        if not append_if_exists:
            with pytest.raises(KeyError):
                collection = model.predict_into_collection(
                    collection, key_mappings, batch_size, append_if_exists)
        else:
            collection = model.predict_into_collection(collection,
                                                       key_mappings,
                                                       batch_size,
                                                       append_if_exists)
            for target_text in collection.values():
                expected_tokens = target_text['tokenized_text']
                tag_runs = target_text['predicted_tags']
                token_runs = target_text['predicted_tokens']
                assert 2 == len(tag_runs)
                assert tag_runs[0] == tag_runs[1]
                assert expected_tokens == token_runs[0]
                assert expected_tokens == token_runs[1]

        # Scenario 3: a mapping with the misspelt key 'wordss' must raise, and
        # the freshly loaded collection must end up without any predictions.
        from collections import OrderedDict
        bad_key_mappings = OrderedDict([('tags', 'predicted_tags'),
                                        ('wordss', 'predicted_tokens')])
        fresh_collection = TargetTextCollection.load_json(
            self.TARGET_EXTRACTION_TRAIN_DATA)
        with pytest.raises(KeyError):
            fresh_collection = model.predict_into_collection(
                fresh_collection, bad_key_mappings, batch_size,
                append_if_exists)
        for target_text in fresh_collection.values():
            for mapped_key in ('predicted_tags', 'predicted_tokens'):
                assert mapped_key not in target_text
    def test_predict_iter(self):
        """Exercise `_predict_iter` on unloaded, invalid, normal, large and
        exhausted inputs.

        The method is expected to raise `AssertionError` before the model is
        loaded, raise `TypeError` for a non-iterable input, yield one
        prediction dict per input for lists and iterators alike, and yield
        nothing for an already-exhausted iterator.
        """
        long_sentence = "The laptop case was great and cover was rubbish"
        short_sentence = "Another day at the office"
        data = [{"text": long_sentence},
                {"text": short_sentence},
                {"text": long_sentence}]

        model = AllenNLPModel('TE', self.CONFIG_FILE, 'target-tagger',
                              self.TARGET_EXTRACTION_MODEL)
        # The model has not been loaded yet, so predicting must fail.
        with pytest.raises(AssertionError):
            list(model._predict_iter(data))

        model.load()
        # An integer is neither a list nor an iterable.
        with pytest.raises(TypeError):
            list(model._predict_iter(5))

        # Normal cases: a list and a one-shot iterator over the same data.
        for data_type in [data, iter(data)]:
            predictions = list(model._predict_iter(data_type))
            assert 3 == len(predictions)
            assert isinstance(predictions[0], dict)
            short_prediction = predictions[1]
            assert 5 == len(short_prediction['tags'])
            assert 9 == len(short_prediction['class_probabilities'])

        # Larger dataset of 150 entries (the three examples repeated 50 times).
        larger_dataset = data * 50
        for data_type in [larger_dataset, iter(larger_dataset)]:
            predictions = list(model._predict_iter(data_type))
            assert 150 == len(predictions)
            assert isinstance(predictions[0], dict)
            penultimate, last = predictions[-2], predictions[-1]
            assert 5 == len(penultimate['tags'])
            assert 9 == len(penultimate['class_probabilities'])
            assert 9 == len(last['tags'])
            assert 9 == len(last['class_probabilities'])

        # No data at all, which can happen when an iterator has already been
        # consumed elsewhere. Draining it here guarantees it is empty.
        alt_data = iter(data)
        assert 3 == len(list(alt_data))
        predictions = list(model._predict_iter(alt_data))
        assert not predictions
Ejemplo n.º 3
0
 def test_predict_sequences(self, batch_size: Optional[int]):
     """Check the dictionaries yielded by `predict_sequences`.

     Every prediction must have exactly four entries whose `text`,
     `tokens` and `sequence_labels` equal the expected answer, and whose
     `confidence` list has the expected length with every score strictly
     between 0.333333 and 1. The number of predictions must also equal the
     number of expected answers.

     :param batch_size: Forwarded unchanged to `predict_sequences`.
     """
     data = [{
         "text": "The laptop case was great and cover was rubbish"
     }, {
         "text": "Another day at the office"
     }, {
         "text": "The laptop case was great and cover was rubbish"
     }]
     # The `confidence` values below are placeholders: only their length is
     # compared, the actual scores are range-checked instead.
     answers = [{
         "sequence_labels": ['O', 'B', 'B', 'O', 'O', 'B', 'O', 'O', 'B'],
         "confidence": [0, 1, 2, 3, 4, 5, 6, 7, 8],
         "text":
         "The laptop case was great and cover was rubbish",
         "tokens":
         "The laptop case was great and cover was rubbish".split()
     }, {
         "sequence_labels": ['O', 'B', 'B', 'O', 'B'],
         "confidence": [0, 1, 2, 3, 4],
         "text": "Another day at the office",
         "tokens": "Another day at the office".split()
     }, {
         "sequence_labels": ['O', 'B', 'B', 'O', 'O', 'B', 'O', 'O', 'B'],
         "confidence": [0, 1, 2, 3, 4, 5, 6, 7, 8],
         "text":
         "The laptop case was great and cover was rubbish",
         "tokens":
         "The laptop case was great and cover was rubbish".split()
     }]
     # Requires the softmax rather than the CRF version as we want the
     # confidence scores that are returned to be greater than
     # 1 / number of labels, whereas the CRF maximises the prediction for the
     # entire sentence and can therefore return a confidence that is less
     # than 1 / number of labels.
     model_dir = self.TARGET_EXTRACTION_SF_MODEL
     model = AllenNLPModel('TE', self.SOFTMAX_CONFIG_FILE, 'target-tagger',
                           model_dir)
     model.load()
     predictions = []
     for index, prediction in enumerate(
             model.predict_sequences(data, batch_size)):
         predictions.append(prediction)
         answer = answers[index]
         assert 4 == len(prediction)
         for key, value in answer.items():
             assert len(value) == len(prediction[key])
             if key != 'confidence':
                 assert value == prediction[key]
             else:
                 for confidence_score in prediction[key]:
                     assert 0.333333 < confidence_score
                     assert 1 > confidence_score
     # Without this check the loop above passes vacuously whenever the model
     # yields fewer predictions than inputs (including none at all).
     assert len(answers) == len(predictions)
Ejemplo n.º 4
0
    def test_load(self):
        """Check `load` for a missing save directory, a save directory with
        no saved model, and a valid saved model.

        Expected outcomes: `AssertionError` when the model was built without
        a save directory, `FileNotFoundError` when the save path does not
        exist, and otherwise a `Model` instance is returned and the `model`
        attribute becomes non-None.
        """
        # No save directory given at construction time.
        unsaved_model = AllenNLPModel('TE', self.CONFIG_FILE, 'target-tagger')
        with pytest.raises(AssertionError):
            unsaved_model.load()

        # A save path inside a fresh temporary directory: the path itself is
        # never created, so there is no saved model to find.
        with tempfile.TemporaryDirectory() as tempdir:
            missing_path = Path(tempdir, 'fake file')
            missing_model = AllenNLPModel('TE', self.CONFIG_FILE,
                                          'target-tagger', missing_path)
            with pytest.raises(FileNotFoundError):
                missing_model.load()

        # The success case: nothing is held before loading, and loading both
        # returns a Model and populates the attribute.
        model = AllenNLPModel('TE', self.CONFIG_FILE, 'target-tagger',
                              self.TARGET_EXTRACTION_MODEL)
        assert model.model is None

        loaded_model = model.load()
        assert isinstance(loaded_model, Model)
        assert model.model is not None
Ejemplo n.º 5
0
            dataset: TargetTextCollection
            target_sizes.append(dataset.number_targets())
        print(
            f'Lengths Train: {sizes[0]}, Validation: {sizes[1]}, Test: {sizes[2]}'
        )
        print(f'Number of targets, Train: {target_sizes[0]}, Validation: '
              f'{target_sizes[1]}, Test: {target_sizes[2]}')
        print('Fitting model')
        model.fit(train_data, val_data, test_data)
        print('Finished fitting model\nNow Evaluating model:')
    else:
        test_data.tokenize(spacy_tokenizer())
        device = -1
        if args.cuda:
            device = 0
        model.load(cuda_device=device)
        print('Finished loading model\nNow Evaluating model:')

    for data in test_data.values():
        data['tokens'] = data['tokenized_text']
    test_iter = iter(test_data.values())
    for test_pred in model.predict_sequences(test_data.values(),
                                             batch_size=args.batch_size):
        relevant_test = next(test_iter)
        relevant_test['predicted_sequence_labels'] = test_pred[
            'sequence_labels']
    test_scores = test_data.exact_match_score('predicted_sequence_labels')
    print(f'Test F1 scores: {test_scores[2]}')

    first = True
    data_fp = args.data_fp
Ejemplo n.º 6
0
     print(
         f'Lengths Train: {sizes[0]}, Validation: {sizes[1]}, Test: {sizes[2]}'
     )
     print('Fitting model')
     model.fit(train_data, val_data, test_data)
     print('Finished fitting model\nNow Evaluating model:')
     test_iter = iter(test_data.values())
     for test_pred in model.predict_sequences(test_data.values(),
                                              batch_size=256):
         relevant_test = next(test_iter)
         relevant_test['predicted_sequence_labels'] = test_pred[
             'sequence_labels']
     test_scores = test_data.exact_match_score('predicted_sequence_labels')
     print(f'Test F1 scores: {test_scores[2]}')
 else:
     model.load(cuda_device=0)
 first = True
 data_fp = args.data_fp
 from time import time
 t = time()
 if args.number_to_predict_on:
     data_count = 0
     with data_fp.open('r') as data_file:
         for line in data_file:
             data_count += 1
     if data_count <= args.number_to_predict_on:
         raise ValueError(
             f'Number of lines in the data file {data_count} '
             'to predict on is less than or equal to the number'
             f' of lines to sub-sample {args.number_to_predict_on}')
     lines_numbers_to_subsample = random.sample(range(data_count),
Ejemplo n.º 7
0
    def test_predict_iter(self, batch_size: Optional[int],
                          yield_original_target: bool):
        """Exercise `_predict_iter` on unloaded, invalid, normal, large and
        exhausted inputs.

        When `yield_original_target` is truthy each yielded item is expected
        to be a `(prediction, original_input)` tuple; otherwise it is the
        prediction dict itself.

        :param batch_size: Forwarded unchanged to `_predict_iter`.
        :param yield_original_target: Forwarded unchanged to `_predict_iter`;
                                      also selects how yielded items are
                                      unpacked before being checked.
        """
        long_sentence = "The laptop case was great and cover was rubbish"
        short_sentence = "Another day at the office"
        data = [{"text": long_sentence},
                {"text": short_sentence},
                {"text": long_sentence}]

        model = AllenNLPModel('TE', self.CONFIG_FILE, 'target-tagger',
                              self.TARGET_EXTRACTION_MODEL)

        def run_predict_iter(inputs):
            # Single place that applies the parametrised keyword arguments.
            return model._predict_iter(
                inputs,
                batch_size=batch_size,
                yield_original_target=yield_original_target)

        # The model has not been loaded yet, so predicting must fail.
        with pytest.raises(AssertionError):
            for _ in run_predict_iter(data):
                pass

        model.load()
        # An integer is neither a list nor an iterable.
        with pytest.raises(TypeError):
            for _ in run_predict_iter(5):
                pass

        # Normal cases: a list and a one-shot iterator over the same data.
        for data_type in [data, iter(data)]:
            predictions = list(run_predict_iter(data_type))
            assert 3 == len(predictions)
            first_prediction, second_prediction = predictions[:2]

            if yield_original_target:
                assert isinstance(first_prediction, tuple)
                # The second tuple element must mirror the input dictionary.
                for source_dict, (_, returned_dict) in zip(data, predictions):
                    assert len(source_dict) == len(returned_dict)
                    for key, value in source_dict.items():
                        assert value == returned_dict[key]
                first_prediction = first_prediction[0]
                second_prediction = second_prediction[0]
            assert isinstance(first_prediction, dict)
            assert 6 == len(second_prediction)
            assert 5 == len(second_prediction['tags'])
            assert 9 == len(second_prediction['class_probabilities'])

            correct_text_1 = "Another day at the office"
            assert correct_text_1.split() == second_prediction['words']
            assert correct_text_1 == second_prediction['text']

        # Larger dataset of 150 entries (the three examples repeated 50 times).
        larger_dataset = data * 50
        for data_type in [larger_dataset, iter(larger_dataset)]:
            predictions = list(run_predict_iter(data_type))
            assert 150 == len(predictions)
            first_prediction = predictions[0]
            last_prediction = predictions[-1]
            penultimate_prediction = predictions[-2]
            if yield_original_target:
                first_prediction = first_prediction[0]
                last_prediction = last_prediction[0]
                penultimate_prediction = penultimate_prediction[0]
            assert isinstance(first_prediction, dict)
            assert 5 == len(penultimate_prediction['tags'])
            assert 9 == len(penultimate_prediction['class_probabilities'])
            assert 9 == len(last_prediction['tags'])
            assert 9 == len(last_prediction['class_probabilities'])

        # No data at all, which can happen when an iterator has already been
        # consumed elsewhere. Draining it here guarantees it is empty.
        alt_data = iter(data)
        assert 3 == len(list(alt_data))
        predictions = list(run_predict_iter(alt_data))
        assert not predictions
Ejemplo n.º 8
0
# Prepare every dataset with one shared tokenizer and record its length.
# `datasets` is defined outside this excerpt; it must hold at least three
# entries because `sizes[0]`..`sizes[2]` are printed below.
tokenizer = spacy_tokenizer()
sizes = []
for dataset in datasets:
    dataset.tokenize(tokenizer)
    # presumably derives the gold sequence labels from the tokens — confirm
    dataset.sequence_labels()
    sizes.append(len(dataset))
print(f'Lengths {sizes[0]}, {sizes[1]}, {sizes[2]}')
# Where the model is saved to / loaded from, and the training configuration.
save_dir = Path('.', 'models', 'glove_model')
param_file = Path('.', 'training_configs', 'Target_Extraction',
                  'General_Domain', 'Glove_LSTM_CRF.jsonnet')
model = AllenNLPModel('Glove', param_file, 'target-tagger', save_dir)

# Fit only when the save directory does not exist yet; otherwise load the
# model from it.
if not save_dir.exists():
    model.fit(train_data, val_data, test_data)
else:
    model.load()
# NOTE(review): mid-script import; conventionally this belongs at the top of
# the file.
import time
# First timed pass: predict over the validation data and store each
# prediction's `sequence_labels` on the matching validation entry. The
# pairing relies on predictions being yielded in the same order as
# `val_data.values()`.
start_time = time.time()
val_iter = iter(val_data.values())
for val_predictions in model.predict_sequences(val_data.values()):
    relevant_val = next(val_iter)
    relevant_val['predicted_sequence_labels'] = val_predictions[
        'sequence_labels']
print(time.time() - start_time)
# Second timed pass: predictions are discarded, so this measures prediction
# time without the bookkeeping above.
another_time = time.time()
for val_predictions in model.predict_sequences(val_data.values()):
    pass
print(time.time() - another_time)
print('done')
# Element 2 of the returned scores is printed — presumably the F1 score;
# confirm against `exact_match_score`.
print(val_data.exact_match_score('predicted_sequence_labels')[2])
# Iterator over the test entries; not consumed within this excerpt.
test_iter = iter(test_data.values())