def test_predict_into_collection(self, batch_size: Optional[int],
                                 append_if_exists: bool):
    # Create and load the model
    model_dir = self.TARGET_EXTRACTION_MODEL
    model = AllenNLPModel('TE', self.CONFIG_FILE, 'target-tagger',
                          model_dir)
    model.load()
    # Test the normal case
    train_data = TargetTextCollection.load_json(
        self.TARGET_EXTRACTION_TRAIN_DATA)
    key_mappings = {'tags': 'predicted_tags', 'words': 'predicted_tokens'}
    train_data = model.predict_into_collection(train_data, key_mappings,
                                               batch_size, append_if_exists)
    for target_data in train_data.values():
        assert 'predicted_tags' in target_data
        assert 'tags' not in target_data
        assert 'predicted_tokens' in target_data
        assert 'tokens' not in target_data
        target_tokens = target_data['tokenized_text']
        assert len(target_tokens) == len(target_data['predicted_tags'][0])
        assert len(target_tokens) == len(target_data['predicted_tokens'][0])
        assert target_tokens == target_data['predicted_tokens'][0]
    # Predicting a second time should be fine when append_if_exists is True
    # and raise a KeyError otherwise.
    if append_if_exists:
        train_data = model.predict_into_collection(train_data, key_mappings,
                                                   batch_size,
                                                   append_if_exists)
        for target_data in train_data.values():
            target_tokens = target_data['tokenized_text']
            assert 2 == len(target_data['predicted_tags'])
            assert (target_data['predicted_tags'][0] ==
                    target_data['predicted_tags'][1])
            assert target_tokens == target_data['predicted_tokens'][0]
            assert target_tokens == target_data['predicted_tokens'][1]
    else:
        with pytest.raises(KeyError):
            train_data = model.predict_into_collection(train_data,
                                                       key_mappings,
                                                       batch_size,
                                                       append_if_exists)
    # Raise a KeyError when a key in `key_mappings` does not exist in the
    # prediction output (note the deliberately misspelled 'wordss')
    from collections import OrderedDict
    key_mappings = OrderedDict([('tags', 'predicted_tags'),
                                ('wordss', 'predicted_tokens')])
    train_data = TargetTextCollection.load_json(
        self.TARGET_EXTRACTION_TRAIN_DATA)
    with pytest.raises(KeyError):
        train_data = model.predict_into_collection(train_data, key_mappings,
                                                   batch_size,
                                                   append_if_exists)
    for target_data in train_data.values():
        assert 'predicted_tags' not in target_data
        assert 'predicted_tokens' not in target_data
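# A minimal sketch of the parametrization assumed to supply `batch_size` and
# `append_if_exists` to the test above; the values below are illustrative
# assumptions, not necessarily the repository's actual settings:
#
#     @pytest.mark.parametrize("append_if_exists", [True, False])
#     @pytest.mark.parametrize("batch_size", [None, 2])
#     def test_predict_into_collection(self, batch_size, append_if_exists):
#         ...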
def test_predict_iter(self):
    data = [{"text": "The laptop case was great and cover was rubbish"},
            {"text": "Another day at the office"},
            {"text": "The laptop case was great and cover was rubbish"}]
    # Test that it raises an Error when the model attribute is None i.e.
    # the model has not been loaded
    model_dir = self.TARGET_EXTRACTION_MODEL
    model = AllenNLPModel('TE', self.CONFIG_FILE, 'target-tagger',
                          model_dir)
    with pytest.raises(AssertionError):
        for _ in model._predict_iter(data):
            pass
    # Test that it raises an Error when the data provided is not a list or
    # iterable
    model.load()
    non_iter_data = 5
    with pytest.raises(TypeError):
        for _ in model._predict_iter(non_iter_data):
            pass
    # Test that it works on the normal cases which are lists and iterables
    for data_type in [data, iter(data)]:
        predictions = []
        for prediction in model._predict_iter(data_type):
            predictions.append(prediction)
        assert 3 == len(predictions)
        assert isinstance(predictions[0], dict)
        assert 5 == len(predictions[1]['tags'])
        # `class_probabilities` appear to be padded to the longest sentence
        # in the batch (9 tokens), hence the length difference to `tags`
        assert 9 == len(predictions[1]['class_probabilities'])
    # Test that it works on a larger dataset of 150 samples
    larger_dataset = data * 50
    for data_type in [larger_dataset, iter(larger_dataset)]:
        predictions = []
        for prediction in model._predict_iter(data_type):
            predictions.append(prediction)
        assert 150 == len(predictions)
        assert isinstance(predictions[0], dict)
        assert 5 == len(predictions[-2]['tags'])
        assert 9 == len(predictions[-2]['class_probabilities'])
        assert 9 == len(predictions[-1]['tags'])
        assert 9 == len(predictions[-1]['class_probabilities'])
    # Test the case when you feed it no data, which can happen with an
    # exhausted iterator e.g.
    alt_data = iter(data)
    # exhaust the iterator so that it yields no more data
    assert 3 == len([d for d in alt_data])
    predictions = []
    for prediction in model._predict_iter(alt_data):
        predictions.append(prediction)
    assert not predictions
def test_predict_sequences(self, batch_size: Optional[int]):
    data = [{"text": "The laptop case was great and cover was rubbish"},
            {"text": "Another day at the office"},
            {"text": "The laptop case was great and cover was rubbish"}]
    # The confidence lists below are placeholders: only their length and
    # the range of the predicted scores are checked.
    answers = [{"sequence_labels": ['O', 'B', 'B', 'O', 'O', 'B', 'O',
                                    'O', 'B'],
                "confidence": [0, 1, 2, 3, 4, 5, 6, 7, 8],
                "text": "The laptop case was great and cover was rubbish",
                "tokens": "The laptop case was great and cover was rubbish".split()},
               {"sequence_labels": ['O', 'B', 'B', 'O', 'B'],
                "confidence": [0, 1, 2, 3, 4],
                "text": "Another day at the office",
                "tokens": "Another day at the office".split()},
               {"sequence_labels": ['O', 'B', 'B', 'O', 'O', 'B', 'O',
                                    'O', 'B'],
                "confidence": [0, 1, 2, 3, 4, 5, 6, 7, 8],
                "text": "The laptop case was great and cover was rubbish",
                "tokens": "The laptop case was great and cover was rubbish".split()}]
    # Requires the softmax rather than the CRF version as we want the
    # returned confidence scores to be greater than 1 / number of labels,
    # whereas the CRF maximises the prediction over the entire sentence,
    # so the confidence it returns can be less than 1 / number of labels.
    model_dir = self.TARGET_EXTRACTION_SF_MODEL
    model = AllenNLPModel('TE', self.SOFTMAX_CONFIG_FILE, 'target-tagger',
                          model_dir)
    model.load()
    predictions = []
    for index, prediction in enumerate(model.predict_sequences(data,
                                                               batch_size)):
        predictions.append(prediction)
        answer = answers[index]
        assert 4 == len(prediction)
        for key, value in answer.items():
            assert len(value) == len(prediction[key])
            if key != 'confidence':
                assert value == prediction[key]
            else:
                for confidence_score in prediction[key]:
                    assert 0.333333 < confidence_score
                    assert 1 > confidence_score
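# A worked illustration of the confidence bound asserted above, assuming a
# three-label tagging scheme (e.g. BIO): a softmax over N labels assigns its
# most probable label at least 1/N probability, so each per-token confidence
# should fall in (1/3, 1). The numbers below are hypothetical:
#
#     probs = [0.5, 0.3, 0.2]  # a per-token softmax over 3 labels
#     assert max(probs) > 1 / len(probs) and max(probs) < 1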
def test_load(self):
    model = AllenNLPModel('TE', self.CONFIG_FILE, 'target-tagger')
    # Test the simple case where an AssertionError is raised when there is
    # no save directory
    with pytest.raises(AssertionError):
        model.load()
    # Test the case where the save directory attribute exists but does not
    # contain a saved model
    with tempfile.TemporaryDirectory() as tempdir:
        fake_file = Path(tempdir, 'fake file')
        model = AllenNLPModel('TE', self.CONFIG_FILE, 'target-tagger',
                              fake_file)
        with pytest.raises(FileNotFoundError):
            model.load()
    # The success case
    model_dir = self.TARGET_EXTRACTION_MODEL
    model = AllenNLPModel('TE', self.CONFIG_FILE, 'target-tagger',
                          model_dir)
    assert model.model is None
    same_model = model.load()
    assert isinstance(same_model, Model)
    assert model.model is not None
        dataset: TargetTextCollection
        target_sizes.append(dataset.number_targets())
    print(f'Lengths Train: {sizes[0]}, Validation: {sizes[1]}, '
          f'Test: {sizes[2]}')
    print(f'Number of targets, Train: {target_sizes[0]}, Validation: '
          f'{target_sizes[1]}, Test: {target_sizes[2]}')
    print('Fitting model')
    model.fit(train_data, val_data, test_data)
    print('Finished fitting model\nNow Evaluating model:')
else:
    test_data.tokenize(spacy_tokenizer())
    device = -1
    if args.cuda:
        device = 0
    model.load(cuda_device=device)
    print('Finished loading model\nNow Evaluating model:')

for data in test_data.values():
    data['tokens'] = data['tokenized_text']
test_iter = iter(test_data.values())
for test_pred in model.predict_sequences(test_data.values(),
                                         batch_size=args.batch_size):
    relevant_test = next(test_iter)
    relevant_test['predicted_sequence_labels'] = test_pred['sequence_labels']
test_scores = test_data.exact_match_score('predicted_sequence_labels')
print(f'Test F1 scores: {test_scores[2]}')

first = True
data_fp = args.data_fp
    print(f'Lengths Train: {sizes[0]}, Validation: {sizes[1]}, '
          f'Test: {sizes[2]}')
    print('Fitting model')
    model.fit(train_data, val_data, test_data)
    print('Finished fitting model\nNow Evaluating model:')
    test_iter = iter(test_data.values())
    for test_pred in model.predict_sequences(test_data.values(),
                                             batch_size=256):
        relevant_test = next(test_iter)
        relevant_test['predicted_sequence_labels'] = \
            test_pred['sequence_labels']
    test_scores = test_data.exact_match_score('predicted_sequence_labels')
    print(f'Test F1 scores: {test_scores[2]}')
else:
    model.load(cuda_device=0)

first = True
data_fp = args.data_fp
from time import time
t = time()
if args.number_to_predict_on:
    # Count the number of lines (samples) in the data file
    data_count = 0
    with data_fp.open('r') as data_file:
        for line in data_file:
            data_count += 1
    if data_count <= args.number_to_predict_on:
        raise ValueError(f'Number of lines in the data file, {data_count}, '
                         'is less than or equal to the number of lines to '
                         f'sub-sample, {args.number_to_predict_on}')
    lines_numbers_to_subsample = random.sample(range(data_count),
                                               args.number_to_predict_on)
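# The `args` namespace used in the script above is assumed to come from an
# argparse setup along these lines; the argument names are taken from their
# uses in the script, while the types and defaults are illustrative guesses:
#
#     import argparse
#     from pathlib import Path
#
#     parser = argparse.ArgumentParser()
#     parser.add_argument('data_fp', type=Path)
#     parser.add_argument('--batch_size', type=int, default=256)
#     parser.add_argument('--cuda', action='store_true')
#     parser.add_argument('--number_to_predict_on', type=int, default=None)
#     args = parser.parse_args()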
def test_predict_iter(self, batch_size: Optional[int],
                      yield_original_target: bool):
    data = [{"text": "The laptop case was great and cover was rubbish"},
            {"text": "Another day at the office"},
            {"text": "The laptop case was great and cover was rubbish"}]
    # Test that it raises an Error when the model attribute is None i.e.
    # the model has not been loaded
    model_dir = self.TARGET_EXTRACTION_MODEL
    model = AllenNLPModel('TE', self.CONFIG_FILE, 'target-tagger',
                          model_dir)
    with pytest.raises(AssertionError):
        for _ in model._predict_iter(
                data, batch_size=batch_size,
                yield_original_target=yield_original_target):
            pass
    # Test that it raises an Error when the data provided is not a list or
    # iterable
    model.load()
    non_iter_data = 5
    with pytest.raises(TypeError):
        for _ in model._predict_iter(
                non_iter_data, batch_size=batch_size,
                yield_original_target=yield_original_target):
            pass
    # Test that it works on the normal cases which are lists and iterables
    for data_type in [data, iter(data)]:
        predictions = []
        for prediction in model._predict_iter(
                data_type, batch_size=batch_size,
                yield_original_target=yield_original_target):
            predictions.append(prediction)
        assert 3 == len(predictions)
        predictions_0 = predictions[0]
        predictions_1 = predictions[1]
        if yield_original_target:
            # Each item is a (prediction, original data) tuple and the
            # original data should be returned unchanged
            assert isinstance(predictions_0, tuple)
            for pred_index, prediction_pair in enumerate(predictions):
                _, original_data_dict = prediction_pair
                assert len(data[pred_index]) == len(original_data_dict)
                for key, value in data[pred_index].items():
                    assert value == original_data_dict[key]
            predictions_0 = predictions_0[0]
            predictions_1 = predictions_1[0]
        assert isinstance(predictions_0, dict)
        assert 6 == len(predictions_1)
        assert 5 == len(predictions_1['tags'])
        # Note: `class_probabilities` is longer than `tags` (9 vs 5),
        # presumably due to padding to the longest sentence in the batch
        assert 9 == len(predictions_1['class_probabilities'])
        correct_text_1 = "Another day at the office"
        correct_tokens_1 = correct_text_1.split()
        assert correct_tokens_1 == predictions_1['words']
        assert correct_text_1 == predictions_1['text']
    # Test that it works on a larger dataset of 150 samples
    larger_dataset = data * 50
    for data_type in [larger_dataset, iter(larger_dataset)]:
        predictions = []
        for prediction in model._predict_iter(
                data_type, batch_size=batch_size,
                yield_original_target=yield_original_target):
            predictions.append(prediction)
        assert 150 == len(predictions)
        predictions_0 = predictions[0]
        predictions_1 = predictions[-1]
        predictions_2 = predictions[-2]
        if yield_original_target:
            predictions_0 = predictions_0[0]
            predictions_1 = predictions_1[0]
            predictions_2 = predictions_2[0]
        assert isinstance(predictions_0, dict)
        assert 5 == len(predictions_2['tags'])
        assert 9 == len(predictions_2['class_probabilities'])
        assert 9 == len(predictions_1['tags'])
        assert 9 == len(predictions_1['class_probabilities'])
    # Test the case when you feed it no data, which can happen with an
    # exhausted iterator e.g.
    alt_data = iter(data)
    # exhaust the iterator so that it yields no more data
    assert 3 == len([d for d in alt_data])
    predictions = []
    for prediction in model._predict_iter(
            alt_data, batch_size=batch_size,
            yield_original_target=yield_original_target):
        predictions.append(prediction)
    assert not predictions
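# A minimal usage sketch, assuming a loaded `model` as in the test above, of
# how the (prediction, original) tuples unpack when
# `yield_original_target=True`; the batch size here is an arbitrary example:
#
#     for prediction, original in model._predict_iter(
#             data, batch_size=2, yield_original_target=True):
#         print(original['text'], prediction['tags'])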
tokenizer = spacy_tokenizer()
sizes = []
for dataset in datasets:
    dataset.tokenize(tokenizer)
    dataset.sequence_labels()
    sizes.append(len(dataset))
print(f'Lengths {sizes[0]}, {sizes[1]}, {sizes[2]}')

save_dir = Path('.', 'models', 'glove_model')
param_file = Path('.', 'training_configs', 'Target_Extraction',
                  'General_Domain', 'Glove_LSTM_CRF.jsonnet')
model = AllenNLPModel('Glove', param_file, 'target-tagger', save_dir)
if not save_dir.exists():
    model.fit(train_data, val_data, test_data)
else:
    model.load()

import time
start_time = time.time()
val_iter = iter(val_data.values())
for val_predictions in model.predict_sequences(val_data.values()):
    relevant_val = next(val_iter)
    relevant_val['predicted_sequence_labels'] = \
        val_predictions['sequence_labels']
print(time.time() - start_time)

another_time = time.time()
for val_predictions in model.predict_sequences(val_data.values()):
    pass
print(time.time() - another_time)
print('done')
print(val_data.exact_match_score('predicted_sequence_labels')[2])
test_iter = iter(test_data.values())