def test_target_extraction_fit(self, test_data: bool):
    """Fit a target-extraction model and verify its vocabulary, its
    ``model`` attribute, and that it saves to a chosen directory.

    :param test_data: Whether to also pass a test split to ``fit``. When
                      True the token ``'better'`` is expected in the
                      fitted vocabulary as well.
    """
    model = AllenNLPModel('TE', self.CONFIG_FILE, 'target-tagger')
    # Before fitting, no underlying AllenNLP model exists.
    assert model.model is None

    train_data = TargetTextCollection.load_json(
        self.TARGET_EXTRACTION_TRAIN_DATA)
    # NOTE(review): the validation split re-uses the TRAIN data fixture —
    # presumably deliberate for a tiny test fixture; confirm it should not
    # be a separate validation file.
    val_data = TargetTextCollection.load_json(
        self.TARGET_EXTRACTION_TRAIN_DATA)

    # Tokens the fitted vocabulary must contain (including AllenNLP's
    # padding and unknown sentinels).
    tokens_in_vocab = [
        'at', 'case', 'was', 'the', 'day', 'great', 'cover', 'office',
        'another', 'and', 'rubbish', 'laptop',
        '@@PADDING@@', '@@UNKNOWN@@'
    ]
    if test_data:
        tokens_in_vocab = tokens_in_vocab + ['better']
        # Use a distinct local instead of shadowing the boolean
        # `test_data` parameter with a TargetTextCollection.
        test_collection = TargetTextCollection.load_json(
            self.TARGET_EXTRACTION_TEST_DATA)
        model.fit(train_data, val_data, test_collection)
    else:
        model.fit(train_data, val_data)

    # The fitted vocabulary should contain exactly the expected tokens.
    token_index = model.model.vocab.get_token_to_index_vocabulary('tokens')
    assert len(token_index) == len(tokens_in_vocab)
    for token in tokens_in_vocab:
        assert token in token_index

    # Check attributes have changed: fitting populates `model.model`.
    assert model.model is not None
    assert isinstance(model.model, Model)

    # Check that it will save to a directory of our choosing.
    with tempfile.TemporaryDirectory() as save_dir:
        saved_model_fp = Path(save_dir, 'model.tar.gz')
        assert not saved_model_fp.exists()
        model = AllenNLPModel('TE', self.CONFIG_FILE, 'target-tagger',
                              save_dir=save_dir)
        model.fit(train_data, val_data)
        assert saved_model_fp.exists()
for error in returned_errors: error_id = error['text_id'] del dataset[error_id] returned_errors = dataset.sequence_labels(return_errors=True) if returned_errors: raise ValueError('Sequence label errors are still persisting') sizes.append(len(dataset)) dataset: TargetTextCollection target_sizes.append(dataset.number_targets()) print( f'Lengths Train: {sizes[0]}, Validation: {sizes[1]}, Test: {sizes[2]}' ) print(f'Number of targets, Train: {target_sizes[0]}, Validation: ' f'{target_sizes[1]}, Test: {target_sizes[2]}') print('Fitting model') model.fit(train_data, val_data, test_data) print('Finished fitting model\nNow Evaluating model:') else: test_data.tokenize(spacy_tokenizer()) device = -1 if args.cuda: device = 0 model.load(cuda_device=device) print('Finished loading model\nNow Evaluating model:') for data in test_data.values(): data['tokens'] = data['tokenized_text'] test_iter = iter(test_data.values()) for test_pred in model.predict_sequences(test_data.values(), batch_size=args.batch_size): relevant_test = next(test_iter)