def test_spacy_tokenizer(self, lang: str):
    '''
    Checks the tokens produced by ``spacy_tokenizer`` for the fixture
    sentences defined on this test class.

    :param lang: Language code handed to ``spacy_tokenizer``. For ``nn`` the
                 test only expects construction to raise a ValueError.
    '''
    if lang == 'nn':
        # Nothing to tokenize: creating the tokenizer itself must fail.
        with pytest.raises(ValueError):
            spacy_tokenizer(lang=lang)
        return

    tokenize = spacy_tokenizer(lang=lang)

    # The English expectation has the possessive split into two tokens,
    # every other language keeps "other's" as a single token.
    possessive = ["other", "'s"] if lang == 'en' else ["other's"]
    assert tokenize(self._emoji_sentence()) == [
        'Hello', 'how', 'are', 'you', ',', 'with', *possessive, ':)'
    ]

    assert tokenize(self._no_sentence()) == []

    assert tokenize(self._whitespace_sentence()) == [
        'another', 'day', 'is', 'today'
    ]
    assert tokenize(self._serveral_whitespace()) == [
        'another', 'day', 'is', 'today'
    ]

    assert tokenize(self._comma_sentence()) == [
        'today', 'Is', 'a', 'great', ',', 'day', 'I', 'think'
    ]
def test_sequence_labels(self):
    '''
    Checks the labels stored under ``sequence_labels`` after a collection
    has been tokenized with the spaCy tokenizer and labelled.
    '''
    def labels_for(examples, text_id):
        # Build, tokenize and label a collection, then read back the
        # labels stored for a single sample.
        collection = TargetTextCollection(examples)
        collection.tokenize(spacy_tokenizer())
        collection.sequence_labels()
        return collection[text_id]['sequence_labels']

    # Collection holding a single TargetText
    single_labels = labels_for([self._target_text_example()], '2')
    assert single_labels == ['O', 'B', 'I', 'O', 'O', 'O', 'B', 'O', 'O']

    # Collection holding several TargetText instances
    multiple_labels = labels_for(self._target_text_examples(), 'another_id')
    assert multiple_labels == ['O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O']
def dataset_target_sentiment_statistics(collections: List[TargetTextCollection],
                                        lower_target: bool = True,
                                        target_key: str = 'targets',
                                        tokeniser: Callable[[str], List[str]]=spacy_tokenizer(),
                                        sentiment_key: str = 'target_sentiments',
                                        dataframe_format: bool = False,
                                        incl_sentence_statistics: bool = True
                                        ) -> Union[List[Dict[str, Union[str,int,float]]],
                                                   pd.DataFrame]:
    '''
    Extends the statistics from
    :py:func:`dataset_target_extraction_statistics` with per-sentiment
    counts and percentages.

    :param collections: A list of collections
    :param lower_target: Whether to lower case the targets before counting
                         them
    :param target_key: The key within each sample in each collection that
                       contains the list of targets to be analysed.
    :param tokeniser: The tokeniser forwarded to
                      :py:func:`dataset_target_extraction_statistics`.
    :param sentiment_key: The key in each TargetText within each collection
                          that is passed to ``get_sentiment_counts``.
    :param dataframe_format: If True the statistics are converted with
                             ``_statistics_to_dataframe`` before being
                             returned, instead of returning the list of
                             dictionaries.
    :param incl_sentence_statistics: Forwarded to
                                     :py:func:`dataset_target_extraction_statistics`.
    :returns: One dictionary of statistics per collection. Each dictionary
              is the one produced by
              :py:func:`dataset_target_extraction_statistics` with these
              keys added, each holding a string of the form
              ``count (percentage)`` where the percentage is rounded to two
              decimal places:

              1. POS (%) -- positive targets
              2. NEU (%) -- neutral targets
              3. NEG (%) -- negative targets
    '''
    extraction_stats = dataset_target_extraction_statistics(
        collections, lower_target=lower_target, target_key=target_key,
        tokeniser=tokeniser, dataframe_format=False,
        incl_sentence_statistics=incl_sentence_statistics)

    # Sentiment name -> statistics column, in the order the columns are added.
    sentiment_columns = (('positive', 'POS (%)'),
                         ('neutral', 'NEU (%)'),
                         ('negative', 'NEG (%)'))

    dataset_stats = []
    for collection, collection_stats in zip(collections, extraction_stats):
        fractions = get_sentiment_counts(collection, normalised=True,
                                         sentiment_key=sentiment_key)
        percentages = {name: round((fraction * 100), 2)
                       for name, fraction in fractions.items()}
        counts = get_sentiment_counts(collection, normalised=False,
                                      sentiment_key=sentiment_key)
        for sentiment_name, column in sentiment_columns:
            collection_stats[column] = (f'{counts[sentiment_name]} '
                                        f'({percentages[sentiment_name]})')
        dataset_stats.append(collection_stats)

    if not dataframe_format:
        return dataset_stats
    return _statistics_to_dataframe(dataset_stats)
def test_tokenize(self):
    '''
    Checks that ``TargetTextCollection.tokenize`` stores the expected tokens
    under the ``tokenized_text`` key of the sample with id ``2``.

    The stored tokens are compared with ``assert``; assigning the expected
    value to the key would overwrite the result and verify nothing.
    '''
    tokenized_answer = [
        'The', 'laptop', 'case', 'was', 'great', 'and', 'cover', 'was',
        'rubbish'
    ]

    # Test the normal case with one TargetText instance in the collection
    test_collection = TargetTextCollection([self._target_text_example()])
    test_collection.tokenize(str.split)
    assert test_collection['2']['tokenized_text'] == tokenized_answer

    # Test the normal case with multiple TargetText instances in the
    # collection
    test_collection = TargetTextCollection(self._target_text_examples())
    test_collection.tokenize(spacy_tokenizer())
    assert test_collection['2']['tokenized_text'] == tokenized_answer
def multi_word_targets(targets: List[str], lower: bool = True,
                       string_delimiter: str = '_') -> Dict[str, str]:
    '''
    Maps each target to a single-token version of itself, created by
    tokenising the target with the spaCy tokeniser and joining the tokens
    with `string_delimiter`.

    :param targets: A list of targets. Each target is tokenised before any
                    lower casing is applied. When the joined (and, if
                    `lower`, lower cased) tokens of a target equal those of
                    a target seen earlier in the list, the later target is
                    skipped.
    :param lower: Whether to lower case both the target and its joined
                  version.
    :param string_delimiter: The string used to join the tokens of a target.
    :returns: A dictionary from target to its joined version, e.g.
              {`tesco supermarket`: `tesco_supermarket`}
    '''
    tokenizer = spacy_tokenizer()
    target_mapper = {}
    unique_targets = set()
    seen_joined_targets = set()

    for target in targets:
        joined_target = string_delimiter.join(tokenizer(target))
        if lower:
            joined_target = joined_target.lower()

        # Targets that only differ until they are tokenised are skipped.
        if joined_target in seen_joined_targets:
            continue
        seen_joined_targets.add(joined_target)

        key_target = target.lower() if lower else target
        unique_targets.add(key_target)
        target_mapper[key_target] = joined_target

    assert len(unique_targets) == len(target_mapper), (
        'The length of the multi word targets is not the same '
        'as the non-multi-word targets')
    return target_mapper
# Directory handed to both election Twitter dataset loaders below.
temp_election_directory = Path('.', 'data', 'twitter_election_dataset')
train_data = wang_2017_election_twitter_train(temp_election_directory)
test_data = wang_2017_election_twitter_test(temp_election_directory)
# NOTE(review): the collapsed source does not show where this branch ends;
# the statements below are nested here because each depends on the previous
# one -- confirm against the original script.
if not args.model_save_dir.is_dir():
    # Use the same size validation as the test data
    test_size = len(test_data)
    # Create the train and validation splits
    train_data = list(train_data.values())
    train_data, val_data = train_test_split(train_data, test_size=test_size)
    train_data = TargetTextCollection(train_data)
    val_data = TargetTextCollection(val_data)
    # Tokenize the data
    datasets = [train_data, val_data, test_data]
    tokenizer = spacy_tokenizer()
    sizes = []
    target_sizes = []
    for dataset in datasets:
        dataset.tokenize(tokenizer)
        returned_errors = dataset.sequence_labels(return_errors=True)
        if returned_errors:
            # Drop every sample that was reported as a labelling error.
            for error in returned_errors:
                error_id = error['text_id']
                del dataset[error_id]
        # Label again; any error reported at this point is fatal.
        returned_errors = dataset.sequence_labels(return_errors=True)
        if returned_errors:
            raise ValueError('Sequence label errors are still persisting')
        sizes.append(len(dataset))
        # Bare annotation: no value is assigned, it only declares the type
        # of the loop variable.
        dataset: TargetTextCollection
class TestTokenizers:
    '''
    Tests for the tokenizer factories ``whitespace``, ``spacy_tokenizer``,
    ``ark_twokenize`` and ``stanford`` together with the helpers
    ``is_character_preserving`` and ``token_index_alignment``.
    '''

    # Fixture sentences shared by the tests below.
    def _emoji_sentence(self) -> str:
        return "Hello how are you, with other's :)"

    def _no_sentence(self) -> str:
        return ''

    def _whitespace_sentence(self) -> str:
        return 'another day is today'

    def _serveral_whitespace(self) -> str:
        return ' another day is today '

    def _comma_sentence(self) -> str:
        return 'today Is a great, day I think'

    def _difficult_tokenizer_sentence(self) -> str:
        return "But guess what? (you have to buy an external dvd drive."

    def not_char_preserving_tokenizer(self, text: str) -> List[str]:
        '''
        Whitespace tokenizer that replaces the token ``other's`` with
        ``other``, so the returned tokens no longer contain every character
        of `text`.
        '''
        return ['other' if token == "other's" else token
                for token in text.split()]

    # The (str.split, False) parameter is only a marker: when the flag is
    # False the test swaps in not_char_preserving_tokenizer to produce the
    # tokens that are checked.
    @pytest.mark.parametrize("tokenizer_pass", ((whitespace(), True),
                                                (spacy_tokenizer(), True),
                                                (ark_twokenize(), True),
                                                (stanford(), True),
                                                (str.split, False)))
    def test_is_character_preserving(self,
                                     tokenizer_pass: Tuple[Callable[[str], List[str]], bool]):
        tokenizer, expect_preserving = tokenizer_pass
        emoji_sentence = self._emoji_sentence()
        emoji_tokens = tokenizer(emoji_sentence)

        if not expect_preserving:
            emoji_tokens = self.not_char_preserving_tokenizer(emoji_sentence)
            assert is_character_preserving(
                emoji_sentence, emoji_tokens) == expect_preserving
            return

        assert is_character_preserving(
            emoji_sentence, emoji_tokens) == expect_preserving
        difficult_sentence = self._difficult_tokenizer_sentence()
        difficult_tokens = tokenizer(difficult_sentence)
        assert is_character_preserving(
            difficult_sentence, difficult_tokens) == True

    def test_whitespace(self):
        tokenize = whitespace()

        assert tokenize(self._emoji_sentence()) == [
            'Hello', 'how', 'are', 'you,', 'with', "other's", ':)'
        ]
        assert tokenize(self._no_sentence()) == []
        assert tokenize(self._whitespace_sentence()) == [
            'another', 'day', 'is', 'today'
        ]
        assert tokenize(self._comma_sentence()) == [
            'today', 'Is', 'a', 'great,', 'day', 'I', 'think'
        ]
        assert tokenize(self._serveral_whitespace()) == [
            'another', 'day', 'is', 'today'
        ]

    def test_ark_twokenizer(self):
        tokenize = ark_twokenize()

        assert tokenize(self._emoji_sentence()) == [
            'Hello', 'how', 'are', 'you', ',', 'with', "other's", ':)'
        ]
        assert tokenize(self._no_sentence()) == []
        assert tokenize(self._whitespace_sentence()) == [
            'another', 'day', 'is', 'today'
        ]
        assert tokenize(self._comma_sentence()) == [
            'today', 'Is', 'a', 'great', ',', 'day', 'I', 'think'
        ]
        assert tokenize(self._serveral_whitespace()) == [
            'another', 'day', 'is', 'today'
        ]

    @pytest.mark.parametrize("lang", ('en', 'de', 'nn'))
    def test_spacy_tokenizer(self, lang: str):
        if lang == 'nn':
            # Creating the tokenizer itself is expected to fail.
            with pytest.raises(ValueError):
                spacy_tokenizer(lang=lang)
            return

        tokenize = spacy_tokenizer(lang=lang)

        # The English expectation has the possessive split into two tokens.
        possessive = ["other", "'s"] if lang == 'en' else ["other's"]
        assert tokenize(self._emoji_sentence()) == [
            'Hello', 'how', 'are', 'you', ',', 'with', *possessive, ':)'
        ]
        assert tokenize(self._no_sentence()) == []
        assert tokenize(self._whitespace_sentence()) == [
            'another', 'day', 'is', 'today'
        ]
        assert tokenize(self._serveral_whitespace()) == [
            'another', 'day', 'is', 'today'
        ]
        assert tokenize(self._comma_sentence()) == [
            'today', 'Is', 'a', 'great', ',', 'day', 'I', 'think'
        ]

    @pytest.mark.parametrize("lang", ('en', 'de'))
    @pytest.mark.parametrize("treebank", (None, 'ewt', 'gum'))
    def test_stanford_tokenizer(self, lang: str, treebank: str):
        '''
        This does not currently test whether the treebanks perform as they
        should, i.e. it does not check that the English EWT treebank
        tokeniser differs from the English GUM tokeniser.
        '''
        # German combined with an explicit treebank is not exercised.
        if lang == 'de' and treebank is not None:
            return

        tokenize = stanford(lang=lang, treebank=treebank)

        # The German expectation has the emoticon split into ':' and ')'.
        emoticon = [':', ')'] if lang == 'de' else [':)']
        expected_emoji = [
            'Hello', 'how', 'are', 'you', ',', 'with', "other", "'s",
            *emoticon
        ]
        assert tokenize(self._emoji_sentence()) == expected_emoji
        assert tokenize(self._no_sentence()) == []
        assert tokenize(self._whitespace_sentence()) == [
            'another', 'day', 'is', 'today'
        ]
        assert tokenize(self._serveral_whitespace()) == [
            'another', 'day', 'is', 'today'
        ]
        assert tokenize(self._comma_sentence()) == [
            'today', 'Is', 'a', 'great', ',', 'day', 'I', 'think'
        ]

    @pytest.mark.parametrize(
        "tokenizer",
        (whitespace(), spacy_tokenizer(), stanford(), ark_twokenize()))
    def test_token_index_alignment(self,
                                   tokenizer: Callable[[str], List[str]]):
        # A sentence where whitespace is the only factor
        text = self._whitespace_sentence()
        expected_spans = [(0, 7), (8, 11), (12, 14), (15, 20)]
        assert expected_spans == token_index_alignment(text, tokenizer(text))

        # A sentence with a comma, which is split off by every tokenizer
        # apart from the whitespace one
        text = self._comma_sentence()
        if tokenizer != whitespace():
            expected_spans = [(0, 5), (6, 8), (9, 10), (11, 16), (16, 17),
                              (18, 21), (22, 23), (24, 29)]
        else:
            expected_spans = [(0, 5), (6, 8), (9, 10), (11, 17), (18, 21),
                              (22, 23), (24, 29)]
        assert expected_spans == token_index_alignment(text, tokenizer(text))

        # A sentence with multiple spaces at the start, at the end and in
        # between tokens
        text = ' I had, great day '
        if tokenizer != whitespace():
            expected_spans = [(2, 3), (4, 7), (7, 8), (11, 16), (17, 20)]
        else:
            expected_spans = [(2, 3), (4, 8), (11, 16), (17, 20)]
        assert expected_spans == token_index_alignment(text, tokenizer(text))

        # A sentence that has multiple spaces, commas, apostrophes etc.
        text = " I had, isn't great day doesn't'"
        splits_contractions = (tokenizer != whitespace()
                               and tokenizer != ark_twokenize())
        if splits_contractions:
            expected_spans = [(2, 3), (4, 7), (7, 8), (10, 12), (12, 15),
                              (17, 22), (23, 26), (28, 32), (32, 35),
                              (35, 36)]
        elif tokenizer == ark_twokenize():
            expected_spans = [(2, 3), (4, 7), (7, 8), (10, 15), (17, 22),
                              (23, 26), (28, 35), (35, 36)]
        else:
            expected_spans = [(2, 3), (4, 8), (10, 15), (17, 22), (23, 26),
                              (28, 36)]
        assert expected_spans == token_index_alignment(text, tokenizer(text))
# Root directory that receives the converted CoNLL files.
sentiment_data_dir = Path('.', 'data', 'main_task', 'en')
laptop_data_dir = sentiment_data_dir / 'laptop'
restaurant_data_dir = sentiment_data_dir / 'restaurant'
common_file_names = ['train.conll', 'dev.conll', 'test.conll']

# Laptop and restaurant data: fetch each URL and convert it from BIOSE to
# BIOUL, pairing the URLs with the file names by position.
data_dir_urls = [(restaurant_data_dir, restaurant_urls),
                 (laptop_data_dir, laptop_urls)]
for data_dir, urls in data_dir_urls:
    for url, file_name in zip(urls, common_file_names):
        downloaded_fp = cached_path(url)
        new_fp = data_dir / file_name
        new_fp.parent.mkdir(parents=True, exist_ok=True)
        utils.from_biose_to_bioul(Path(downloaded_fp), new_fp)

# MAMS data: label each split, write it as BIO CoNLL to a temporary file and
# convert that file to BIOUL.
mams_data_dir = sentiment_data_dir / 'MAMS'
mams_data_dir.mkdir(parents=True, exist_ok=True)
split_names = ['train', 'val', 'test']
for split_name, file_name in zip(split_names, common_file_names):
    # Only the train split is requested with original=False.
    loader_kwargs = {'original': False} if split_name == 'train' else {}
    collection = multi_aspect_multi_sentiment_atsa(split_name,
                                                   **loader_kwargs)
    collection.tokenize(spacy_tokenizer())
    collection.sequence_labels(label_key='target_sentiments')
    conll_fp = mams_data_dir / file_name
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_fp = Path(temp_dir) / 'temp_file.conll'
        collection.to_conll_file(temp_fp, gold_label_key='sequence_labels')
        utils.from_bio_to_bioul(temp_fp, conll_fp)
temp_election_directory = Path('/tmp/election_dataset_dir') train_data = wang_2017_election_twitter_train( temp_election_directory) test_data = wang_2017_election_twitter_test( temp_election_directory) # Use the same size validation as the test data test_size = len(test_data) # Create the train and validation splits train_data = list(train_data.values()) train_data, val_data = train_test_split(train_data, test_size=test_size) train_data = TargetTextCollection(train_data) val_data = TargetTextCollection(val_data) # Tokenize the data datasets = [train_data, val_data, test_data] tokenizer = spacy_tokenizer() sizes = [] for dataset in datasets: dataset.tokenize(tokenizer) returned_errors = dataset.sequence_labels(return_errors=True) if returned_errors: for error in returned_errors: error_id = error['text_id'] del dataset[error_id] returned_errors = dataset.sequence_labels(return_errors=True) if returned_errors: raise ValueError('Sequence label errors are still persisting') sizes.append(len(dataset)) print( f'Lengths Train: {sizes[0]}, Validation: {sizes[1]}, Test: {sizes[2]}'
def test_exact_match_score(self):
    '''
    Exercises ``TargetTextCollection.exact_match_score``.

    Throughout the test the result is treated as four values, unpacked as
    ``recall, precision, f1, error_analysis``. The fourth value is indexed
    with the keys ``FP``, ``FN`` and ``TP`` and compared against lists of
    ``(text_id, Span)`` tuples. Where the order of those lists is not relied
    upon the test sorts them by span start or end before comparing.

    Each scenario reuses or replaces ``test_collection``, so the statements
    have to stay in this order.
    '''
    # Simple case where it should get perfect score
    test_collection = TargetTextCollection([self._target_text_example()])
    test_collection.tokenize(spacy_tokenizer())
    test_collection.sequence_labels()
    measures = test_collection.exact_match_score('sequence_labels')
    for index, measure in enumerate(measures):
        # Index 3 is the error analysis dictionary, the first three values
        # are the scores.
        if index == 3:
            assert measure['FP'] == []
            assert measure['FN'] == []
            assert measure['TP'] == [('2', Span(4, 15)), ('2', Span(30, 35))]
        else:
            assert measure == 1.0

    # Something that has perfect precision but misses one therefore does
    # not have perfect recall nor f1
    test_collection = TargetTextCollection(
        self._target_text_measure_examples())
    test_collection.tokenize(str.split)
    # text = 'The laptop case was great and cover was rubbish'
    sequence_labels_0 = ['O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O']
    test_collection['0']['sequence_labels'] = sequence_labels_0
    # text = 'The laptop price was awful'
    sequence_labels_1 = ['O', 'B', 'I', 'O', 'O']
    test_collection['1']['sequence_labels'] = sequence_labels_1
    recall, precision, f1, error_analysis = test_collection.exact_match_score(
        'sequence_labels')
    assert precision == 1.0
    assert recall == 2.0 / 3.0
    assert f1 == 0.8
    assert error_analysis['FP'] == []
    assert error_analysis['FN'] == [('0', Span(4, 15))]
    assert error_analysis['TP'] == [('0', Span(30, 35)), ('1', Span(4, 16))]

    # Something that has perfect recall but not precision as it over
    # predicts
    sequence_labels_0 = ['O', 'B', 'I', 'B', 'O', 'O', 'B', 'O', 'O']
    test_collection['0']['sequence_labels'] = sequence_labels_0
    sequence_labels_1 = ['O', 'B', 'I', 'O', 'O']
    test_collection['1']['sequence_labels'] = sequence_labels_1
    recall, precision, f1, error_analysis = test_collection.exact_match_score(
        'sequence_labels')
    assert precision == 3 / 4
    assert recall == 1.0
    assert round(f1, 3) == 0.857
    assert error_analysis['FP'] == [('0', Span(16, 19))]
    assert error_analysis['FN'] == []
    assert error_analysis['TP'] == [('0', Span(4, 15)), ('0', Span(30, 35)),
                                    ('1', Span(4, 16))]

    # Does not predict anything for a whole sentence therefore will have
    # perfect precision but bad recall (mainly testing whether getting
    # nothing for a sentence matters)
    sequence_labels_0 = ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
    test_collection['0']['sequence_labels'] = sequence_labels_0
    sequence_labels_1 = ['O', 'B', 'I', 'O', 'O']
    test_collection['1']['sequence_labels'] = sequence_labels_1
    recall, precision, f1, error_analysis = test_collection.exact_match_score(
        'sequence_labels')
    assert precision == 1.0
    assert recall == 1 / 3
    assert f1 == 0.5
    assert error_analysis['FP'] == []
    fn_error = sorted(error_analysis['FN'], key=lambda x: x[1].start)
    assert fn_error == [('0', Span(4, 15)), ('0', Span(30, 35))]
    assert error_analysis['TP'] == [('1', Span(4, 16))]

    # Handle the edge case of not getting anything
    sequence_labels_0 = ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
    test_collection['0']['sequence_labels'] = sequence_labels_0
    sequence_labels_1 = ['O', 'O', 'O', 'O', 'O']
    test_collection['1']['sequence_labels'] = sequence_labels_1
    recall, precision, f1, error_analysis = test_collection.exact_match_score(
        'sequence_labels')
    assert precision == 0.0
    assert recall == 0.0
    assert f1 == 0.0
    assert error_analysis['FP'] == []
    fn_error = sorted(error_analysis['FN'], key=lambda x: x[1].start)
    assert fn_error == [('0', Span(4, 15)), ('1', Span(4, 16)),
                        ('0', Span(30, 35))]
    assert error_analysis['TP'] == []

    # The case where the tokens and the text do not align
    not_align_example = self._target_text_not_align_example()
    # text = 'The laptop case; was awful'
    sequence_labels_align = ['O', 'B', 'I', 'O', 'O']
    test_collection.add(not_align_example)
    test_collection.tokenize(str.split)
    test_collection['inf']['sequence_labels'] = sequence_labels_align
    sequence_labels_0 = ['O', 'B', 'I', 'O', 'O', 'O', 'B', 'O', 'O']
    test_collection['0']['sequence_labels'] = sequence_labels_0
    sequence_labels_1 = ['O', 'B', 'I', 'O', 'O']
    test_collection['1']['sequence_labels'] = sequence_labels_1
    recall, precision, f1, error_analysis = test_collection.exact_match_score(
        'sequence_labels')
    assert recall == 3 / 4
    assert precision == 3 / 4
    assert f1 == 0.75
    assert error_analysis['FP'] == [('inf', Span(4, 16))]
    assert error_analysis['FN'] == [('inf', Span(4, 15))]
    tp_error = sorted(error_analysis['TP'], key=lambda x: x[1].start)
    assert tp_error == [('0', Span(4, 15)), ('1', Span(4, 16)),
                        ('0', Span(30, 35))]

    # This time it can get a perfect score as the token alignment will be
    # perfect
    test_collection.tokenize(spacy_tokenizer())
    sequence_labels_align = ['O', 'B', 'I', 'O', 'O', 'O']
    test_collection['inf']['sequence_labels'] = sequence_labels_align
    recall, precision, f1, error_analysis = test_collection.exact_match_score(
        'sequence_labels')
    assert recall == 1.0
    assert precision == 1.0
    assert f1 == 1.0
    assert error_analysis['FP'] == []
    assert error_analysis['FN'] == []
    tp_error = sorted(error_analysis['TP'], key=lambda x: x[1].end)
    assert tp_error == [('0', Span(4, 15)), ('inf', Span(4, 15)),
                        ('1', Span(4, 16)), ('0', Span(30, 35))]

    # Handle the case where one of the samples has no spans
    test_example = TargetText(text="I've had a bad day", text_id='50')
    other_examples = self._target_text_measure_examples()
    other_examples.append(test_example)
    test_collection = TargetTextCollection(other_examples)
    test_collection.tokenize(str.split)
    test_collection.sequence_labels()
    measures = test_collection.exact_match_score('sequence_labels')
    for index, measure in enumerate(measures):
        if index == 3:
            assert measure['FP'] == []
            assert measure['FN'] == []
            tp_error = sorted(measure['TP'], key=lambda x: x[1].end)
            assert tp_error == [('0', Span(4, 15)), ('1', Span(4, 16)),
                                ('0', Span(30, 35))]
        else:
            assert measure == 1.0

    # Handle the case where one of the samples has no spans but a span has
    # been predicted for it
    test_collection['50']['sequence_labels'] = ['B', 'I', 'O', 'O', 'O']
    recall, precision, f1, error_analysis = test_collection.exact_match_score(
        'sequence_labels')
    assert recall == 1.0
    assert precision == 3 / 4
    assert round(f1, 3) == 0.857
    assert error_analysis['FP'] == [('50', Span(start=0, end=8))]
    assert error_analysis['FN'] == []
    tp_error = sorted(error_analysis['TP'], key=lambda x: x[1].end)
    assert tp_error == [('0', Span(4, 15)), ('1', Span(4, 16)),
                        ('0', Span(30, 35))]

    # See if it can handle a collection whose only sample has no spans
    test_example = TargetText(text="I've had a bad day", text_id='50')
    test_collection = TargetTextCollection([test_example])
    test_collection.tokenize(str.split)
    test_collection.sequence_labels()
    measures = test_collection.exact_match_score('sequence_labels')
    for index, measure in enumerate(measures):
        if index == 3:
            assert measure['FP'] == []
            assert measure['FN'] == []
            assert measure['TP'] == []
        else:
            assert measure == 0.0

    # Handle the case where the collection has no true spans but one span
    # has been predicted by mistake
    test_collection['50']['sequence_labels'] = ['B', 'I', 'O', 'O', 'O']
    measures = test_collection.exact_match_score('sequence_labels')
    for index, measure in enumerate(measures):
        if index == 3:
            assert measure['FP'] == [('50', Span(0, 8))]
            assert measure['FN'] == []
            assert measure['TP'] == []
        else:
            assert measure == 0.0

    # Should raise a KeyError if one of the TargetText instances does
    # not have a Span key
    del test_collection['50']._storage['spans']
    with pytest.raises(KeyError):
        test_collection.exact_match_score('sequence_labels')

    # Should raise a KeyError if one of the TargetText instances does
    # not have a predicted sequence key
    test_collection = TargetTextCollection([self._target_text_example()])
    test_collection.tokenize(spacy_tokenizer())
    test_collection.sequence_labels()
    with pytest.raises(KeyError):
        measures = test_collection.exact_match_score('nothing')

    # Should raise a ValueError if there are multiple same true spans
    a = TargetText(text='hello how are you I am good', text_id='1',
                   targets=['hello', 'hello'],
                   spans=[Span(0, 5), Span(0, 5)])
    test_collection = TargetTextCollection([a])
    test_collection.tokenize(str.split)
    test_collection['1']['sequence_labels'] = [
        'B', 'O', 'O', 'O', 'O', 'O', 'O'
    ]
    with pytest.raises(ValueError):
        test_collection.exact_match_score('sequence_labels')
def dataset_target_extraction_statistics(collections: List[TargetTextCollection],
                                         lower_target: bool = True,
                                         target_key: str = 'targets',
                                         tokeniser: Callable[[str], List[str]]=spacy_tokenizer(),
                                         dataframe_format: bool = False,
                                         incl_sentence_statistics: bool = True
                                         ) -> Union[List[Dict[str, Union[str,int,float]]],
                                                    pd.DataFrame]:
    '''
    Computes target extraction statistics for each of the given collections.

    :param collections: A list of collections
    :param lower_target: Whether to lower case the targets before counting
                         the unique targets.
    :param target_key: The key within each sample in each collection that
                       contains the list of targets to be analysed. This
                       can also be the predicted target key, which might be
                       useful for error analysis.
    :param tokeniser: The tokeniser used to split the target(s) and the
                      sentences into tokens. See
                      :py:mod:`target_extraction.tokenizers` for a module of
                      compatible tokenisers. This is required to give
                      statistics on target and sentence length.
    :param dataframe_format: If True the statistics are converted with
                             ``_statistics_to_dataframe`` before being
                             returned, instead of returning the list of
                             dictionaries.
    :param incl_sentence_statistics: If False the two mean sentence length
                                     statistics are not computed. This is so
                                     that the statistics can still be
                                     created for datasets that have been
                                     anonymised.
    :returns: One dictionary of statistics per collection, in the order the
              collections were given. Each dictionary has the following
              keys:

              1. Name -- this comes from the collection's name attribute
              2. No. Sentences -- number of sentences in the collection
              3. No. Sentences(t) -- number of sentences that contain
                 targets.
              4. No. Targets -- number of targets
              5. No. Uniq Targets -- number of unique targets
              6. ATS -- Average Target per Sentence (ATS)
              7. ATS(t) -- ATS but where all sentences in the collection
                 must contain at least one target.
              8. TL 1 % -- Percentage of targets that are length 1 based on
                 the number of tokens.
              9. TL 2 % -- Percentage of targets that are length 2 based on
                 the number of tokens.
              10. TL 3+ % -- Percentage of targets that are length 3+ based
                  on the number of tokens.
              11. Mean Sentence Length -- Mean number of tokens per
                  sentence. Only present when `incl_sentence_statistics`
                  is True.
              12. Mean Sentence Length(t) -- `Mean Sentence Length` but
                  where all sentences must contain at least one target.
                  Only present when `incl_sentence_statistics` is True.

              All fractional values are rounded to two decimal places.
    '''
    dataset_stats: List[Dict[str, Union[str,int,float]]] = []
    for collection in collections:
        collection_stats = {}
        collection_stats['Name'] = collection.name
        collection_stats['No. Sentences'] = len(collection)
        collection_stats['No. Sentences(t)'] = len(collection.samples_with_targets())
        collection_stats['No. Targets'] = collection.number_targets()
        collection_stats['No. Uniq Targets'] = len(collection.target_count(lower=lower_target))
        collection_stats['ATS'] = round(average_target_per_sentences(collection, False), 2)
        collection_stats['ATS(t)'] = round(average_target_per_sentences(collection, True), 2)

        # Fractions of targets per token length, converted to percentages.
        target_lengths = tokens_per_target(collection, target_key, tokeniser,
                                           normalise=True)
        collection_stats['TL 1 %'] = round(target_lengths[1] * 100, 2)
        collection_stats['TL 2 %'] = round(target_lengths[2] * 100, 2)
        three_plus = sum(fraction
                         for token_length, fraction in target_lengths.items()
                         if token_length > 2)
        collection_stats['TL 3+ %'] = round(three_plus * 100, 2)

        if not incl_sentence_statistics:
            dataset_stats.append(collection_stats)
            continue

        # Mean sentence length over all samples first, then over only the
        # samples that contain targets.
        for samples_with_targets_only in [False, True]:
            if samples_with_targets_only:
                sentence_lengths = tokens_per_sentence(collection.samples_with_targets(),
                                                       tokeniser)
            else:
                sentence_lengths = tokens_per_sentence(collection, tokeniser)
            # Expand the length -> count mapping into one entry per sentence
            # so that the mean is weighted by the counts.
            sentence_lengths_flattened = []
            for length, count in sentence_lengths.items():
                sentence_lengths_flattened.extend([length] * count)
            mean_sentence_length = round(statistics.mean(sentence_lengths_flattened), 2)
            if samples_with_targets_only:
                collection_stats['Mean Sentence Length(t)'] = mean_sentence_length
            else:
                collection_stats['Mean Sentence Length'] = mean_sentence_length
        dataset_stats.append(collection_stats)
    if dataframe_format:
        return _statistics_to_dataframe(dataset_stats)
    return dataset_stats
def test_read_from_file(self, lazy: bool, left_right_contexts: bool,
                        reverse_right_context: bool, incl_target: bool,
                        target_sequences: bool, position_embeddings: bool,
                        max_position_distance: int, position_weights: bool):
    '''
    Checks the instances that `TargetSentimentDatasetReader` produces from
    the JSON fixtures in the `target_sentiment` test data directory.

    The scenarios are tested in this order:

    1. Invalid constructor argument combinations raise a `ValueError`.
    2. Target sentiment data, including the left and right contexts.
    3. Category sentiment data.
    4. Data containing both targets and categories.
    5. Text only data, which has to raise a `ValueError`.
    6. The `target_sequences`, `position_embeddings`, `position_weights`
       and `max_position_distance` arguments.
    7. Data containing both target and category sentiments.

    :param lazy: Passed to every reader that is constructed.
    :param left_right_contexts: Whether the readers should create the left
                                and right context fields.
    :param reverse_right_context: Whether the right context is expected in
                                  reverse token order.
    :param incl_target: Whether the contexts are expected to contain the
                        target.
    :param target_sequences: Whether the target sequence fields are
                             expected.
    :param position_embeddings: Whether the position embedding fields are
                                expected.
    :param max_position_distance: Largest expected position value. The test
                                  also handles `None`.
    :param position_weights: Whether the position weight field is expected.
    '''
    # Test that a ValueError is raised if left_right_contexts is False
    # and incl_target is True
    with pytest.raises(ValueError):
        TargetSentimentDatasetReader(lazy=lazy, incl_target=True,
                                     left_right_contexts=False,
                                     use_categories=True)
    # Test that a ValueError is raised if left_right_contexts is False
    # and reverse_right_context is True
    with pytest.raises(ValueError):
        TargetSentimentDatasetReader(lazy=lazy, reverse_right_context=True,
                                     left_right_contexts=False,
                                     use_categories=True)
    # The rest of the test constructs readers from the given arguments, so
    # stop here for the combinations that were just shown to be invalid.
    if not left_right_contexts and (reverse_right_context or incl_target):
        return

    data_dir = Path(__file__, '..', '..', '..', 'data', 'allen',
                    'dataset_readers', 'target_sentiment').resolve()
    tokenizer = spacy_tokenizer()

    def token_texts(field):
        # The text of every token within the given field.
        return [token.text for token in field]

    def assert_token_lists(expected_lists, list_field):
        # Every sub-field of `list_field` has to hold the expected tokens.
        for index, sub_field in enumerate(list_field):
            assert expected_lists[index] == token_texts(sub_field)

    def assert_metadata(expected, fields, keys):
        # The metadata field has to agree with the expected instance.
        for key in keys:
            assert expected[key] == fields['metadata'][key]

    def assert_contexts(expected, fields):
        # The left and right context fields have to hold the expected tokens.
        assert_token_lists(expected['left_contexts'],
                           fields['left_contexts'])
        assert_token_lists(expected['right_contexts'],
                           fields['right_contexts'])

    def read_instances(dataset_reader, file_name):
        # All of the instances the reader creates from a fixture file.
        file_path = Path(data_dir, file_name).resolve()
        return ensure_list(dataset_reader.read(str(file_path)))

    target_metadata_keys = ('text', 'text words', 'targets', 'target words')

    # Test the targets case and the include target case with respect to the
    # left and right contexts
    reader = TargetSentimentDatasetReader(
        lazy=lazy, incl_target=incl_target,
        left_right_contexts=left_right_contexts,
        reverse_right_context=reverse_right_context, use_categories=True)
    text1 = ("I charge it at night and skip taking the cord with me "
             "because of the good battery life")
    targets1 = ["cord", "battery life"]
    instance1 = {
        'text': text1,
        'text words': tokenizer(text1),
        'targets': targets1,
        'target words': [tokenizer(target) for target in targets1],
        'target_sentiments': ["neutral", "positive"],
    }
    if left_right_contexts:
        left_texts = [
            "I charge it at night and skip taking the ",
            "I charge it at night and skip taking the cord with me because of the good ",
        ]
        right_texts = [" with me because of the good battery life", ""]
        if incl_target:
            left_texts = [
                "I charge it at night and skip taking the cord",
                "I charge it at night and skip taking the cord with me because of the good battery life",
            ]
            right_texts = [
                "cord with me because of the good battery life",
                "battery life",
            ]
        if reverse_right_context:
            right_texts = ["life battery good the of because me with", ""]
            if incl_target:
                right_texts = [
                    "life battery good the of because me with cord",
                    "life battery",
                ]
        instance1['left_contexts'] = [tokenizer(text)
                                      for text in left_texts]
        instance1['right_contexts'] = [tokenizer(text)
                                       for text in right_texts]

    text2 = ("it is of high quality, has a killer GUI, is extremely stable, "
             "is highly expandable, is bundled with lots of very good "
             "applications, is easy to use, and is absolutely gorgeous.")
    targets2 = ["quality", "GUI", "applications", "use"]
    instance2 = {
        'text': text2,
        'text words': tokenizer(text2),
        'targets': targets2,
        'target words': [tokenizer(target) for target in targets2],
        'target_sentiments': ["positive", "positive", "positive",
                              "positive"],
    }

    instances = read_instances(reader, 'target_sentiments.json')
    assert len(instances) == 2
    true_instances = [instance1, instance2]
    for i, instance in enumerate(instances):
        # Expected contexts only exist for the first instance, so the
        # second instance is skipped when the contexts are requested.
        if left_right_contexts and i == 1:
            continue
        fields = instance.fields
        true_instance = true_instances[i]
        assert true_instance["text words"] == token_texts(fields['tokens'])
        assert_token_lists(true_instance["target words"], fields['targets'])
        assert true_instance['target_sentiments'] == fields[
            'target_sentiments'].labels
        assert_metadata(true_instance, fields, target_metadata_keys)
        if left_right_contexts:
            assert_contexts(true_instance, fields)
            assert 6 == len(fields)
        else:
            assert 4 == len(fields)

    # Test the categories case
    reader = TargetSentimentDatasetReader(lazy=lazy, incl_target=False,
                                          left_right_contexts=False,
                                          use_categories=True)
    text1 = "Not only was the food outstanding, but the little perks were great."
    instance1 = {
        'text': text1,
        'text words': tokenizer(text1),
        'categories': ["food", "service"],
        'category_sentiments': ["positive", "positive"],
    }
    text2 = ("To be completely fair, the only redeeming factor was the food, "
             "which was above average, but couldnt make up for all the other "
             "deficiencies of Teodora.")
    instance2 = {
        'text': text2,
        'text words': tokenizer(text2),
        'categories': ["food", "anecdotes/miscellaneous"],
        'category_sentiments': ["positive", "negative"],
    }

    instances = read_instances(reader, 'category_sentiments.json')
    assert len(instances) == 2
    true_instances = [instance1, instance2]
    for i, instance in enumerate(instances):
        fields = instance.fields
        true_instance = true_instances[i]
        assert true_instance["text words"] == token_texts(fields['tokens'])
        assert true_instance["categories"] == token_texts(
            fields['categories'])
        assert true_instance['category_sentiments'] == fields[
            'category_sentiments'].labels
        assert_metadata(true_instance, fields,
                        ('text', 'text words', 'categories'))
        assert 4 == len(fields)

    # Test the categories and target case
    reader = TargetSentimentDatasetReader(
        lazy=lazy, incl_target=False,
        left_right_contexts=left_right_contexts,
        reverse_right_context=reverse_right_context, use_categories=True)
    staff_text = ("We, there were four of us, arrived at noon - the place was "
                  "empty - and the staff acted like we were imposing on them and "
                  "they were very rude.")
    staff_targets = ["staff"]
    staff_instance = {
        'text': staff_text,
        'text words': tokenizer(staff_text),
        'targets': staff_targets,
        'target words': [tokenizer(target) for target in staff_targets],
        'categories': ["SERVICE#GENERAL", "SOMETHING"],
        'target_sentiments': ["negative"],
    }
    if left_right_contexts:
        left_texts = [
            "We, there were four of us, arrived at noon - the place was empty - and the "
        ]
        right_texts = [
            " acted like we were imposing on them and they were very rude."
        ]
        if reverse_right_context:
            right_texts = [
                ". rude very were they and them on imposing were we like acted"
            ]
        staff_instance['left_contexts'] = [tokenizer(text)
                                           for text in left_texts]
        staff_instance['right_contexts'] = [tokenizer(text)
                                            for text in right_texts]

    food_text = "The food was lousy - too sweet or too salty and the portions tiny."
    food_targets = ["food", "portions"]
    food_instance = {
        'text': food_text,
        'text words': tokenizer(food_text),
        'targets': food_targets,
        'target words': [tokenizer(target) for target in food_targets],
        'categories': ["FOOD#QUALITY", "FOOD#STYLE_OPTIONS"],
        'target_sentiments': ["negative", "negative"],
    }

    target_category_metadata_keys = target_metadata_keys + ('categories',)
    instances = read_instances(reader, 'target_category_sentiments.json')
    assert len(instances) == 2
    true_instances = [staff_instance, food_instance]
    for i, instance in enumerate(instances):
        # Expected contexts only exist for the first instance, so the
        # second instance is skipped when the contexts are requested.
        if left_right_contexts and i == 1:
            continue
        fields = instance.fields
        true_instance = true_instances[i]
        assert true_instance["text words"] == token_texts(fields['tokens'])
        assert_token_lists(true_instance["target words"], fields['targets'])
        assert true_instance['target_sentiments'] == fields[
            'target_sentiments'].labels
        assert true_instance["categories"] == token_texts(
            fields['categories'])
        assert_metadata(true_instance, fields, target_category_metadata_keys)
        if left_right_contexts:
            assert_contexts(true_instance, fields)
            assert 7 == len(fields)
        else:
            assert 5 == len(fields)

    # Test the case for the Left right contexts case where the spans are not
    # given
    reader = TargetSentimentDatasetReader(
        lazy=lazy, incl_target=False,
        left_right_contexts=left_right_contexts,
        reverse_right_context=reverse_right_context, use_categories=True)
    text_fp = Path(data_dir, 'just_text.json')
    with pytest.raises(ValueError):
        ensure_list(reader.read(str(text_fp)))
    # Test the case for when we are not using the left right contexts
    # and no targets or categories are given
    reader = TargetSentimentDatasetReader(lazy=lazy, incl_target=False,
                                          left_right_contexts=False,
                                          reverse_right_context=False,
                                          use_categories=True)
    with pytest.raises(ValueError):
        ensure_list(reader.read(str(text_fp)))

    # Test the target_sequences argument
    no_position_fields = not position_embeddings and not position_weights
    if left_right_contexts:
        # This section is skipped when the contexts are requested.
        pass
    elif max_position_distance is not None and no_position_fields:
        # A maximum distance without any position field has to be rejected.
        with pytest.raises(ValueError):
            TargetSentimentDatasetReader(
                lazy=lazy, max_position_distance=max_position_distance,
                position_embeddings=position_embeddings,
                position_weights=position_weights)
    else:
        # Tests raises an error if the left_right_contexts is True
        with pytest.raises(ValueError):
            TargetSentimentDatasetReader(lazy=lazy,
                                         left_right_contexts=True,
                                         target_sequences=True)
        # Different text, targets and spans are expected when none of the
        # sequence or position options are switched on.
        plain_reader = not target_sequences and no_position_fields

        text1 = 'The laptop case was great and awfulcover'
        targets1 = ['laptop case', 'case was great']
        spans1 = [[4, 15], [11, 25]]
        if plain_reader:
            text1 = "Thelaptopcasewas great and awfulcover"
            targets1 = ["laptopcase", "casewas great"]
            spans1 = [[3, 13], [9, 22]]
        instance1 = {
            'text': text1,
            'text words': tokenizer(text1),
            'targets': targets1,
            'target words': [tokenizer(target) for target in targets1],
            'spans': spans1,
            'target_sequences': [
                [[0, 1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0]],
                [[0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0],
                 [0, 0, 0, 0, 1, 0, 0]],
            ],
            'target_sentiments': ["neutral", "positive"],
            'position_weights': [[2, 1, 1, 2, 3, 4, 5],
                                 [3, 2, 1, 1, 1, 2, 3]],
            'position_embeddings': [['2', '1', '1', '2', '3', '4', '5'],
                                    ['3', '2', '1', '1', '1', '2', '3']],
        }

        text2 = "it is of high quality , has a killer GUI"
        targets2 = ["quality", "GUI"]
        spans2 = [[14, 21], [37, 40]]
        if plain_reader:
            text2 = "it is of high quality, has a killer GUI"
            spans2 = [[14, 21], [36, 39]]
        position_embedding_seq2 = [
            ['5', '4', '3', '2', '1', '2', '3', '4', '5', '6'],
            ['10', '9', '8', '7', '6', '5', '4', '3', '2', '1'],
        ]
        position_weights_seq2 = [[5, 4, 3, 2, 1, 2, 3, 4, 5, 6],
                                 [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]]
        if max_position_distance is not None:
            # Expected values when the positions are capped (here at 5).
            position_embedding_seq2 = [
                ['5', '4', '3', '2', '1', '2', '3', '4', '5', '5'],
                ['5', '5', '5', '5', '5', '5', '4', '3', '2', '1'],
            ]
            position_weights_seq2 = [[5, 4, 3, 2, 1, 2, 3, 4, 5, 5],
                                     [5, 5, 5, 5, 5, 5, 4, 3, 2, 1]]
        instance2 = {
            'text': text2,
            'text words': tokenizer(text2),
            'targets': targets2,
            'target words': [tokenizer(target) for target in targets2],
            'spans': spans2,
            'target_sequences': [[[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]],
                                 [[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]],
            'target_sentiments': ["positive", "positive"],
            'position_weights': position_weights_seq2,
            'position_embeddings': position_embedding_seq2,
        }

        # Drop the expectations of every option that is switched off.
        optional_fields = ((target_sequences, 'target_sequences'),
                           (position_embeddings, 'position_embeddings'),
                           (position_weights, 'position_weights'))
        for enabled, key in optional_fields:
            if not enabled:
                del instance1[key]
                del instance2[key]

        reader = TargetSentimentDatasetReader(
            lazy=lazy, target_sequences=target_sequences,
            position_embeddings=position_embeddings,
            position_weights=position_weights,
            max_position_distance=max_position_distance)
        instances = read_instances(
            reader, 'target_sentiment_target_sequences.json')
        assert len(instances) == 2
        true_instances = [instance1, instance2]
        # Each enabled option adds one field to the four base fields.
        number_fields = 4 + sum(1 for enabled, _ in optional_fields
                                if enabled)
        for i, instance in enumerate(instances):
            fields = instance.fields
            true_instance = true_instances[i]
            assert true_instance["text words"] == token_texts(
                fields['tokens'])
            assert_token_lists(true_instance["target words"],
                               fields['targets'])
            assert true_instance['target_sentiments'] == fields[
                'target_sentiments'].labels
            assert_metadata(true_instance, fields, target_metadata_keys)
            assert number_fields == len(fields)
            if target_sequences:
                for index, target_sequence in enumerate(
                        fields['target_sequences']):
                    true_array = np.array(
                        true_instance["target_sequences"][index])
                    assert np.array_equal(true_array,
                                          target_sequence.array)
            if position_embeddings:
                assert_token_lists(true_instance["position_embeddings"],
                                   fields['position_embeddings'])
            if position_weights:
                position_weight_array = np.array(
                    true_instance["position_weights"])
                assert np.array_equal(position_weight_array,
                                      fields['position_weights'].array)

        # Ensure raises error if the max_position_distance is less than 2
        # NOTE(review): the read stays inside the raises block because it is
        # not visible from here whether the ValueError comes from the
        # constructor or from reading -- confirm and narrow if possible.
        with pytest.raises(ValueError):
            reader = TargetSentimentDatasetReader(
                lazy=lazy, target_sequences=target_sequences,
                position_embeddings=position_embeddings,
                position_weights=position_weights,
                max_position_distance=1)
            read_instances(reader, 'target_sentiment_target_sequences.json')

    # Test the case of both target and category sentiments.
    reader = TargetSentimentDatasetReader(
        lazy=lazy, incl_target=False,
        left_right_contexts=left_right_contexts,
        reverse_right_context=reverse_right_context, use_categories=True)
    # Same texts, targets and contexts as the categories and target case,
    # only the categories differ and category sentiments are added.
    instance1 = dict(staff_instance,
                     categories=["SERVICE#GENERAL", "SOMETHING", "ANOTHER"],
                     category_sentiments=["positive", "positive",
                                          "negative"])
    instance2 = dict(food_instance,
                     category_sentiments=["positive", "neutral"])

    instances = read_instances(
        reader, 'target_sentiments_category_sentiments.json')
    assert len(instances) == 2
    true_instances = [instance1, instance2]
    for i, instance in enumerate(instances):
        # Expected contexts only exist for the first instance, so the
        # second instance is skipped when the contexts are requested.
        if left_right_contexts and i == 1:
            continue
        fields = instance.fields
        true_instance = true_instances[i]
        assert true_instance["text words"] == token_texts(fields['tokens'])
        assert_token_lists(true_instance["target words"], fields['targets'])
        assert true_instance['target_sentiments'] == fields[
            'target_sentiments'].labels
        assert true_instance["categories"] == token_texts(
            fields['categories'])
        assert true_instance["category_sentiments"] == fields[
            'category_sentiments'].labels
        assert_metadata(true_instance, fields, target_category_metadata_keys)
        if left_right_contexts:
            assert_contexts(true_instance, fields)
            assert 8 == len(fields)
        else:
            assert 6 == len(fields)