def test_not_semeval_file(self, conflict: bool):
        # A file that is readable XML but does not follow the SemEval
        # 2014 format should make ``semeval_2014`` raise a SyntaxError.
        non_semeval_fp = Path(self.DATA_PATH_DIR,
                              'not_semeval_14_example.xml')
        with pytest.raises(SyntaxError):
            semeval_2014(non_semeval_fp, conflict)
    def test_unreadable_file(self, conflict: bool):
        # A file containing malformed XML (e.g. mismatched tags) should
        # make ``semeval_2014`` raise a ParseError.
        malformed_fp = Path(self.DATA_PATH_DIR,
                            'unpassable_semeval_14_example.xml')
        with pytest.raises(ParseError):
            semeval_2014(malformed_fp, conflict)
    def test_read_from_file(self, conflict: bool):
        # Parse the example SemEval 2014 file and compare the resulting
        # collection against the known ground-truth answers.
        data_fp = Path(self.DATA_PATH_DIR, 'semeval_14_example.xml')
        collection = semeval_2014(data_fp, conflict)

        # The example file contains exactly these 5 sentences, in order.
        assert len(collection) == 5
        expected_ids = ['3121', '2777', '2534', '2634', '1793']
        assert list(collection.keys()) == expected_ids

        # Every stored key of every expected answer must match the parsed
        # value exactly.
        expected_answers = self._target_answer(conflict=conflict)
        for answer_id, expected in expected_answers.items():
            parsed = collection[answer_id]
            for key in expected:
                assert expected[key] == parsed[key]
# ---- Example 4 ----
                        help='File Path to the output predictions')
    args = parser.parse_args()

    # Build an AllenNLP target-tagger model named after the chosen dataset;
    # the model is saved to / loaded from ``args.model_save_dir``.
    dataset_name = args.dataset_name
    model_name = f'{dataset_name} model'
    model = AllenNLPModel(model_name, args.model_config, 'target-tagger',
                          args.model_save_dir)

    if dataset_name == 'semeval_2014':
        # SemEval data comes from user-supplied file paths, so both the
        # training and test paths must be present.
        if not args.train_fp or not args.test_fp:
            raise ValueError('If training and predicting for the SemEval '
                             'datasets the training and test file paths must '
                             'be given')
        # As we are performing target extraction we use the conflict polarity
        # targets like prior work
        train_data = semeval_2014(args.train_fp, conflict=True)
        test_data = semeval_2014(args.test_fp, conflict=True)
    else:
        # Any other dataset name falls back to the Election Twitter dataset,
        # stored under ./data/twitter_election_dataset.
        temp_election_directory = Path('.', 'data', 'twitter_election_dataset')
        train_data = wang_2017_election_twitter_train(temp_election_directory)
        test_data = wang_2017_election_twitter_test(temp_election_directory)

    # Only create train/validation splits when no saved model directory
    # exists yet.
    if not args.model_save_dir.is_dir():
        # Use the same size validation as the test data
        test_size = len(test_data)
        # Create the train and validation splits
        train_data = list(train_data.values())
        train_data, val_data = train_test_split(train_data,
                                                test_size=test_size)
        train_data = TargetTextCollection(train_data)
        val_data = TargetTextCollection(val_data)
for dataset_name in dataset_names:
    for split_name in split_names:
        # Skip any dataset/split combination whose JSON file has already
        # been written to disk.
        dataset_fp = config.neural_dataset_dir / f'{dataset_name} {split_name}.json'
        if dataset_fp.exists():
            continue
        dataset = None
        if dataset_name == 'Election':
            # The Election dataset is parsed with the project's parsers, then
            # round-tripped through a temporary SemEval-14 XML file so that it
            # can be re-read with the ``semeval_2014`` parser.
            if split_name == 'train':
                election_train = parsers.election_train(config.ELECTION,
                                                        name='Election Train')
                print(f'Number of targets before {len(election_train)}')
                election_train = get_targets_from_spans(election_train)
                temp_fp = config.neural_dataset_dir / 'Temp Election train.xml'
                write_data.semeval_14(temp_fp, election_train)
                dataset = semeval_2014(temp_fp, conflict=False)
            else:
                # NOTE(review): the test split is parsed with
                # ``name='Election Train'`` — possibly a copy/paste slip;
                # confirm whether this should be 'Election Test'.
                election_test = parsers.election_test(config.ELECTION,
                                                      name='Election Train')
                print(f'Number of targets before {len(election_test)}')
                election_test = get_targets_from_spans(election_test)
                temp_fp = config.neural_dataset_dir / 'Temp Election test.xml'
                write_data.semeval_14(temp_fp, election_test)
                dataset = semeval_2014(temp_fp, conflict=False)
        elif dataset_name == 'Dong':
            # The Dong dataset is parsed from a file-path mapping; each value
            # carries target spans (this fragment is truncated below).
            dong_dataset = parsers.dong(
                dataset_fp_mapper[f'{dataset_name} {split_name}'])
            print(f'Number of targets before {len(dong_dataset)}')
            new_dong_dataset = []
            for value in dong_dataset.data_dict():
                target_spans = value['spans']
# ---- Example 6 ----
    args = parser.parse_args()

    # Build an AllenNLP target-tagger model named after the chosen dataset.
    dataset_name = args.dataset_name
    model_name = f'{dataset_name} model'
    model = AllenNLPModel(model_name, args.model_config, 'target-tagger',
                          args.model_save_dir)

    # Only load data and create train/validation splits when no saved model
    # directory exists yet.
    if not args.model_save_dir.is_dir():

        if dataset_name == 'semeval_2014':
            # SemEval requires explicit train and test file paths.
            if not args.train_fp or not args.test_fp:
                raise ValueError(
                    'If training and predicting for the SemEval '
                    'datasets the training and test file paths must '
                    'be given')
            # The second positional argument is the ``conflict`` flag
            # (False here), matching the keyword usage elsewhere.
            train_data = semeval_2014(args.train_fp, False)
            test_data = semeval_2014(args.test_fp, False)
        else:
            # Fall back to the Election Twitter dataset cached in /tmp.
            temp_election_directory = Path('/tmp/election_dataset_dir')
            train_data = wang_2017_election_twitter_train(
                temp_election_directory)
            test_data = wang_2017_election_twitter_test(
                temp_election_directory)
        # Use the same size validation as the test data
        test_size = len(test_data)
        # Create the train and validation splits
        train_data = list(train_data.values())
        train_data, val_data = train_test_split(train_data,
                                                test_size=test_size)
        train_data = TargetTextCollection(train_data)
        val_data = TargetTextCollection(val_data)
# ---- Example 7 ----
from target_extraction.data_types import TargetTextCollection
from target_extraction.dataset_parsers import semeval_2014
from target_extraction.tokenizers import spacy_tokenizer
from target_extraction.allen import AllenNLPModel

# Resolve the SemEval 2014 laptop dataset paths relative to this script.
semeval_2014_dir = Path(
    '..',
    'original_target_datasets',
    'semeval_2014',
).resolve()
train_fp = Path(semeval_2014_dir,
                "SemEval'14-ABSA-TrainData_v2 & AnnotationGuidelines",
                "Laptop_Train_v2.xml")
test_fp = Path(semeval_2014_dir, "ABSA_Gold_TestData", 'Laptops_Test_Gold.xml')

# Parse both splits; the second positional argument is the ``conflict``
# flag (False here), matching the keyword usage elsewhere.
train_data = semeval_2014(train_fp, False)
test_data = semeval_2014(test_fp, False)

# The validation split is made the same size as the test split.
test_size = len(test_data)
print(f'Size of train {len(train_data)}, size of test {test_size}')

train_data = list(train_data.values())
train_data, val_data = train_test_split(train_data, test_size=test_size)
train_data = TargetTextCollection(train_data)
val_data = TargetTextCollection(val_data)

# Tokenize every split with the same spaCy tokenizer.
# (``sizes`` is populated past this truncated fragment.)
datasets = [train_data, val_data, test_data]
tokenizer = spacy_tokenizer()
sizes = []
for dataset in datasets:
    dataset.tokenize(tokenizer)
# ---- Example 8 ----
            # Slice the target string out of the sentence text using the
            # first (start, end) span.
            value['target'] = value['text'][target_spans[0][0]: target_spans[0][1]]
            new_dong_dataset.append(bella.data_types.Target(**value))
        dataset = bella.data_types.TargetCollection(new_dong_dataset)
    else:
        # Any remaining dataset name is parsed straight from its SemEval-14
        # formatted training file.
        dataset = parsers.semeval_14(dataset_fp_mapper[f'{dataset_name} train'])
    assert dataset is not None
    # Down-sample the dataset to exactly ``size_of_small`` targets
    # (non-random split so the result is reproducible) and write it back
    # out in SemEval-14 format.
    dataset_size = len(dataset)
    test_split_size = size_of_small / dataset_size
    _, small_dataset = bella.data_types.TargetCollection.split_dataset(dataset, test_split_size, 
                                                                       random=False)
    assert len(small_dataset) == size_of_small
    write_data.semeval_14(dataset_fp, small_dataset)
    print(f'Number of targets in {dataset_name} non-neural training dataset {len(small_dataset)}')


    # Re-read the small dataset with the neural parser and sanity-check it.
    neural_small_dataset = semeval_2014(dataset_fp, conflict=False)
    # Just making sure each sentence contains at least one target.
    assert neural_small_dataset.one_sample_per_span(remove_empty=True).number_targets() == neural_small_dataset.number_targets()
    assert len(neural_small_dataset) == len(neural_small_dataset.samples_with_targets())
    print(f'Number of targets with new format {neural_small_dataset.number_targets()}')
    # For reproducibility reasons
    random_state = 42
    # Validation size is 20% of the training set size based on sentences not
    # targets
    train_size = len(neural_small_dataset)
    val_size = int(train_size * 0.2)
    train_dataset = list(neural_small_dataset.values())
    train, val = train_test_split(train_dataset, test_size=val_size, 
                                  random_state=random_state)
    train_dataset = target_extraction.data_types.TargetTextCollection(train)
    val_dataset = target_extraction.data_types.TargetTextCollection(val)