Example #1
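This and the following examples exercise the same BratExperiment class and omit their shared test-module preamble. A plausible sketch of that preamble is below; the import paths for BratExperiment, Sample and the md5 helper are assumptions, as is the use of moto's mock_s3 to back the fake 'test-bucket' that every test creates with boto3.

import json
from unittest.mock import MagicMock

import boto3
import pytest
import s3fs
from moto import mock_s3  # assumed: each test below is wrapped with @mock_s3

# Assumed import paths -- adjust to wherever these actually live in the project under test.
from brat_experiment import BratExperiment   # hypothetical module path
from sample import Sample                    # hypothetical module path
from utils import md5                        # hypothetical username-hashing helper
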
def test_BratExperiment_add_user():
    # given a valid brat experiment,
    # call add_user with a username and password
    # expect the user/pass to be added to config.py,
    # and for the user to be added to the experiment's metadata
    # and for add_allocation to be called with the user

    # setup: create a bucket for the brat config
    s3 = boto3.resource('s3')
    s3.create_bucket(Bucket='test-bucket')
    # initialize the experiment in this bucket
    experiment = BratExperiment(experiment_name='initial_skills_tag',
                                brat_s3_path='test-bucket/brat')

    # setup the add_allocation mock for later inspection
    add_allocation_mock = MagicMock()
    experiment.add_allocation = add_allocation_mock

    # add a user
    experiment.add_user('user', 'pass')

    # assert metadata
    assert 'user' in experiment.user_pw_store

    # assert that we attempted to allocate postings to them
    add_allocation_mock.assert_called_with('user')


def test_BratExperiment_labels_with_agreement():
    s3 = boto3.resource('s3')
    bucket = s3.create_bucket(Bucket='test-bucket')

    experiment = BratExperiment(
        experiment_name='initial_skills_tag',
        brat_s3_path='test-bucket/brat'
    )
    labels_by_unit = {
        'unit_1': {
            '0': [
                {'entity': 'Skill', 'start_index': 44, 'end_index': 70, 'labeled_string': 'substance abuse counseling', 'percent_tagged': 1.0, 'number_seen': 2}
            ],
            '1': [
                {'entity': 'Skill', 'start_index': 16, 'end_index': 33, 'labeled_string': 'python programming', 'percent_tagged': 1.0, 'number_seen': 2},
                {'entity': 'Skill', 'start_index': 39, 'end_index': 65, 'labeled_string': 'substance abuse counseling', 'percent_tagged': 0.5, 'number_seen': 2},
                {'entity': 'Skill', 'start_index': 49, 'end_index': 65, 'labeled_string': 'abuse counseling', 'percent_tagged': 0.5, 'number_seen': 2}
            ]
        }
    }
    experiment.labels_with_agreement_by_unit = labels_by_unit
    experiment.metadata['units'] = {
        'unit_1': [
            (0, 'ABC_91238'),
            (1, 'ABC_4823943'),
        ]
    }
    experiment.metadata['sample_name'] = 'test-sample'
    experiment.metadata.save()
    assert experiment.labels_with_agreement == [
        {'job_posting_id': 'ABC_91238', 'entity': 'Skill', 'start_index': 44, 'end_index': 70, 'labeled_string': 'substance abuse counseling', 'percent_tagged': 1.0, 'number_seen': 2, 'sample_name': 'test-sample'},
        {'job_posting_id': 'ABC_4823943', 'entity': 'Skill', 'start_index': 16, 'end_index': 33, 'labeled_string': 'python programming', 'percent_tagged': 1.0, 'number_seen': 2, 'sample_name': 'test-sample'},
        {'job_posting_id': 'ABC_4823943', 'entity': 'Skill', 'start_index': 39, 'end_index': 65, 'labeled_string': 'substance abuse counseling', 'percent_tagged': 0.5, 'number_seen': 2, 'sample_name': 'test-sample'},
        {'job_posting_id': 'ABC_4823943', 'entity': 'Skill', 'start_index': 49, 'end_index': 65, 'labeled_string': 'abuse counseling', 'percent_tagged': 0.5, 'number_seen': 2, 'sample_name': 'test-sample'}
    ]
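
The labels_with_agreement property asserted here presumably joins the per-unit labels to the job posting ids recorded in metadata['units'] and stamps each row with the sample name. A minimal sketch of that flattening, assuming that interpretation (flatten_labels is a hypothetical helper, not the library's API):

def flatten_labels(labels_by_unit, units, sample_name):
    # walk units in metadata order, look up the labels for each posting index,
    # and attach the original job posting id and the sample name
    for unit_name, postings in units.items():
        for posting_index, job_posting_id in postings:
            for label in labels_by_unit[unit_name][str(posting_index)]:
                yield dict(label, job_posting_id=job_posting_id, sample_name=sample_name)
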


def test_BratExperiment_average_observed_agreement():
    s3 = boto3.resource('s3')
    s3.create_bucket(Bucket='test-bucket')

    experiment = BratExperiment(
        experiment_name='initial_skills_tag',
        brat_s3_path='test-bucket/brat'
    )

    annotations_by_unit = {}
    annotations_by_unit['unit_1'] = {
        '0': {
            'user_1': [
                {'entity': 'Skill', 'start_index': 44, 'end_index': 70, 'labeled_string': 'substance abuse counseling'}
            ],
            'user_2': [
                {'entity': 'Skill', 'start_index': 44, 'end_index': 70, 'labeled_string': 'substance abuse counseling'}
            ]
        },
        '1': {
            'user_1': [
                {'entity': 'Skill', 'start_index': 16, 'end_index': 33, 'labeled_string': 'python programming'},
                {'entity': 'Skill', 'start_index': 39, 'end_index': 65, 'labeled_string': 'substance abuse counseling'}
            ],
            'user_2': [
                {'entity': 'Skill', 'start_index': 16, 'end_index': 33, 'labeled_string': 'python programming'},
                {'entity': 'Skill', 'start_index': 49, 'end_index': 65, 'labeled_string': 'abuse counseling'}
            ],
        }
    }
    experiment.annotations_by_unit = annotations_by_unit
    assert experiment.average_observed_agreement() == {'unit_1': {'0': 1, '1': 2/3}}
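
A hedged reading of the 2/3 figure for 'unit_1'/'1' (not the library's code): the two annotators produced three distinct spans, tagged by 2, 1 and 1 annotators respectively, and the observed agreement appears to be the mean of those fractions.

fractions_tagged = [2 / 2, 1 / 2, 1 / 2]  # per distinct span, out of 2 annotators
assert abs(sum(fractions_tagged) / len(fractions_tagged) - 2 / 3) < 1e-9
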


    def test_labels_with_agreement_by_unit(self):
        s3 = boto3.resource('s3')
        bucket = s3.create_bucket(Bucket='test-bucket')

        experiment = BratExperiment(
            experiment_name='initial_skills_tag',
            brat_s3_path='test-bucket/brat'
        )

        annotations_by_unit = {}
        annotations_by_unit['unit_1'] = {
            '0': {
                'user_1': [
                    {'entity': 'Skill', 'start_index': 44, 'end_index': 70, 'labeled_string': 'substance abuse counseling'}
                ],
                'user_2': [
                    {'entity': 'Skill', 'start_index': 44, 'end_index': 70, 'labeled_string': 'substance abuse counseling'}
                ]
            },
            '1': {
                'user_1': [
                    {'entity': 'Skill', 'start_index': 16, 'end_index': 33, 'labeled_string': 'python programming'},
                    {'entity': 'Skill', 'start_index': 39, 'end_index': 65, 'labeled_string': 'substance abuse counseling'}
                ],
                'user_2': [
                    {'entity': 'Skill', 'start_index': 16, 'end_index': 33, 'labeled_string': 'python programming'},
                    {'entity': 'Skill', 'start_index': 49, 'end_index': 65, 'labeled_string': 'abuse counseling'}
                ],
            }
        }
        experiment.annotations_by_unit = annotations_by_unit
        expected = {
            'unit_1': {
                '0': [
                    {'entity': 'Skill', 'start_index': 44, 'end_index': 70, 'labeled_string': 'substance abuse counseling', 'percent_tagged': 1.0, 'number_seen': 2}
                ],
                '1': [
                    {'entity': 'Skill', 'start_index': 16, 'end_index': 33, 'labeled_string': 'python programming', 'percent_tagged': 1.0, 'number_seen': 2},
                    {'entity': 'Skill', 'start_index': 39, 'end_index': 65, 'labeled_string': 'substance abuse counseling', 'percent_tagged': 0.5, 'number_seen': 2},
                    {'entity': 'Skill', 'start_index': 49, 'end_index': 65, 'labeled_string': 'abuse counseling', 'percent_tagged': 0.5, 'number_seen': 2}
                ]
            }
        }
        self.maxDiff = None
        self.assertDictEqual(experiment.labels_with_agreement_by_unit, expected)
Example #5
def test_BratExperiment_annotations_by_unit():
    s3 = boto3.resource('s3')
    s3.create_bucket(Bucket='test-bucket')

    experiment = BratExperiment(experiment_name='initial_skills_tag',
                                brat_s3_path='test-bucket/brat')

    # directory structure
    # 2 different units with 2 job postings each, 2 users each
    # calculate inter-rater reliability per job-posting

    job_postings = {
        'unit_1/0':
        'this is a job description which talks about substance abuse counseling',
        'unit_1/1':
        'job description python programming and substance abuse counseling',
        'unit_2/0':
        'hello we want a person who is skilled with offender orientation and intervention techniques',
        'unit_2/1':
        'development of positive social skills is important to us and also intervention techniques'
    }
    tags = {
        'user_1': {
            'unit_1/0': [
                'T1\tSkill 44 70\tsubstance abuse counseling',
            ],
            'unit_1/1': [
                'T1\tSkill 16 33\tpython programming',
                'T2\tSkill 39 65\tsubstance abuse counseling',
            ]
        },
        'user_2': {
            'unit_1/0': [
                'T1\tSkill 44 70\tsubstance abuse counseling',
            ],
            'unit_1/1': [
                'T1\tSkill 16 33\tpython programming',
                'T2\tSkill 49 65\tabuse counseling',
            ]
        },
        'user_3': {
            'unit_2/0': [
                'T1\tSkill 43 62\toffender orientation',
                'T2\tSkill 68 90\tintervention techniques',
            ],
            'unit_2/1': [
                'T1\tSkill 0 36\tdevelopment of positive social skills',
                'T2\tSkill 66 88\tintervention techniques',
            ]
        },
        'user_4': {
            'unit_2/0': [
                'T1\tSkill 43 62\toffender orientation',
                'T2\tSkill 68 90\tintervention techniques',
            ],
            'unit_2/1': []
        },
    }
    experiment.metadata['units'] = {}
    for key in job_postings.keys():
        unit_name, num = key.split('/')
        if unit_name not in experiment.metadata['units']:
            experiment.metadata['units'][unit_name] = []
        experiment.metadata['units'][unit_name].append((key, key))

    experiment.metadata['allocations'] = {}
    for user_name, annotations in tags.items():
        experiment.metadata['allocations'][user_name] = []
        for key, annotation_lines in annotations.items():
            unit_name, num = key.split('/')
            if unit_name not in experiment.metadata['allocations'][user_name]:
                experiment.metadata['allocations'][user_name].append(unit_name)

            base_path = '{}/{}'.format(
                experiment.user_allocations_path(user_name), key)
            with experiment.s3.open('{}.txt'.format(base_path), 'wb') as f:
                f.write(job_postings[key].encode('utf-8'))
            with experiment.s3.open('{}.ann'.format(base_path), 'wb') as f:
                f.write('\n'.join(annotation_lines).encode('utf-8'))
    experiment.metadata.save()

    assert experiment.annotations_by_unit['unit_1'] == {
        '0': {
            'user_1': [{
                'entity': 'Skill',
                'start_index': 44,
                'end_index': 70,
                'labeled_string': 'substance abuse counseling'
            }],
            'user_2': [{
                'entity': 'Skill',
                'start_index': 44,
                'end_index': 70,
                'labeled_string': 'substance abuse counseling'
            }]
        },
        '1': {
            'user_1': [{
                'entity': 'Skill',
                'start_index': 16,
                'end_index': 33,
                'labeled_string': 'python programming'
            }, {
                'entity': 'Skill',
                'start_index': 39,
                'end_index': 65,
                'labeled_string': 'substance abuse counseling'
            }],
            'user_2': [{
                'entity': 'Skill',
                'start_index': 16,
                'end_index': 33,
                'labeled_string': 'python programming'
            }, {
                'entity': 'Skill',
                'start_index': 49,
                'end_index': 65,
                'labeled_string': 'abuse counseling'
            }],
        }
    }
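
The fixture's .ann lines are brat standoff entity annotations ('T1<TAB>Skill 44 70<TAB>substance abuse counseling'). A minimal sketch of how one such line maps to the dict form asserted above (parse_ann_line is a hypothetical helper, not the library's parser):

def parse_ann_line(line):
    # brat standoff entity line: id <TAB> "type start end" <TAB> covered text
    _tag_id, type_and_span, labeled_string = line.split('\t')
    entity, start, end = type_and_span.split(' ')
    return {
        'entity': entity,
        'start_index': int(start),
        'end_index': int(end),
        'labeled_string': labeled_string,
    }

assert parse_ann_line('T1\tSkill 44 70\tsubstance abuse counseling') == {
    'entity': 'Skill',
    'start_index': 44,
    'end_index': 70,
    'labeled_string': 'substance abuse counseling',
}
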
Example #6
    def test_sequence_tagged_annotations(self):
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='test-bucket')

        experiment = BratExperiment(experiment_name='initial_skills_tag',
                                    brat_s3_path='test-bucket/brat')

        tags = {
            'user_1': {
                'unit_1/0': [
                    'O\t0\t4\tthis',
                    'B-SKILL\t5\t7\tis',
                    'O\t8\t14\tpython',
                ],
                'unit_1/1': [
                    'O\t0\t4\tthis',
                    'O\t5\t7\tis',
                    'B-SKILL\t8\t14\tpython',
                ]
            },
            'user_2': {
                'unit_1/0': [
                    'O\t0\t4\tthis',
                    'O\t5\t7\tis',
                    'B-SKILL\t8\t14\tpython',
                ],
                'unit_1/1': [
                    'O\t0\t4\tthis',
                    'O\t5\t7\tis',
                    'B-SKILL\t8\t14\tpython',
                ]
            },
        }
        experiment.metadata['units'] = {
            'unit_1': [
                (0, 'ABC_91238'),
                (1, 'ABC_4823943'),
            ]
        }
        experiment.metadata['allocations'] = {}
        for user_name, annotations in tags.items():
            experiment.metadata['allocations'][user_name] = []
            for key, token_lines in annotations.items():
                unit_name, num = key.split('/')
                if unit_name not in experiment.metadata['allocations'][user_name]:
                    experiment.metadata['allocations'][user_name].append(unit_name)

                base_path = '{}/{}'.format(
                    experiment.user_allocations_path(user_name), key)
                with experiment.s3.open('{}.txt'.format(base_path), 'wb') as f:
                    f.write(
                        'does not matter we are not reading'.encode('utf-8'))
                with experiment.s3.open('{}.ann'.format(base_path), 'wb') as f:
                    f.write(
                        'does not matter we are not reading'.encode('utf-8'))
                with experiment.s3.open('{}.conll'.format(base_path),
                                        'wb') as f:
                    f.write('\n'.join(token_lines).encode('utf-8'))
        experiment.metadata.save()

        self.maxDiff = None
        expected_tokens = {
            ('ABC_91238', md5('user_1')): [('O', 'this'), ('B-SKILL', 'is'),
                                           ('O', 'python')],
            ('ABC_91238', md5('user_2')): [('O', 'this'), ('O', 'is'),
                                           ('B-SKILL', 'python')],
            ('ABC_4823943', md5('user_1')): [('O', 'this'), ('O', 'is'),
                                             ('B-SKILL', 'python')],
            ('ABC_4823943', md5('user_2')): [('O', 'this'), ('O', 'is'),
                                             ('B-SKILL', 'python')],
        }
        self.assertDictEqual(experiment.sequence_tagged_annotations,
                             expected_tokens)
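
The .conll fixture lines appear to be tab-separated tag, start, end, token columns, and the expected result keys postings by job posting id paired with an md5 of the username. A minimal sketch of the per-line mapping, under that reading (conll_line_to_pair is a hypothetical helper, not the library's parser):

def conll_line_to_pair(line):
    # fixture column order: tag <TAB> start <TAB> end <TAB> token
    tag, _start, _end, token = line.split('\t')
    return (tag, token)

assert conll_line_to_pair('B-SKILL\t8\t14\tpython') == ('B-SKILL', 'python')
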
Example #7
def test_BratExperiment_start():
    # create a bucket that will contain both the source samples and BRAT config
    s3 = boto3.resource('s3')
    bucket = s3.create_bucket(Bucket='test-bucket')

    # create a sample.
    # sample format is one file, one job posting per line, in common schema JSON format
    bucket.put_object(
        Body='\n'.join(
            json.dumps({'id': i, 'description': str(i)})
            for i in range(100, 200)
        ),
        Key='samples/300_weighted'
    )

    experiment = BratExperiment(experiment_name='initial_skills_tag',
                                brat_s3_path='test-bucket/brat')
    experiment.start(sample=Sample(base_path='s3://test-bucket/samples',
                                   sample_name='300_weighted'),
                     minimum_annotations_per_posting=2,
                     max_postings_per_allocation=20,
                     entities_with_shortcuts=(('c', 'Competency'), ))

    # find metadata about what it created
    s3 = s3fs.S3FileSystem()

    # first assert that some shallow metadata was passed through
    assert experiment.metadata['sample_base_path'] == 's3://test-bucket/samples'
    assert experiment.metadata['sample_name'] == '300_weighted'
    assert experiment.metadata['entities_with_shortcuts'] == (('c', 'Competency'),)
    assert experiment.metadata['minimum_annotations_per_posting'] == 2
    assert experiment.metadata['max_postings_per_allocation'] == 20

    # next look at the posting texts themselves.
    # we expect all of them to be present, split across a number of units
    units = experiment.metadata['units']
    assert len(units) == 5  # 100/20
    retrieved_descriptions = []
    for unit_name, documents in units.items():
        for posting_key, original_job_id in documents:
            # we should not expose the original posting ids;
            # otherwise we don't care what the keys are, only that they exist where we expect them to
            assert posting_key != original_job_id
            with s3.open('{data_path}/.{unit_name}/{posting_key}.txt'.format(
                    data_path=experiment.data_path,
                    unit_name=unit_name,
                    posting_key=posting_key),
                         mode='rb') as f:
                posting = f.read().decode('utf-8')
                retrieved_descriptions.append(posting.strip())
            # make sure that the blank annotation file is there too
            with s3.open('{data_path}/.{unit_name}/{posting_key}.ann'.format(
                    data_path=experiment.data_path,
                    unit_name=unit_name,
                    posting_key=posting_key),
                         mode='rb') as f:
                assert len(f.read().decode('utf-8')) == 0
    # our fake descriptions were just the string values for the range numbers
    # so that's what should get written
    assert sorted(retrieved_descriptions) == sorted(
        [str(i) for i in range(100, 200)])

    def assert_conf_contains(conf_name, expected):
        with s3.open(
                '{path}/{conf_name}'.format(path=experiment.brat_config_path,
                                            conf_name=conf_name), 'rb') as f:
            assert expected in f.read().decode('utf-8')

    assert_conf_contains('visual.conf', '[labels]\nCompetency\n')
    assert_conf_contains('annotation.conf', '[entities]\nCompetency\n')
    assert_conf_contains('kb_shortcuts.conf', 'c Competency\n')
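
The three config assertions correspond to brat's standard configuration files. As a hedged illustration (not the library's code), the kb_shortcuts.conf fragment asserted above can be rebuilt from the entities_with_shortcuts argument passed to start():

entities_with_shortcuts = (('c', 'Competency'),)
kb_shortcuts = ''.join(
    '{} {}\n'.format(shortcut, entity)
    for shortcut, entity in entities_with_shortcuts
)
assert kb_shortcuts == 'c Competency\n'
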
Example #8
def test_BratExperiment_add_allocation():
    # given a user name,
    # find the next allocation that the user has not annotated yet,
    # create a directory with the user's name,
    # and record in metadata that the user has been allocated this unit

    # setup: create a bucket for the brat config
    s3 = boto3.resource('s3')
    bucket = s3.create_bucket(Bucket='test-bucket')
    bucket.put_object(
        Body='\n'.join(
            json.dumps({'id': i, 'description': str(i)})
            for i in range(100, 200)
        ),
        Key='samples/300_weighted'
    )

    experiment = BratExperiment(experiment_name='initial_skills_tag',
                                brat_s3_path='test-bucket/brat')
    experiment.start(sample=Sample(base_path='s3://test-bucket/samples',
                                   sample_name='300_weighted'),
                     minimum_annotations_per_posting=2,
                     max_postings_per_allocation=20,
                     entities_with_shortcuts=(('c', 'Competency'), ))
    # initialize the experiment in this bucket
    experiment = BratExperiment(experiment_name='initial_skills_tag',
                                brat_s3_path='test-bucket/brat')

    username = '******'
    # should not be able to allocate without creating a user
    with pytest.raises(ValueError):
        experiment.add_allocation(username)

    # set up a user to allocate to
    experiment.user_pw_store[username] = 'password'
    experiment.user_pw_store.save()
    allocated_directory = experiment.add_allocation(username)

    allocations = experiment.metadata['allocations'][username]
    assert len(allocations) == 1

    s3 = s3fs.S3FileSystem()
    filenames = s3.ls(allocated_directory)
    # there should be two files for each job posting: the .txt and the .ann
    assert len(filenames) == len(
        experiment.metadata['units'][allocations[0]]) * 2

    # simulate continued allocation with more users
    user_two = 'user_two'
    user_three = 'user_three'
    experiment.add_user(user_two, 'pass')
    experiment.add_user(user_three, 'pass')
    for i in range(0, 4):
        experiment.add_allocation(user_two)
        experiment.add_allocation(user_three)
    # at this point, trying to re-allocate to either user two or three
    # should fail as they have now tagged everything
    with pytest.raises(ValueError):
        experiment.add_allocation(user_two)

    # user one should still work for now
    for i in range(0, 4):
        new_directory = experiment.add_allocation(username)
        assert new_directory != allocated_directory

    # once they have seen the whole thing, no more!
    with pytest.raises(ValueError):
        experiment.add_allocation(username)
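
The looping pattern above follows from the unit arithmetic established in the previous example, assuming add_user performs a first allocation itself (as Example #1 suggests): 100 sampled postings split into allocations of at most 20 gives 5 units, so one initial allocation plus four more exhausts everything and the next attempt raises ValueError.

total_units = 100 // 20      # matches the len(units) == 5 assertion in Example #7
assert 1 + 4 == total_units  # first allocation plus the four added in the loop
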