Example #1
def sample_factory(job_postings, name='asample', storage=None):
    # serialize one JSON posting per line and persist the result under
    # the given name, defaulting to an in-memory store
    if not storage:
        storage = InMemoryStore()
    storage.write(
        b'\n'.join(
            json.dumps(job_posting).encode('utf-8')
            for job_posting in job_postings
        ),
        name
    )
    return Sample(storage, name)
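A minimal usage sketch for the factory above; it assumes, as Example #2 suggests, that a Sample is iterable over its serialized lines, and the postings here are made up for illustration:

# Illustrative only: build an in-memory Sample from two fake postings
# and read it back (json imported as in the surrounding examples)
postings = [
    {'id': 1, 'description': 'python developer'},
    {'id': 2, 'description': 'data engineer'},
]
sample = sample_factory(postings, name='demo')
for line in sample:
    print(json.loads(line)['id'])  # prints 1, then 2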
Example #2
def sample_lookup(self):
    # index the experiment's sample postings by id for quick lookup
    if 'sample_base_path' not in self.metadata or 'sample_name' not in self.metadata:
        raise ValueError(
            'Sample information needs to be available to look up sample. '
            'Have you run .start on this BratExperiment yet?'
        )
    sample = Sample(store_from_path(self.metadata['sample_base_path']),
                    self.metadata['sample_name'])
    lookup = {}
    for line in sample:
        obj = json.loads(line)
        lookup[obj['id']] = obj
    return lookup
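A hedged usage sketch for sample_lookup, assuming it is called on a BratExperiment whose .start() has already populated the metadata; the posting id below is hypothetical:

# Illustrative only: resolve a posting id back to its full
# common-schema JSON object
lookup = experiment.sample_lookup()
posting = lookup[120]  # hypothetical id present in the sample
print(posting['description'])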
Example #3
def test_BratExperiment_start():
    # create a bucket that will contain both the source samples and BRAT config
    s3 = boto3.resource('s3')
    bucket = s3.create_bucket(Bucket='test-bucket')

    # create a sample.
    # sample format is one file, one job posting per line, in common schema JSON format
    bucket.put_object(
        Body='\n'.join(
            json.dumps({'id': i, 'description': str(i)})
            for i in range(100, 200)
        ),
        Key='samples/300_weighted')

    experiment = BratExperiment(experiment_name='initial_skills_tag',
                                brat_s3_path='test-bucket/brat')
    experiment.start(sample=Sample(base_path='s3://test-bucket/samples',
                                   sample_name='300_weighted'),
                     minimum_annotations_per_posting=2,
                     max_postings_per_allocation=20,
                     entities_with_shortcuts=(('c', 'Competency'), ))

    # find metadata about what it created
    s3 = s3fs.S3FileSystem()

    # first assert that some shallow metadata was passed through
    assert experiment.metadata['sample_base_path'] == 's3://test-bucket/samples'
    assert experiment.metadata['sample_name'] == '300_weighted'
    assert experiment.metadata['entities_with_shortcuts'] == (('c', 'Competency'),)
    assert experiment.metadata['minimum_annotations_per_posting'] == 2
    assert experiment.metadata['max_postings_per_allocation'] == 20

    # next look at the posting texts themselves.
    # we expect all of them to be present, but split across a number of units
    units = experiment.metadata['units']
    assert len(units) == 5  # 100 postings / 20 per allocation unit
    retrieved_descriptions = []
    for unit_name, documents in units.items():
        for posting_key, original_job_id in documents:
            # the original posting ids should not be exposed; beyond that we
            # don't care what the keys are, only that the files exist where
            # we expect them to
            assert posting_key != original_job_id
            with s3.open('{data_path}/.{unit_name}/{posting_key}.txt'.format(
                    data_path=experiment.data_path,
                    unit_name=unit_name,
                    posting_key=posting_key),
                         mode='rb') as f:
                posting = f.read().decode('utf-8')
                retrieved_descriptions.append(posting.strip())
            # make sure that the blank annotation file is there too
            with s3.open('{data_path}/.{unit_name}/{posting_key}.ann'.format(
                    data_path=experiment.data_path,
                    unit_name=unit_name,
                    posting_key=posting_key),
                         mode='rb') as f:
                assert len(f.read().decode('utf-8')) == 0
    # our fake descriptions were just the string values for the range numbers
    # so that's what should get written
    assert sorted(retrieved_descriptions) == sorted(
        [str(i) for i in range(100, 200)])

    def assert_conf_contains(conf_name, expected):
        with s3.open(
                '{path}/{conf_name}'.format(path=experiment.brat_config_path,
                                            conf_name=conf_name), 'rb') as f:
            assert expected in f.read().decode('utf-8')

    assert_conf_contains('visual.conf', '[labels]\nCompetency\n')
    assert_conf_contains('annotation.conf', '[entities]\nCompetency\n')
    assert_conf_contains('kb_shortcuts.conf', 'c Competency\n')
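Note that this test calls create_bucket through boto3 directly, which only makes sense against a mocked S3. A sketch of how such a test is typically wrapped, assuming the moto library (the mock setup is not shown in the source):

# Assumption: the suite fakes S3 with moto (or similar), so create_bucket
# and put_object never touch real AWS
from moto import mock_s3

@mock_s3
def test_BratExperiment_start():
    s3 = boto3.resource('s3')
    s3.create_bucket(Bucket='test-bucket')
    ...  # body as above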
Example #4
def test_BratExperiment_add_allocation():
    # given a user name
    # find the next allocation to use that the user has not annotated yet
    # create a directory with the users name
    # record in metadata the fact that the user has been allocated this

    # setup: create a bucket for the brat config
    s3 = boto3.resource('s3')
    bucket = s3.create_bucket(Bucket='test-bucket')
    bucket.put_object(
        Body='\n'.join(
            json.dumps({'id': i, 'description': str(i)})
            for i in range(100, 200)
        ),
        Key='samples/300_weighted')

    experiment = BratExperiment(experiment_name='initial_skills_tag',
                                brat_s3_path='test-bucket/brat')
    experiment.start(sample=Sample(base_path='s3://test-bucket/samples',
                                   sample_name='300_weighted'),
                     minimum_annotations_per_posting=2,
                     max_postings_per_allocation=20,
                     entities_with_shortcuts=(('c', 'Competency'), ))
    # re-initialize the experiment from the same bucket; allocation should
    # work off the state that .start() persisted
    experiment = BratExperiment(experiment_name='initial_skills_tag',
                                brat_s3_path='test-bucket/brat')

    username = 'user_one'
    # should not be able to allocate without creating a user
    with pytest.raises(ValueError):
        experiment.add_allocation(username)

    # set up a user to allocate to
    experiment.user_pw_store[username] = 'password'
    experiment.user_pw_store.save()
    allocated_directory = experiment.add_allocation(username)

    allocations = experiment.metadata['allocations'][username]
    assert len(allocations) == 1

    s3 = s3fs.S3FileSystem()
    filenames = s3.ls(allocated_directory)
    # there should be two files for each job posting: the .txt and the .ann
    assert len(filenames) == len(
        experiment.metadata['units'][allocations[0]]) * 2

    # simulate continued allocation with more users
    user_two = 'user_two'
    user_three = 'user_three'
    experiment.add_user(user_two, 'pass')
    experiment.add_user(user_three, 'pass')
    for i in range(0, 4):
        experiment.add_allocation(user_two)
        experiment.add_allocation(user_three)
    # at this point, trying to re-allocate to either user two or three
    # should fail as they have now tagged everything
    with pytest.raises(ValueError):
        experiment.add_allocation(user_two)

    # user one should still work for now
    for i in range(0, 4):
        new_directory = experiment.add_allocation(username)
        assert new_directory != allocated_directory

    # once they have seen the whole thing, no more!
    with pytest.raises(ValueError):
        experiment.add_allocation(username)
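The allocation lifecycle implied by this test, condensed into a sketch (the user name is hypothetical):

# Illustrative only: users must exist in the password store before they
# can be allocated, and add_allocation raises ValueError once a user has
# annotated every unit
experiment.add_user('annotator_a', 'secret')
directory = experiment.add_allocation('annotator_a')  # S3 dir of .txt/.ann pairs
# ...allocate repeatedly until every unit has been seen...
# experiment.add_allocation('annotator_a')  # ValueError once exhausted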
Example #5
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    n_jobs = 3  # note: not used by the single-process driver below
    sample_names = ['samples_300_v1', 'samples_10k_v1']
    skill_extractor_classes = [
        FuzzyMatchSkillExtractor,
        ExactMatchSkillExtractor,
    ]
    sample_path = 's3://{}/sampled_jobpostings'.format(PRIVATE_BUCKET)
    candidates_path = '{}/skill_candidates'.format(PRIVATE_BUCKET)
    full_onet = Onet()
    ontologies = [
        full_onet.filter_by(
            lambda edge: 'Knowledge' in edge.competency.categories,
            competency_name='onet_knowledge',
            competency_description='ONET Knowledge'),
        full_onet.filter_by(
            lambda edge: 'Ability' in edge.competency.categories,
            competency_name='onet_ability',
            competency_description='ONET Ability'),
        full_onet.filter_by(
            lambda edge: 'Skill' in edge.competency.categories,
            competency_name='onet_skill',
            competency_description='ONET Skill'),
    ]
    for sample_name, skill_extractor_class, ontology in product(
            sample_names, skill_extractor_classes, ontologies):
        sample = Sample(sample_path, sample_name)
        skill_extractor = skill_extractor_class(ontology.competency_framework)
        generate_skill_candidates_oneprocess(
            candidates_path, sample, skill_extractor)
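A quick sanity check on the size of the grid that the driver walks (illustrative only):

# product() above comes from itertools; the loop enumerates every
# (sample, extractor, ontology) combination
from itertools import product
runs = list(product(sample_names, skill_extractor_classes, ontologies))
assert len(runs) == 2 * 2 * 3  # 12 candidate-generation passes in total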