def sample_factory(job_postings, name='asample', storage=None):
    if not storage:
        storage = InMemoryStore()
    storage.write(
        b'\n'.join(
            json.dumps(job_posting).encode('utf-8')
            for job_posting in job_postings
        ),
        name
    )
    return Sample(storage, name)
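# A minimal usage sketch (an assumption, not from the source): build an
# in-memory sample from a couple of fake postings and read the ids back out.
# The posting fields mirror the fakes used in the tests below.
def _example_sample_factory_usage():
    postings = [
        {'id': 1, 'description': 'python developer'},
        {'id': 2, 'description': 'data analyst'},
    ]
    sample = sample_factory(postings, name='tiny_sample')
    # a Sample iterates one JSON-encoded posting per line
    return [json.loads(line)['id'] for line in sample]  # -> [1, 2]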
def sample_lookup(self):
    if 'sample_base_path' not in self.metadata or 'sample_name' not in self.metadata:
        raise ValueError(
            'Sample information needs to be available to look up sample. '
            'Have you run .start on this BratExperiment yet?'
        )
    sample = Sample(
        store_from_path(self.metadata['sample_base_path']),
        self.metadata['sample_name']
    )
    lookup = {}
    for line in sample:
        obj = json.loads(line)
        lookup[obj['id']] = obj
    return lookup
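# How the lookup is presumably consumed (an assumption, based on the unit
# metadata layout asserted in the tests below): each unit stores
# (posting_key, original_job_id) pairs, and the lookup recovers the full
# posting behind each anonymized BRAT key.
def _example_sample_lookup_usage(experiment):
    lookup = experiment.sample_lookup()
    originals = {}
    for unit_name, documents in experiment.metadata['units'].items():
        for posting_key, original_job_id in documents:
            originals[posting_key] = lookup[original_job_id]
    return originals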
import json

import boto3
import pytest
import s3fs


def test_BratExperiment_start():
    # create a bucket that will contain both the source samples and BRAT config
    s3 = boto3.resource('s3')
    bucket = s3.create_bucket(Bucket='test-bucket')

    # create a sample.
    # sample format is one file, one job posting per line, in common schema JSON format
    bucket.put_object(
        Body='\n'.join(
            json.dumps({'id': i, 'description': str(i)})
            for i in range(100, 200)
        ),
        Key='samples/300_weighted'
    )

    experiment = BratExperiment(
        experiment_name='initial_skills_tag',
        brat_s3_path='test-bucket/brat'
    )
    experiment.start(
        sample=Sample(base_path='s3://test-bucket/samples', sample_name='300_weighted'),
        minimum_annotations_per_posting=2,
        max_postings_per_allocation=20,
        entities_with_shortcuts=(('c', 'Competency'),)
    )

    # find metadata about what it created
    s3 = s3fs.S3FileSystem()

    # first assert that some shallow metadata was passed through
    assert experiment.metadata['sample_base_path'] == 's3://test-bucket/samples'
    assert experiment.metadata['sample_name'] == '300_weighted'
    assert experiment.metadata['entities_with_shortcuts'] == (('c', 'Competency'),)
    assert experiment.metadata['minimum_annotations_per_posting'] == 2
    assert experiment.metadata['max_postings_per_allocation'] == 20

    # next look at the posting texts themselves.
    # we expect all of them to be present, split across a number of units
    units = experiment.metadata['units']
    assert len(units) == 5  # 100/20
    retrieved_descriptions = []
    for unit_name, documents in units.items():
        for posting_key, original_job_id in documents:
            # we should not expose the original posting ids;
            # beyond that we don't care what the keys are, just that the
            # files exist where we expect them to
            assert posting_key != original_job_id
            with s3.open('{data_path}/.{unit_name}/{posting_key}.txt'.format(
                    data_path=experiment.data_path,
                    unit_name=unit_name,
                    posting_key=posting_key), mode='rb') as f:
                posting = f.read().decode('utf-8')
                retrieved_descriptions.append(posting.strip())
            # make sure that the blank annotation file is there too
            with s3.open('{data_path}/.{unit_name}/{posting_key}.ann'.format(
                    data_path=experiment.data_path,
                    unit_name=unit_name,
                    posting_key=posting_key), mode='rb') as f:
                assert len(f.read().decode('utf-8')) == 0

    # our fake descriptions were just the string values for the range numbers,
    # so that's what should get written
    assert sorted(retrieved_descriptions) == sorted(str(i) for i in range(100, 200))

    def assert_conf_contains(conf_name, expected):
        with s3.open('{path}/{conf_name}'.format(
                path=experiment.brat_config_path,
                conf_name=conf_name), 'rb') as f:
            assert expected in f.read().decode('utf-8')

    assert_conf_contains('visual.conf', '[labels]\nCompetency\n')
    assert_conf_contains('annotation.conf', '[entities]\nCompetency\n')
    assert_conf_contains('kb_shortcuts.conf', 'c Competency\n')
def test_BratExperiment_add_allocation():
    # given a user name,
    # find the next allocation to use that the user has not annotated yet,
    # create a directory with the user's name,
    # and record in metadata the fact that the user has been allocated this

    # setup: create a bucket for the brat config
    s3 = boto3.resource('s3')
    bucket = s3.create_bucket(Bucket='test-bucket')
    bucket.put_object(
        Body='\n'.join(
            json.dumps({'id': i, 'description': str(i)})
            for i in range(100, 200)
        ),
        Key='samples/300_weighted'
    )
    experiment = BratExperiment(
        experiment_name='initial_skills_tag',
        brat_s3_path='test-bucket/brat'
    )
    experiment.start(
        sample=Sample(base_path='s3://test-bucket/samples', sample_name='300_weighted'),
        minimum_annotations_per_posting=2,
        max_postings_per_allocation=20,
        entities_with_shortcuts=(('c', 'Competency'),)
    )

    # initialize the experiment in this bucket
    experiment = BratExperiment(
        experiment_name='initial_skills_tag',
        brat_s3_path='test-bucket/brat'
    )

    username = 'user_one'
    # should not be able to allocate without creating a user
    with pytest.raises(ValueError):
        experiment.add_allocation(username)

    # set up a user to allocate to
    experiment.user_pw_store[username] = 'password'
    experiment.user_pw_store.save()
    allocated_directory = experiment.add_allocation(username)

    allocations = experiment.metadata['allocations'][username]
    assert len(allocations) == 1

    s3 = s3fs.S3FileSystem()
    filenames = s3.ls(allocated_directory)
    # there should be two files for each job posting: the .txt and the .ann
    assert len(filenames) == len(experiment.metadata['units'][allocations[0]]) * 2

    # simulate continued allocation with more users
    user_two = 'user_two'
    user_three = 'user_three'
    experiment.add_user(user_two, 'pass')
    experiment.add_user(user_three, 'pass')
    for i in range(0, 4):
        experiment.add_allocation(user_two)
        experiment.add_allocation(user_three)

    # at this point, trying to re-allocate to either user two or three
    # should fail as they have now tagged everything
    with pytest.raises(ValueError):
        experiment.add_allocation(user_two)

    # user one should still work for now
    for i in range(0, 4):
        new_directory = experiment.add_allocation(username)
        assert new_directory != allocated_directory

    # once they have seen the whole thing, no more!
    with pytest.raises(ValueError):
        experiment.add_allocation(username)
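# Both tests above create buckets with boto3 and read them back with s3fs,
# so presumably they run against an in-process S3 mock such as moto rather
# than real AWS (an assumption; the source does not show its test fixtures).
# A minimal sketch of wiring that up with moto's decorator:
from moto import mock_s3

@mock_s3
def test_BratExperiment_start_mocked():
    # the decorator gives the test a fresh, empty virtual S3 to create
    # 'test-bucket' in, and tears it down afterwards
    test_BratExperiment_start()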
import logging
from itertools import product

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    n_jobs = 3
    sample_names = ['samples_300_v1', 'samples_10k_v1']
    skill_extractor_classes = [
        FuzzyMatchSkillExtractor,
        ExactMatchSkillExtractor,
    ]
    sample_path = 's3://{}/sampled_jobpostings'.format(PRIVATE_BUCKET)
    candidates_path = '{}/skill_candidates'.format(PRIVATE_BUCKET)
    full_onet = Onet()
    ontologies = [
        full_onet.filter_by(
            lambda edge: 'Knowledge' in edge.competency.categories,
            competency_name='onet_knowledge',
            competency_description='ONET Knowledge'),
        full_onet.filter_by(
            lambda edge: 'Ability' in edge.competency.categories,
            competency_name='onet_ability',
            competency_description='ONET Ability'),
        full_onet.filter_by(
            lambda edge: 'Skill' in edge.competency.categories,
            competency_name='onet_skill',
            competency_description='ONET Skill'),
    ]
    # run every combination: 2 samples x 2 extractors x 3 ontologies = 12 runs
    for sample_name, skill_extractor_class, ontology in product(
            sample_names, skill_extractor_classes, ontologies):
        sample = Sample(sample_path, sample_name)
        skill_extractor = skill_extractor_class(ontology.competency_framework)
        generate_skill_candidates_oneprocess(candidates_path, sample, skill_extractor)