def setUp(self):
    client = boto3.resource('s3')
    bucket = client.create_bucket(Bucket='test-bucket')
    storage = S3Store('s3://test-bucket/computed_properties')
    description = 'This is my description'

    class MockClassifier(object):
        def predict_soc(self, document):
            assert document.strip() == description.lower()
            return '11-1234.00'

        @property
        def name(self):
            return "MockClassifier"

        @property
        def description(self):
            return "fake algorithm"

    self.computed_property = SOCClassifyProperty(
        storage=storage,
        classifier_obj=MockClassifier(),
    )
    self.job_postings = [
        utils.job_posting_factory(
            datePosted=self.datestring,
            description=description,
            skills='',
            qualifications='',
            experienceRequirements=''
        )
    ]
    self.computed_property.compute_on_collection(self.job_postings)

def setUp(self):
    s3_conn = boto.connect_s3()
    client = boto3.resource('s3')
    bucket = client.create_bucket(Bucket='test-bucket')
    description = 'This is my description'

    class MockClassifier(object):
        def predict_soc(self, document, mode):
            assert document.strip() == description.lower()
            assert mode == 'top'
            return '11-1234.00'

    self.computed_property = ClassifyTop(
        s3_conn=s3_conn,
        classifier_obj=MockClassifier(),
        path='test-bucket/computed_properties',
    )
    self.job_postings = [
        utils.job_posting_factory(
            datePosted=self.datestring,
            description=description,
            skills='',
            qualifications='',
            experienceRequirements=''
        )
    ]
    self.computed_property.compute_on_collection(self.job_postings)

def standard_sample():
    job_postings = [
        job_posting_factory(
            description='this is a job that requires communication skills')
        for _ in range(0, 5)
    ]
    sample = sample_factory(job_postings, name='mysample')
    return sample

def setUp(self):
    self.client = boto3.resource('s3')
    self.client.create_bucket(Bucket='test-bucket')
    self.computed_property = PostingIdPresent(
        path='test-bucket/computed_properties')
    self.job_postings = [
        utils.job_posting_factory(datePosted=self.datestring)
    ]
    self.computed_property.compute_on_collection(self.job_postings)

def setUp(self):
    self.client = boto3.resource('s3')
    self.client.create_bucket(Bucket='test-bucket')
    self.computed_property = TitleCleanPhaseOne(
        path='test-bucket/computed_properties')
    self.job_postings = [
        utils.job_posting_factory(
            datePosted=self.datestring,
            title='Software Engineer - Tulsa')
    ]
    self.computed_property.compute_on_collection(self.job_postings)

def setUp(self):
    self.client = boto3.resource('s3')
    self.client.create_bucket(Bucket='test-bucket')
    self.storage = S3Store('s3://test-bucket/computed_properties')
    self.computed_property = YearlyPay(self.storage)
    self.job_postings = [
        utils.job_posting_factory(
            id=5,
            datePosted=self.datestring,
            baseSalary={
                'salaryFrequency': 'yearly',
                'minValue': 5,
                'maxValue': ''
            }),
        utils.job_posting_factory(
            id=6,
            datePosted=self.datestring,
            baseSalary={
                'salaryFrequency': 'yearly',
                'minValue': '6.25',
                'maxValue': '9.25'
            })
    ]
    self.computed_property.compute_on_collection(self.job_postings)

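# Illustrative sketch, not part of the original tests: the fixture above mixes an
# integer minValue, a blank maxValue, and string-encoded decimals on purpose, so
# whatever computes yearly pay has to coerce these bounds defensively. The helper
# below is hypothetical and only demonstrates the kind of coercion being exercised.
def _to_float_or_none(value):
    """Coerce a salary bound to float, treating blank or invalid values as missing."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return None

assert _to_float_or_none(5) == 5.0
assert _to_float_or_none('6.25') == 6.25
assert _to_float_or_none('') is None
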
def setUp(self):
    s3_conn = boto.connect_s3()
    client = boto3.resource('s3')
    bucket = client.create_bucket(Bucket='test-bucket')
    skills_path = 's3://test-bucket/skills_master_table.tsv'
    utils.create_skills_file(skills_path)
    self.computed_property = ExactMatchSkillCounts(
        skill_lookup_path=skills_path,
        path='test-bucket/computed_properties',
    )
    self.job_postings = [
        utils.job_posting_factory(
            datePosted=self.datestring,
            description='reading comprehension')
    ]
    self.computed_property.compute_on_collection(self.job_postings)

def setUp(self):
    s3_conn = boto.connect_s3()
    client = boto3.resource('s3')
    bucket = client.create_bucket(Bucket='test-bucket')
    storage = S3Store('s3://test-bucket/computed_properties')
    skill_extractor = ExactMatchSkillExtractor(utils.sample_framework())
    self.computed_property = SkillCounts(
        skill_extractor=skill_extractor,
        storage=storage,
    )
    self.job_postings = [
        utils.job_posting_factory(
            datePosted=self.datestring,
            description='reading comprehension')
    ]
    self.computed_property.compute_on_collection(self.job_postings)

def setUp(self):
    self.client = boto3.resource('s3')
    self.client.create_bucket(Bucket='test-bucket')
    self.computed_property = TitleCleanPhaseTwo(
        path='test-bucket/computed_properties')
    self.job_postings = [
        utils.job_posting_factory(
            datePosted=self.datestring,
            title='Software Engineer Tulsa')
    ]
    with patch(
        'skills_ml.algorithms.jobtitle_cleaner.clean.negative_positive_dict',
        return_value={
            'places': ['tulsa'],
            'states': [],
            'onetjobs': ['software engineer']
        }
    ):
        self.computed_property.compute_on_collection(self.job_postings)

def setUp(self):
    client = boto3.resource('s3')
    bucket = client.create_bucket(Bucket='test-bucket')
    storage = S3Store('s3://test-bucket/computed_properties')
    cache_storage = S3Store('s3://test-bucket')

    class SampleJobGeoQuerier(JobGeographyQuerier):
        name = 'blah'
        output_columns = (('city', 'the city'),)

        def _query(self, job_posting):
            return ['Fargo']

    self.computed_property = Geography(
        geo_querier=SampleJobGeoQuerier(),
        storage=storage,
    )
    self.job_postings = [
        utils.job_posting_factory(datePosted=self.datestring)
    ]
    self.computed_property.compute_on_collection(self.job_postings)

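# Illustrative sketch, not part of the original tests: the querier interface used
# above (a `name`, a tuple of `output_columns`, and a `_query` that returns one value
# per column) presumably supports reporting more than one column. `CityStateGeoQuerier`
# below is a hypothetical example, assuming the same JobGeographyQuerier base class.
class CityStateGeoQuerier(JobGeographyQuerier):
    name = 'city_state'
    output_columns = (
        ('city', 'the city of the job posting'),
        ('state', 'the state of the job posting'),
    )

    def _query(self, job_posting):
        # a real implementation would inspect job_posting['jobLocation']
        return ['Fargo', 'ND']
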
def setUp(self):
    client = boto3.resource('s3')
    bucket = client.create_bucket(Bucket='test-bucket')
    sample_cbsa_cache = {
        'AMENIA, North Dakota': ['22020', 'Fargo, ND-MN Metro Area']
    }
    bucket.put_object(Key='cbsas.json', Body=json.dumps(sample_cbsa_cache))
    self.computed_property = CBSAandStateFromGeocode(
        cache_s3_path='test-bucket/cbsas',
        path='test-bucket/computed_properties',
    )
    self.job_postings = [
        utils.job_posting_factory(
            datePosted=self.datestring,
            jobLocation={
                "@type": "Place",
                "address": {
                    "addressLocality": "AMENIA",
                    "addressRegion": "ND",
                    "@type": "PostalAddress"
                }
            })
    ]
    self.computed_property.compute_on_collection(self.job_postings)

def test_BratExperiment_start():
    # create a bucket that will contain both the source samples and BRAT config
    s3 = boto3.resource('s3')
    bucket = s3.create_bucket(Bucket='test-bucket')
    storage = S3Store('s3://test-bucket/samples')

    # create a sample.
    # sample format is one file, one job posting per line, in common schema JSON format
    job_postings = [
        job_posting_factory(
            id=i,
            description=str(i),
            experienceRequirements='',
            qualifications='',
            skills=''
        )
        for i in range(100, 200)
    ]
    sample = sample_factory(job_postings, name='300_weighted', storage=storage)

    experiment = BratExperiment(
        experiment_name='initial_skills_tag',
        brat_s3_path='test-bucket/brat'
    )
    experiment.start(
        sample=sample,
        minimum_annotations_per_posting=2,
        max_postings_per_allocation=20,
        entities_with_shortcuts=(
            ('c', 'Competency'),
        )
    )

    # find metadata about what it created
    s3 = s3fs.S3FileSystem()

    # first assert that some shallow metadata was passed through
    assert experiment.metadata['sample_base_path'] == 's3://test-bucket/samples'
    assert experiment.metadata['sample_name'] == '300_weighted'
    assert experiment.metadata['entities_with_shortcuts'] == (('c', 'Competency'),)
    assert experiment.metadata['minimum_annotations_per_posting'] == 2
    assert experiment.metadata['max_postings_per_allocation'] == 20

    # next look at the posting texts themselves.
    # we expect all of them to be present, but split across a number of units
    units = experiment.metadata['units']
    assert len(units) == 5  # 100/20
    retrieved_descriptions = []
    for unit_name, documents in units.items():
        for posting_key, original_job_id in documents:
            # we should not expose the original posting ids;
            # otherwise we don't care what the keys are, only that they exist where we expect them to
            assert posting_key != original_job_id
            with s3.open('{data_path}/.{unit_name}/{posting_key}.txt'.format(
                data_path=experiment.data_path,
                unit_name=unit_name,
                posting_key=posting_key
            ), mode='rb') as f:
                posting = f.read().decode('utf-8')
                retrieved_descriptions.append(posting.strip())
            # make sure that the blank annotation file is there too
            with s3.open('{data_path}/.{unit_name}/{posting_key}.ann'.format(
                data_path=experiment.data_path,
                unit_name=unit_name,
                posting_key=posting_key
            ), mode='rb') as f:
                assert len(f.read().decode('utf-8')) == 0

    # our fake descriptions were just the string values for the range numbers,
    # so that's what should get written
    assert sorted(retrieved_descriptions) == sorted([str(i) for i in range(100, 200)])

    def assert_conf_contains(conf_name, expected):
        with s3.open('{path}/{conf_name}'.format(
            path=experiment.brat_config_path,
            conf_name=conf_name
        ), 'rb') as f:
            assert expected in f.read().decode('utf-8')

    assert_conf_contains('visual.conf', '[labels]\nCompetency\n')
    assert_conf_contains('annotation.conf', '[entities]\nCompetency\n')
    assert_conf_contains('kb_shortcuts.conf', 'c Competency\n')

def test_BratExperiment_add_allocation():
    # given a user name,
    # find the next allocation to use that the user has not annotated yet,
    # create a directory with the user's name,
    # and record in metadata the fact that the user has been allocated this

    # setup: create a bucket for the brat config
    s3 = boto3.resource('s3')
    storage = S3Store('s3://test-bucket/samples')
    s3.create_bucket(Bucket='test-bucket')
    job_postings = [
        job_posting_factory(id=i, description=str(i))
        for i in range(100, 200)
    ]
    sample = sample_factory(job_postings, name='300_weighted', storage=storage)
    experiment = BratExperiment(
        experiment_name='initial_skills_tag',
        brat_s3_path='test-bucket/brat'
    )
    experiment.start(
        sample=sample,
        minimum_annotations_per_posting=2,
        max_postings_per_allocation=20,
        entities_with_shortcuts=(
            ('c', 'Competency'),
        )
    )

    # initialize the experiment in this bucket
    experiment = BratExperiment(
        experiment_name='initial_skills_tag',
        brat_s3_path='test-bucket/brat'
    )

    username = 'user_one'

    # should not be able to allocate without creating a user
    with pytest.raises(ValueError):
        experiment.add_allocation(username)

    # set up a user to allocate to
    experiment.user_pw_store[username] = 'password'
    experiment.user_pw_store.save()
    allocated_directory = experiment.add_allocation(username)

    allocations = experiment.metadata['allocations'][username]
    assert len(allocations) == 1

    s3 = s3fs.S3FileSystem()
    filenames = s3.ls(allocated_directory)
    # there should be two files for each job posting: the .txt and the .ann
    assert len(filenames) == len(experiment.metadata['units'][allocations[0]]) * 2

    # simulate continued allocation with more users
    user_two = 'user_two'
    user_three = 'user_three'
    experiment.add_user(user_two, 'pass')
    experiment.add_user(user_three, 'pass')
    for i in range(0, 4):
        experiment.add_allocation(user_two)
        experiment.add_allocation(user_three)

    # at this point, trying to re-allocate to either user two or three
    # should fail as they have now tagged everything
    with pytest.raises(ValueError):
        experiment.add_allocation(user_two)

    # user one should still work for now
    for i in range(0, 4):
        new_directory = experiment.add_allocation(username)
        assert new_directory != allocated_directory

    # once they have seen the whole thing, no more!
    with pytest.raises(ValueError):
        experiment.add_allocation(username)