Example #1
    def setUp(self):
        client = boto3.resource('s3')
        bucket = client.create_bucket(Bucket='test-bucket')
        storage = S3Store('s3://test-bucket/computed_properties')
        description = 'This is my description'

        class MockClassifier(object):
            def predict_soc(self, document):
                assert document.strip() == description.lower()
                return '11-1234.00'

            @property
            def name(self):
                return "MockClassifier"

            @property
            def description(self):
                return "fake algorithm"

        self.computed_property = SOCClassifyProperty(
            storage=storage,
            classifier_obj=MockClassifier(),
        )
        self.job_postings = [
            utils.job_posting_factory(datePosted=self.datestring,
                                      description=description,
                                      skills='',
                                      qualifications='',
                                      experienceRequirements='')
        ]
        self.computed_property.compute_on_collection(self.job_postings)
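These setUp methods create S3 buckets with no credentials in sight, so they presumably run under a mocked S3 backend. A minimal harness sketch, assuming moto's mock_s3 decorator and a unittest.TestCase subclass (neither appears in the excerpts, and the datestring value is invented here):

import unittest
from moto import mock_s3

@mock_s3  # assumption: boto3/S3Store calls in setUp hit moto's in-memory S3
class TestSOCClassifyProperty(unittest.TestCase):
    datestring = '2016-01-01'  # hypothetical value for the self.datestring used above

    def setUp(self):
        ...  # body as in Example #1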
Example #2
    def setUp(self):
        s3_conn = boto.connect_s3()
        client = boto3.resource('s3')
        bucket = client.create_bucket(Bucket='test-bucket')
        description = 'This is my description'

        class MockClassifier(object):
            def predict_soc(self, document, mode):
                assert document.strip() == description.lower()
                assert mode == 'top'
                return '11-1234.00'

        self.computed_property = ClassifyTop(
            s3_conn=s3_conn,
            classifier_obj=MockClassifier(),
            path='test-bucket/computed_properties',
        )
        self.job_postings = [
            utils.job_posting_factory(datePosted=self.datestring,
                                      description=description,
                                      skills='',
                                      qualifications='',
                                      experienceRequirements='')
        ]
        self.computed_property.compute_on_collection(self.job_postings)
Example #3
def standard_sample():
    job_postings = [
        job_posting_factory(
            description='this is a job that requires communication skills')
        for _ in range(0, 5)
    ]
    sample = sample_factory(job_postings, name='mysample')
    return sample
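Example #12 below describes a sample as "one file, one job posting per line, in common schema JSON format", so a consumer of this fixture might look like the following sketch (that the sample object iterates line by line is an assumption):

import json

def test_standard_sample_contents():
    sample = standard_sample()
    # assumption: iterating a sample yields one JSON-encoded posting per line
    postings = [json.loads(line) for line in sample]
    assert len(postings) == 5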
Example #4
    def setUp(self):
        self.client = boto3.resource('s3')
        self.client.create_bucket(Bucket='test-bucket')
        self.computed_property = PostingIdPresent(
            path='test-bucket/computed_properties')
        self.job_postings = [
            utils.job_posting_factory(datePosted=self.datestring)
        ]
        self.computed_property.compute_on_collection(self.job_postings)
Example #5
    def setUp(self):
        self.client = boto3.resource('s3')
        self.client.create_bucket(Bucket='test-bucket')
        self.computed_property = TitleCleanPhaseOne(
            path='test-bucket/computed_properties')
        self.job_postings = [
            utils.job_posting_factory(datePosted=self.datestring,
                                      title='Software Engineer - Tulsa')
        ]
        self.computed_property.compute_on_collection(self.job_postings)
Example #6
    def setUp(self):
        self.client = boto3.resource('s3')
        self.client.create_bucket(Bucket='test-bucket')
        self.storage = S3Store('s3://test-bucket/computed_properties')
        self.computed_property = YearlyPay(self.storage)
        self.job_postings = [
            utils.job_posting_factory(id=5,
                                      datePosted=self.datestring,
                                      baseSalary={
                                          'salaryFrequency': 'yearly',
                                          'minValue': 5,
                                          'maxValue': ''
                                      }),
            utils.job_posting_factory(id=6,
                                      datePosted=self.datestring,
                                      baseSalary={
                                          'salaryFrequency': 'yearly',
                                          'minValue': '6.25',
                                          'maxValue': '9.25'
                                      })
        ]
        self.computed_property.compute_on_collection(self.job_postings)
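The two fixtures above deliberately mix types in baseSalary: an int, an empty string, and numeric strings. Whatever YearlyPay computes has to coerce these tolerantly; a toy sketch of that idea (not the library's actual code):

def to_float(value):
    """Coerce a salary field to float, treating blank or bad values as missing."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return None

assert to_float(5) == 5.0
assert to_float('6.25') == 6.25
assert to_float('') is None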
Example #7
    def setUp(self):
        s3_conn = boto.connect_s3()
        client = boto3.resource('s3')
        bucket = client.create_bucket(Bucket='test-bucket')
        skills_path = 's3://test-bucket/skills_master_table.tsv'
        utils.create_skills_file(skills_path)
        self.computed_property = ExactMatchSkillCounts(
            skill_lookup_path=skills_path,
            path='test-bucket/computed_properties',
        )
        self.job_postings = [
            utils.job_posting_factory(datePosted=self.datestring,
                                      description='reading comprehension')
        ]
        self.computed_property.compute_on_collection(self.job_postings)
Example #8
    def setUp(self):
        s3_conn = boto.connect_s3()
        client = boto3.resource('s3')
        bucket = client.create_bucket(Bucket='test-bucket')
        storage = S3Store('s3://test-bucket/computed_properties')
        skill_extractor = ExactMatchSkillExtractor(utils.sample_framework())
        self.computed_property = SkillCounts(
            skill_extractor=skill_extractor,
            storage=storage,
        )
        self.job_postings = [
            utils.job_posting_factory(datePosted=self.datestring,
                                      description='reading comprehension')
        ]
        self.computed_property.compute_on_collection(self.job_postings)
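Conceptually, an exact-match extractor scans posting text for known competency names; a toy illustration of the matching idea (not ExactMatchSkillExtractor's implementation):

def exact_match_counts(text, skill_names):
    """Count case-insensitive exact occurrences of each known skill name."""
    lowered = text.lower()
    return {skill: lowered.count(skill)
            for skill in skill_names if skill in lowered}

assert exact_match_counts('reading comprehension',
                          ['reading comprehension']) == {'reading comprehension': 1}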
Example #9
    def setUp(self):
        self.client = boto3.resource('s3')
        self.client.create_bucket(Bucket='test-bucket')
        self.computed_property = TitleCleanPhaseTwo(
            path='test-bucket/computed_properties')
        self.job_postings = [
            utils.job_posting_factory(datePosted=self.datestring,
                                      title='Software Engineer Tulsa')
        ]
        with patch(
                'skills_ml.algorithms.jobtitle_cleaner.clean.negative_positive_dict',
                return_value={
                    'places': ['tulsa'],
                    'states': [],
                    'onetjobs': ['software engineer']
                }):
            self.computed_property.compute_on_collection(self.job_postings)
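The patch pins the place/title dictionaries so the cleaner can, for example, drop 'Tulsa' from 'Software Engineer Tulsa'. A toy sketch of that kind of cleaning (not the library's actual logic):

def strip_places(title, places):
    """Drop tokens that are known place names; lowercase the rest."""
    return ' '.join(w for w in title.lower().split() if w not in places)

assert strip_places('Software Engineer Tulsa', ['tulsa']) == 'software engineer'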
Example #10
    def setUp(self):
        client = boto3.resource('s3')
        bucket = client.create_bucket(Bucket='test-bucket')
        storage = S3Store('s3://test-bucket/computed_properties')
        cache_storage = S3Store('s3://test-bucket')

        class SampleJobGeoQuerier(JobGeographyQuerier):
            name = 'blah'
            output_columns = (('city', 'the city'), )

            def _query(self, job_posting):
                return ['Fargo']

        self.computed_property = Geography(
            geo_querier=SampleJobGeoQuerier(),
            storage=storage,
        )
        self.job_postings = [
            utils.job_posting_factory(datePosted=self.datestring)
        ]
        self.computed_property.compute_on_collection(self.job_postings)
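SampleJobGeoQuerier suggests the subclass contract: declare name and output_columns, and have _query return one value per declared column. Another minimal sketch under that assumption (CountyQuerier and its column are invented here):

class CountyQuerier(JobGeographyQuerier):
    name = 'county'
    output_columns = (('county', 'the county name'),)

    def _query(self, job_posting):
        # one value per entry in output_columns
        return ['Cass County']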
Example #11
    def setUp(self):
        client = boto3.resource('s3')
        bucket = client.create_bucket(Bucket='test-bucket')
        sample_cbsa_cache = {
            'AMENIA, North Dakota': ['22020', 'Fargo, ND-MN Metro Area']
        }
        bucket.put_object(Key='cbsas.json', Body=json.dumps(sample_cbsa_cache))
        self.computed_property = CBSAandStateFromGeocode(
            cache_s3_path='test-bucket/cbsas',
            path='test-bucket/computed_properties',
        )
        self.job_postings = [
            utils.job_posting_factory(datePosted=self.datestring,
                                      jobLocation={
                                          "@type": "Place",
                                          "address": {
                                              "addressLocality": "AMENIA",
                                              "addressRegion": "ND",
                                              "@type": "PostalAddress"
                                          }
                                      })
        ]
        self.computed_property.compute_on_collection(self.job_postings)
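The cache seeded above maps a 'City, State' string to a [CBSA FIPS code, CBSA name] pair, so the lookup the property presumably performs reduces to (sketch):

sample_cbsa_cache = {
    'AMENIA, North Dakota': ['22020', 'Fargo, ND-MN Metro Area']
}
cbsa_fips, cbsa_name = sample_cbsa_cache['AMENIA, North Dakota']
assert cbsa_fips == '22020'
assert cbsa_name == 'Fargo, ND-MN Metro Area'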
Example #12
def test_BratExperiment_start():
    # create a bucket that will contain both the source samples and BRAT config
    s3 = boto3.resource('s3')
    bucket = s3.create_bucket(Bucket='test-bucket')
    storage = S3Store('s3://test-bucket/samples')

    # create a sample.
    # sample format is one file, one job posting per line, in common schema JSON format
    job_postings = [job_posting_factory(
        id=i,
        description=str(i),
        experienceRequirements='',
        qualifications='',
        skills=''
    ) for i in range(100, 200)]
    sample = sample_factory(job_postings, name='300_weighted', storage=storage)

    experiment = BratExperiment(
        experiment_name='initial_skills_tag',
        brat_s3_path='test-bucket/brat'
    )
    experiment.start(
        sample=sample,
        minimum_annotations_per_posting=2,
        max_postings_per_allocation=20,
        entities_with_shortcuts=(
            ('c', 'Competency'),
        )
    )

    # find metadata about what it created
    s3 = s3fs.S3FileSystem()

    # first assert that some shallow metadata was passed through
    assert experiment.metadata['sample_base_path'] == 's3://test-bucket/samples'
    assert experiment.metadata['sample_name'] == '300_weighted'
    assert experiment.metadata['entities_with_shortcuts'] == (('c', 'Competency'),)
    assert experiment.metadata['minimum_annotations_per_posting'] == 2
    assert experiment.metadata['max_postings_per_allocation'] == 20

    # next look at the posting texts themselves.
    # we expect all of them to be present, but split across a number of units
    units = experiment.metadata['units']
    assert len(units) == 5  # 100/20
    retrieved_descriptions = []
    for unit_name, documents in units.items():
        for posting_key, original_job_id in documents:
            # the original posting ids should not be exposed;
            # beyond that, we don't care what the keys are, only that files
            # exist where we expect them to
            assert posting_key != original_job_id
            with s3.open('{data_path}/.{unit_name}/{posting_key}.txt'.format(
                    data_path=experiment.data_path,
                    unit_name=unit_name,
                    posting_key=posting_key
            ), mode='rb') as f:
                posting = f.read().decode('utf-8')
                retrieved_descriptions.append(posting.strip())
            # make sure that the blank annotation file is there too
            with s3.open('{data_path}/.{unit_name}/{posting_key}.ann'.format(
                    data_path=experiment.data_path,
                    unit_name=unit_name,
                    posting_key=posting_key
            ), mode='rb') as f:
                assert len(f.read().decode('utf-8')) == 0
    # the fake descriptions were simply the stringified range numbers,
    # so that is what should have been written
    assert sorted(retrieved_descriptions) == sorted([str(i) for i in range(100, 200)])

    def assert_conf_contains(conf_name, expected):
        with s3.open('{path}/{conf_name}'.format(
                path=experiment.brat_config_path,
                conf_name=conf_name
        ), 'rb') as f:
            assert expected in f.read().decode('utf-8')

    assert_conf_contains('visual.conf', '[labels]\nCompetency\n')
    assert_conf_contains('annotation.conf', '[entities]\nCompetency\n')
    assert_conf_contains('kb_shortcuts.conf', 'c Competency\n')
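The unit count asserted above ('100/20') is a ceiling division of postings over the per-allocation maximum; a sketch of that bookkeeping:

import math

def expected_unit_count(n_postings, max_postings_per_allocation):
    return math.ceil(n_postings / max_postings_per_allocation)

assert expected_unit_count(100, 20) == 5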
Example #13
def test_BratExperiment_add_allocation():
    # given a user name
    # find the next allocation to use that the user has not annotated yet
    # create a directory with the user's name
    # record in metadata the fact that the user has been allocated this

    # setup: create a bucket for the brat config
    s3 = boto3.resource('s3')
    storage = S3Store('s3://test-bucket/samples')
    s3.create_bucket(Bucket='test-bucket')
    job_postings = [job_posting_factory(id=i, description=str(i)) for i in range(100, 200)]
    sample = sample_factory(job_postings, name='300_weighted', storage=storage)

    experiment = BratExperiment(
        experiment_name='initial_skills_tag',
        brat_s3_path='test-bucket/brat'
    )
    experiment.start(
        sample=sample,
        minimum_annotations_per_posting=2,
        max_postings_per_allocation=20,
        entities_with_shortcuts=(
            ('c', 'Competency'),
        )
    )
    # re-initialize the experiment from the same bucket
    experiment = BratExperiment(
        experiment_name='initial_skills_tag',
        brat_s3_path='test-bucket/brat'
    )

    username = '******'
    # should not be able to allocate without creating a user
    with pytest.raises(ValueError):
        experiment.add_allocation(username)

    # set up a user to allocate to
    experiment.user_pw_store[username] = 'password'
    experiment.user_pw_store.save()
    allocated_directory = experiment.add_allocation(username)

    allocations = experiment.metadata['allocations'][username]
    assert len(allocations) == 1

    s3 = s3fs.S3FileSystem()
    filenames = s3.ls(allocated_directory)
    # there should be two files for each job posting: the .txt and the .ann
    assert len(filenames) == len(experiment.metadata['units'][allocations[0]]) * 2

    # simulate continued allocation with more users
    user_two = 'user_two'
    user_three = 'user_three'
    experiment.add_user(user_two, 'pass')
    experiment.add_user(user_three, 'pass')
    for i in range(0, 4):
        experiment.add_allocation(user_two)
        experiment.add_allocation(user_three)
    # at this point, trying to re-allocate to either user two or three
    # should fail as they have now tagged everything
    with pytest.raises(ValueError):
        experiment.add_allocation(user_two)

    # user one should still work for now
    for i in range(0, 4):
        new_directory = experiment.add_allocation(username)
        assert new_directory != allocated_directory

    # once they have seen the whole thing, no more!
    with pytest.raises(ValueError):
        experiment.add_allocation(username)
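add_user, used for user_two and user_three, presumably wraps the manual store-and-save pattern applied to the first user; a sketch of that equivalence, assuming user_pw_store behaves like a persisted dict:

def add_user_manually(experiment, username, password):
    # assumed equivalent of experiment.add_user(username, password)
    experiment.user_pw_store[username] = password
    experiment.user_pw_store.save()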